From 56033ff61a40fa9e8f3cd9e7b1424bad1b442a4a Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Fri, 24 Apr 2026 16:41:36 +0200
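Subject: [PATCH 01/20] Support mixed precision via real_t (QDYN_SPFP)

Add common/include/precision.h, which defines real_t as float when
QDYN_SPFP is defined and as double otherwise.

Switch coord_t, vel_t, ccharge_t::charge, the catype_t vdW parameters,
vdw_pair_param_t and the charge_pair_products buffer to real_t
(catype_t::m stays double). Make calc_vdw_geometric and
calc_vdw_arithmetic templates on the floating-point type.

In the CPU nonbonded pp/pw/qp/qq/qw/ww routines, compute per-pair terms
in real_t, replace pow(x, 2) with x * x, drop the unused r6/r6O locals
and cast to double when adding to the energy totals.

In the CUDA angle, 1-4 and nonbonded kernels, take charges and vdW
parameters as real_t, add explicit casts where coord_t is built from
double expressions, and replace shfl() with a templated shfl_value()
that keeps the two-word shuffle as a specialization for double.
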
---
 src/core/common/include/context.h            |  2 +-
 src/core/common/include/md_types.h           | 28 +++++-----
 src/core/common/include/precision.h          |  7 +++
 src/core/common/include/vdw_rules.h          | 26 ++++-----
 src/core/common/src/init.cpp                 | 12 ++--
 src/core/cpu/src/cpu_nonbonded_pp_force.cpp  | 21 ++++---
 src/core/cpu/src/cpu_nonbonded_pw_force.cpp  | 28 +++++-----
 src/core/cpu/src/cpu_nonbonded_qp_force.cpp  | 24 ++++----
 src/core/cpu/src/cpu_nonbonded_qq_force.cpp  | 23 ++++----
 src/core/cpu/src/cpu_nonbonded_qw_force.cpp  | 47 ++++++++--------
 src/core/cpu/src/cpu_nonbonded_ww_force.cpp  | 49 +++++++++--------
 src/core/cuda/src/cuda_angle_force.cu        | 12 ++--
 src/core/cuda/src/cuda_nonbonded_14_force.cu | 42 +++++++-------
 src/core/cuda/src/cuda_nonbonded_force.cu    | 58 ++++++++++++--------
 14 files changed, 202 insertions(+), 177 deletions(-)
 create mode 100644 src/core/common/include/precision.h
diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h
index c77a2c91..83817bb8 100644
--- a/src/core/common/include/context.h
+++ b/src/core/common/include/context.h
@@ -187,7 +187,7 @@ class Context {
     std::unique_ptr<HostDeviceBuffer<int>> p_atoms_list;
     std::unique_ptr<HostDeviceBuffer<int>> w_atoms_list;
     std::unique_ptr<HostDeviceBuffer<int>> q_atoms_list;
-    std::unique_ptr<HostDeviceBuffer<double>> charge_pair_products;
+    std::unique_ptr<HostDeviceBuffer<real_t>> charge_pair_products;
     std::unique_ptr<HostDeviceBuffer<int>> p_charge_types;
     std::unique_ptr<HostDeviceBuffer<int>> w_charge_types;
     std::unique_ptr<HostDeviceBuffer<int>> q_charge_types;
diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index 60f1f56a..6a4d2865 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -2,6 +2,8 @@
 
 #include <string>
 #include <vector>
+
+#include "common/include/precision.h"
 /* =============================================
  * == FROM MD FILE
  * =============================================
@@ -47,9 +49,9 @@ struct md_t {
 };
 
 struct coord_t {
-    double x;
-    double y;
-    double z;
+    real_t x;
+    real_t y;
+    real_t z;
 };
 
 struct bond_t {
@@ -114,7 +116,7 @@ struct charge_t {
 
 struct ccharge_t {
     int code;
-    double charge;
+    real_t charge;
 };
 
 struct atype_t {
@@ -125,17 +127,17 @@ struct atype_t {
 struct catype_t {
     int code;
     double m;
-    double aii_normal;
-    double bii_normal;
+    real_t aii_normal;
+    real_t bii_normal;
     // double aii_polar;
     // double bii_polar;
-    double aii_1_4;
-    double bii_1_4;
+    real_t aii_1_4;
+    real_t bii_1_4;
 };
 
 struct vdw_pair_param_t {
-    double a;
-    double b;
+    real_t a;
+    real_t b;
 };
 
 struct topo_t {
@@ -302,9 +304,9 @@ struct shake_bond_t {
  */
 
 struct vel_t {
-    double x;
-    double y;
-    double z;
+    real_t x;
+    real_t y;
+    real_t z;
 };
 
 struct dvel_t {
diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
new file mode 100644
index 00000000..f15fc6ca
--- /dev/null
+++ b/src/core/common/include/precision.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#ifdef QDYN_SPFP
+using real_t = float;
+#else
+using real_t = double;
+#endif
diff --git a/src/core/common/include/vdw_rules.h b/src/core/common/include/vdw_rules.h
index ca7bd762..5b8e8604 100644
--- a/src/core/common/include/vdw_rules.h
+++ b/src/core/common/include/vdw_rules.h
@@ -4,15 +4,10 @@
 
 #include <math.h>
 
-
-// Geometric rule: A_ij = sqrt(A_i) * sqrt(A_j), B_ij = sqrt(B_i) * sqrt(B_j)
-// Energy: V = A_ij * r^-12 - B_ij * r^-6
-// Parameters: ai_aii, aj_aii are sqrt(A_i), sqrt(A_j)
-//             ai_bii, aj_bii are sqrt(B_i), sqrt(B_j)
-//             r6 is 1/r^6
+template <typename Real>
 __device__ __host__ inline void calc_vdw_geometric(
-    double ai_aii, double aj_aii, double ai_bii, double aj_bii,
-    double r6, double* V_a, double* V_b) {
+    Real ai_aii, Real aj_aii, Real ai_bii, Real aj_bii,
+    Real r6, Real* V_a, Real* V_b) {
     *V_a = r6 * r6 * ai_aii * aj_aii;
     *V_b = r6 * ai_bii * aj_bii;
 }
@@ -24,16 +19,17 @@ __device__ __host__ inline void calc_vdw_geometric(
 //             ai_aii, aj_aii store R*_i, R*_j (vdW radius)
 //             ai_bii, aj_bii store sqrt(eps_i), sqrt(eps_j) (after preprocessing)
 //             r6 is 1/r^6
+template <typename Real>
 __device__ __host__ inline void calc_vdw_arithmetic(
-    double Rstar_i, double Rstar_j, double sqrt_eps_i, double sqrt_eps_j,
-    double r6, double* V_a, double* V_b) {
-    double Rstar_ij = Rstar_i + Rstar_j;           // Arithmetic combination
-    double sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j;  // Geometric combination (already sqrt)
+    Real Rstar_i, Real Rstar_j, Real sqrt_eps_i, Real sqrt_eps_j,
+    Real r6, Real* V_a, Real* V_b) {
+    Real Rstar_ij = Rstar_i + Rstar_j;           // Arithmetic combination
+    Real sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j;  // Geometric combination (already sqrt)
 
     // Compute R6 = (R*_ij)^6
-    double R2 = Rstar_ij * Rstar_ij;
-    double R6 = R2 * R2 * R2;
+    Real R2 = Rstar_ij * Rstar_ij;
+    Real R6 = R2 * R2 * R2;
 
     *V_a = sqrt_eps_ij * R6 * R6 * r6 * r6;  // sqrt(eps_i * eps_j) * R^12 * r^-12
-    *V_b = 2.0 * sqrt_eps_ij * R6 * r6;      // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6
+    *V_b = static_cast<Real>(2.0) * sqrt_eps_ij * R6 * r6;  // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6
 }
diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp
index dc519a9f..499c01cb 100644
--- a/src/core/common/src/init.cpp
+++ b/src/core/common/src/init.cpp
@@ -77,9 +77,11 @@ void initialize_catype_tables() {
             const catype_t& cj = h_catype_table_all[j];
             vdw_pair_param_t pair_param = {};
             if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
-                calc_vdw_geometric(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
+                calc_vdw_geometric(
+                    ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast<real_t>(1.0), &pair_param.a, &pair_param.b);
             } else {
-                calc_vdw_arithmetic(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
+                calc_vdw_arithmetic(
+                    ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast<real_t>(1.0), &pair_param.a, &pair_param.b);
             }
             h_catype_pair_params[i * ctx.n_catype_types + j] = pair_param;
         }
@@ -168,10 +170,11 @@ void initialize_charge_tables() {
     ctx.zero_charge_type = add_charge(0.0);
     ctx.n_charge_types = static_cast<int>(h_charge_table_all.size());
 
-    std::vector<double> h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types);
+    std::vector<real_t> h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types);
     for (int i = 0; i < ctx.n_charge_types; i++) {
         for (int j = 0; j < ctx.n_charge_types; j++) {
-            h_charge_pair_products[i * ctx.n_charge_types + j] = h_charge_table_all[i].charge * h_charge_table_all[j].charge;
+            h_charge_pair_products[i * ctx.n_charge_types + j] =
+                static_cast<real_t>(h_charge_table_all[i].charge * h_charge_table_all[j].charge);
         }
     }
 
@@ -913,4 +916,3 @@ void write_headers() {
     write_header("velocities.csv");
     write_energy_header();
 }
-
diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
index ce744ad0..390c67eb 100644
--- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
@@ -15,11 +15,10 @@ void calc_nonbonded_pp_forces() {
     bool bond14, bond23;
     double scaling;
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double crg_i, crg_j;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    real_t r2a, ra, r6a;
+    real_t V_a, V_b;
+    real_t crg_i, crg_j;
+    real_t ai_aii, aj_aii, ai_bii, aj_bii;
     int i, j;
     for (int pi = 0; pi < ctx.n_patoms; pi++) {
         for (int pj = pi + 1; pj < ctx.n_patoms; pj++) {
@@ -42,11 +41,11 @@ void calc_nonbonded_pp_forces() {
             da.x = coords[j].x - coords[i].x;
             da.y = coords[j].y - coords[i].y;
             da.z = coords[j].z - coords[i].z;
-            r2a = 1 / (std::pow(da.x, 2) + std::pow(da.y, 2) + std::pow(da.z, 2));
-            ra = sqrt(r2a);
+            r2a = static_cast<real_t>(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z);
+            ra = static_cast<real_t>(std::sqrt(r2a));
             r6a = r2a * r2a * r2a;
 
-            Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra;
+            const real_t Vela = static_cast<real_t>(scaling * ctx.topo.coulomb_constant) * crg_i * crg_j * ra;
 
             ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal;
             aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
@@ -58,7 +57,7 @@ void calc_nonbonded_pp_forces() {
             } else {
                 calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b);
             }
-            dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
+            const real_t dva = r2a * (-Vela - static_cast<real_t>(12.0) * V_a + static_cast<real_t>(6.0) * V_b);
 
             dvelocities[i].x -= dva * da.x;
             dvelocities[i].y -= dva * da.y;
@@ -68,8 +67,8 @@ void calc_nonbonded_pp_forces() {
             dvelocities[j].y += dva * da.y;
             dvelocities[j].z += dva * da.z;
 
-            ctx.E_nonbond_pp.Ucoul += Vela;
-            ctx.E_nonbond_pp.Uvdw += (V_a - V_b);
+            ctx.E_nonbond_pp.Ucoul += static_cast<double>(Vela);
+            ctx.E_nonbond_pp.Uvdw += static_cast<double>(V_a - V_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
index 6bf2c27e..030c1290 100644
--- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
@@ -22,21 +22,21 @@ void calc_nonbonded_pw_forces() {
                 continue;
             }
 
-            const double qi = ctx.unified_ccharge(atom_i, 0).charge;
-            const double qj = ctx.unified_ccharge(atom_j, 0).charge;
+            const real_t qi = ctx.unified_ccharge(atom_i, 0).charge;
+            const real_t qj = ctx.unified_ccharge(atom_j, 0).charge;
 
             const catype_t& atom_i_type = ctx.unified_catype(atom_i, 0);
             const catype_t& atom_j_type = ctx.unified_catype(atom_j, 0);
 
-            double v_a = 0.0;
-            double v_b = 0.0;
-            const double dx = coords[atom_j].x - coords[atom_i].x;
-            const double dy = coords[atom_j].y - coords[atom_i].y;
-            const double dz = coords[atom_j].z - coords[atom_i].z;
-            const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
-            const double rinv = std::sqrt(r2inv);
-            const double r6inv = r2inv * r2inv * r2inv;
-            const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv;
+            real_t v_a = 0.0;
+            real_t v_b = 0.0;
+            const real_t dx = coords[atom_j].x - coords[atom_i].x;
+            const real_t dy = coords[atom_j].y - coords[atom_i].y;
+            const real_t dz = coords[atom_j].z - coords[atom_i].z;
+            const real_t r2inv = static_cast<real_t>(1.0) / (dx * dx + dy * dy + dz * dz);
+            const real_t rinv = static_cast<real_t>(std::sqrt(r2inv));
+            const real_t r6inv = r2inv * r2inv * r2inv;
+            const real_t ecoul = static_cast<real_t>(ctx.topo.coulomb_constant) * qi * qj * rinv;
 
             if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
                 calc_vdw_geometric(atom_i_type.aii_normal,
@@ -56,7 +56,7 @@ void calc_nonbonded_pw_forces() {
                                     &v_b);
             }
 
-            const double scale = r2inv * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+            const real_t scale = r2inv * (-ecoul - static_cast<real_t>(12.0) * v_a + static_cast<real_t>(6.0) * v_b);
 
             dvelocities[atom_i].x -= scale * dx;
             dvelocities[atom_i].y -= scale * dy;
@@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() {
             dvelocities[atom_j].y += scale * dy;
             dvelocities[atom_j].z += scale * dz;
 
-            ctx.E_nonbond_pw.Ucoul += ecoul;
-            ctx.E_nonbond_pw.Uvdw += (v_a - v_b);
+            ctx.E_nonbond_pw.Ucoul += static_cast<double>(ecoul);
+            ctx.E_nonbond_pw.Uvdw += static_cast<double>(v_a - v_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
index 65a74a6c..7a81a516 100644
--- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
@@ -15,10 +15,11 @@ void calc_nonbonded_qp_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     int i, j;
     coord_t da;
-    double r2, r6, r;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    real_t r2, r;
+    real_t ai_aii, aj_aii, ai_bii, aj_bii;
     bool bond23, bond14;
-    double scaling, Vel, V_a, V_b, dv;
+    double scaling;
+    real_t Vel, V_a, V_b, dv;
 
     for (int qi = 0; qi < ctx.n_qatoms; qi++) {
         for (int pj = 0; pj < ctx.n_patoms; pj++) {
@@ -37,12 +38,10 @@ void calc_nonbonded_qp_forces() {
             da.y = coords[j].y - coords[i].y;
             da.z = coords[j].z - coords[i].z;
 
-            r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2);
-
-            r6 = r2 * r2 * r2;
-            r2 = 1 / r2;
-            r = sqrt(r2);
-            double r6inv = r2 * r2 * r2;  // 1/r^6 for vdW calculation
+            r2 = da.x * da.x + da.y * da.y + da.z * da.z;
+            r2 = static_cast<real_t>(1.0) / r2;
+            r = static_cast<real_t>(std::sqrt(r2));
+            const real_t r6inv = r2 * r2 * r2;  // 1/r^6 for vdW calculation
 
             for (int state = 0; state < ctx.n_lambdas; state++) {
                 const catype_t& qi_type = ctx.unified_catype(i, state);
@@ -53,7 +52,8 @@ void calc_nonbonded_qp_forces() {
                 ai_bii = bond14 ? qi_type.bii_1_4 : qi_type.bii_normal;
                 aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
 
-                Vel = ctx.topo.coulomb_constant * scaling * ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r;
+                Vel = static_cast<real_t>(ctx.topo.coulomb_constant * scaling) *
+                      ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r;
                 if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
                     calc_vdw_geometric(ai_aii, aj_aii, ai_bii, aj_bii, r6inv, &V_a, &V_b);
                 } else {
@@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() {
                 dvelocities[j].z += dv * da.z;
 
                 // Update Q totals
-                ctx.EQ_nonbond_qp[state].Ucoul += Vel;
-                ctx.EQ_nonbond_qp[state].Uvdw += (V_a - V_b);
+                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<double>(Vel);
+                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<double>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
index 2b062d48..006a3c0e 100644
--- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
@@ -15,14 +15,14 @@ void calc_nonbonded_qq_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     auto *q_elscales = ctx.q_elscales->cpu_data_p;
     int ai, aj;
-    double crg_i, crg_j;
+    real_t crg_i, crg_j;
     double elscale, scaling;
     bool bond23, bond14;
     coord_t da;
-    double r2a, ra, r6a;
-    double Vela, V_a, V_b;
-    double dva;
-    double ai_aii, aj_aii, ai_bii, aj_bii;
+    real_t r2a, ra, r6a;
+    real_t Vela, V_a, V_b;
+    real_t dva;
+    real_t ai_aii, aj_aii, ai_bii, aj_bii;
 
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int qi = 0; qi < ctx.n_qatoms; qi++) {
@@ -54,11 +54,11 @@ void calc_nonbonded_qq_forces() {
                 da.x = coords[aj].x - coords[ai].x;
                 da.y = coords[aj].y - coords[ai].y;
                 da.z = coords[aj].z - coords[ai].z;
-                r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
-                ra = sqrt(r2a);
+                r2a = static_cast<real_t>(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z);
+                ra = static_cast<real_t>(std::sqrt(r2a));
                 r6a = r2a * r2a * r2a;
 
-                Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra * elscale;
+                Vela = static_cast<real_t>(scaling * ctx.topo.coulomb_constant * elscale) * crg_i * crg_j * ra;
 
                 ai_aii = bond14 ? qi_type.aii_1_4 : qi_type.aii_normal;
                 aj_aii = bond14 ? qj_type.aii_1_4 : qj_type.aii_normal;
@@ -70,7 +70,8 @@ void calc_nonbonded_qq_forces() {
                 } else {
                     calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b);
                 }
-                dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state];
+                dva = r2a * (-Vela - static_cast<real_t>(12.0) * V_a + static_cast<real_t>(6.0) * V_b) *
+                      static_cast<real_t>(lambdas[state]);
 
                 dvelocities[ai].x -= dva * da.x;
                 dvelocities[ai].y -= dva * da.y;
@@ -80,8 +81,8 @@ void calc_nonbonded_qq_forces() {
                 dvelocities[aj].y += dva * da.y;
                 dvelocities[aj].z += dva * da.z;
 
-                ctx.EQ_nonbond_qq[state].Ucoul += Vela;
-                ctx.EQ_nonbond_qq[state].Uvdw += (V_a - V_b);
+                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<double>(Vela);
+                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<double>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
index 17530a16..8d18bc55 100644
--- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
@@ -13,17 +13,17 @@ void calc_nonbonded_qw_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     int i;
     coord_t dO, dH1, dH2;
-    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
-    double dvO, dvH1, dvH2;
-    double V_a, V_b, VelO, VelH1, VelH2;
-    double ai_aii, ai_bii;
+    real_t r2O, rH1, rH2, rO, r2H1, r2H2;
+    real_t dvO, dvH1, dvH2;
+    real_t V_a, V_b, VelO, VelH1, VelH2;
+    real_t ai_aii, ai_bii;
 
     // Loop over O-atoms, q-atoms
     for (int j = ctx.n_atoms_solute; j < ctx.n_atoms; j += 3) {
         const catype_t& ow_type = ctx.unified_catype(j, 0);
-        const double ow_charge = ctx.unified_ccharge(j, 0).charge;
-        const double hw1_charge = ctx.unified_ccharge(j + 1, 0).charge;
-        const double hw2_charge = ctx.unified_ccharge(j + 2, 0).charge;
+        const real_t ow_charge = ctx.unified_ccharge(j, 0).charge;
+        const real_t hw1_charge = ctx.unified_ccharge(j + 1, 0).charge;
+        const real_t hw2_charge = ctx.unified_ccharge(j + 2, 0).charge;
         for (int qi = 0; qi < ctx.n_qatoms; qi++) {
             i = ctx.q_atoms[qi];
             if (excluded[i] || excluded[j]) continue;
@@ -36,13 +36,12 @@ void calc_nonbonded_qw_forces() {
             dH2.x = coords[j + 2].x - coords[i].x;
             dH2.y = coords[j + 2].y - coords[i].y;
             dH2.z = coords[j + 2].z - coords[i].z;
-            r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2);
-            rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2)));
-            rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2)));
-            r6O = r2O * r2O * r2O;
-            r2O = 1.0 / r2O;
-            rO = sqrt(r2O);
-            double r6Oinv = r2O * r2O * r2O;  // 1/r^6 for vdW calculation
+            r2O = dO.x * dO.x + dO.y * dO.y + dO.z * dO.z;
+            rH1 = static_cast<real_t>(std::sqrt(static_cast<real_t>(1.0) / (dH1.x * dH1.x + dH1.y * dH1.y + dH1.z * dH1.z)));
+            rH2 = static_cast<real_t>(std::sqrt(static_cast<real_t>(1.0) / (dH2.x * dH2.x + dH2.y * dH2.y + dH2.z * dH2.z)));
+            r2O = static_cast<real_t>(1.0) / r2O;
+            rO = static_cast<real_t>(std::sqrt(r2O));
+            const real_t r6Oinv = r2O * r2O * r2O;  // 1/r^6 for vdW calculation
             r2H1 = rH1 * rH1;
             r2H2 = rH2 * rH2;
 
@@ -63,19 +62,21 @@ void calc_nonbonded_qw_forces() {
                     calc_vdw_arithmetic(ai_aii, ow_type.aii_normal, ai_bii, ow_type.bii_normal, r6Oinv, &V_a, &V_b);
                 }
 
-                const double q_charge = ctx.unified_ccharge(i, state).charge;
-                VelO = ctx.topo.coulomb_constant * ow_charge * q_charge * rO;
-                VelH1 = ctx.topo.coulomb_constant * hw1_charge * q_charge * rH1;
-                VelH2 = ctx.topo.coulomb_constant * hw2_charge * q_charge * rH2;
+                const real_t q_charge = ctx.unified_ccharge(i, state).charge;
+                const real_t coulomb_constant = static_cast<real_t>(ctx.topo.coulomb_constant);
+                VelO = coulomb_constant * ow_charge * q_charge * rO;
+                VelH1 = coulomb_constant * hw1_charge * q_charge * rH1;
+                VelH2 = coulomb_constant * hw2_charge * q_charge * rH2;
 
                 // if (state == 0 && qi == 1) printf("j = %d ai__aii = %f A_O = %f B_O = %f V_a = %f V_b = %f r6O = %f\n", j, ai_aii, A_O, B_O, V_a, V_b, r6O);
 
-                dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * lambdas[state];
-                dvH1 -= r2H1 * VelH1 * lambdas[state];
-                dvH2 -= r2H2 * VelH2 * lambdas[state];
+                const real_t lambda = static_cast<real_t>(lambdas[state]);
+                dvO += r2O * (-VelO - (static_cast<real_t>(12.0) * V_a - static_cast<real_t>(6.0) * V_b)) * lambda;
+                dvH1 -= r2H1 * VelH1 * lambda;
+                dvH2 -= r2H2 * VelH2 * lambda;
 
-                ctx.EQ_nonbond_qw[state].Ucoul += (VelO + VelH1 + VelH2);
-                ctx.EQ_nonbond_qw[state].Uvdw += (V_a - V_b);
+                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<double>(VelO + VelH1 + VelH2);
+                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<double>(V_a - V_b);
             }
 
             // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
index 505dd45a..3be5e6f0 100644
--- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
@@ -1,18 +1,21 @@
 #include "cpu_nonbonded_ww_force.h"
 
+#include <cmath>
+
 #include "constants.h"
 #include "context.h"
 #include "vdw_rules.h"
 
 namespace {
-void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, double* vdw_a, double* vdw_b) {
+void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, real_t* vdw_a, real_t* vdw_b) {
     const catype_t& oi_type = ctx.unified_catype(oxygen_i, 0);
     const catype_t& oj_type = ctx.unified_catype(oxygen_j, 0);
     if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
         *vdw_a = oi_type.aii_normal * oj_type.aii_normal;
         *vdw_b = oi_type.bii_normal * oj_type.bii_normal;
     } else {
-        calc_vdw_arithmetic(oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, 1.0, vdw_a, vdw_b);
+        calc_vdw_arithmetic(
+            oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, static_cast<real_t>(1.0), vdw_a, vdw_b);
     }
 }
 }  // namespace
@@ -20,33 +23,33 @@ void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j,
 void accumulate_pair_force(Context& ctx,
                            int atom_i,
                            int atom_j,
-                           double qi,
-                           double qj,
+                           real_t qi,
+                           real_t qj,
                            bool include_vdw,
-                           double vdw_a,
-                           double vdw_b,
+                           real_t vdw_a,
+                           real_t vdw_b,
                            E_nonbonded_t& energy) {
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
-    const double dx = coords[atom_j].x - coords[atom_i].x;
-    const double dy = coords[atom_j].y - coords[atom_i].y;
-    const double dz = coords[atom_j].z - coords[atom_i].z;
+    const real_t dx = coords[atom_j].x - coords[atom_i].x;
+    const real_t dy = coords[atom_j].y - coords[atom_i].y;
+    const real_t dz = coords[atom_j].z - coords[atom_i].z;
 
-    const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
-    const double rinv = std::sqrt(r2inv);
-    const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv;
+    const real_t r2inv = static_cast<real_t>(1.0) / (dx * dx + dy * dy + dz * dz);
+    const real_t rinv = static_cast<real_t>(std::sqrt(r2inv));
+    const real_t ecoul = static_cast<real_t>(ctx.topo.coulomb_constant) * qi * qj * rinv;
 
-    double evdw = 0.0;
-    double dva = -ecoul;
+    real_t evdw = 0.0;
+    real_t dva = -ecoul;
     if (include_vdw) {
-        const double r6inv = r2inv * r2inv * r2inv;
-        const double v_a = vdw_a * r6inv * r6inv;
-        const double v_b = vdw_b * r6inv;
+        const real_t r6inv = r2inv * r2inv * r2inv;
+        const real_t v_a = vdw_a * r6inv * r6inv;
+        const real_t v_b = vdw_b * r6inv;
         evdw = v_a - v_b;
-        dva -= 12.0 * v_a - 6.0 * v_b;
+        dva -= static_cast<real_t>(12.0) * v_a - static_cast<real_t>(6.0) * v_b;
     }
 
-    const double scale = r2inv * dva;
+    const real_t scale = r2inv * dva;
 
     dvelocities[atom_i].x -= scale * dx;
     dvelocities[atom_i].y -= scale * dy;
@@ -56,8 +59,8 @@ void accumulate_pair_force(Context& ctx,
     dvelocities[atom_j].y += scale * dy;
     dvelocities[atom_j].z += scale * dz;
 
-    energy.Ucoul += ecoul;
-    energy.Uvdw += evdw;
+    energy.Ucoul += static_cast<double>(ecoul);
+    energy.Uvdw += static_cast<double>(evdw);
 }
 
 void calc_nonbonded_ww_forces() {
@@ -70,8 +73,8 @@ void calc_nonbonded_ww_forces() {
         const int base_i = ctx.n_atoms_solute + 3 * water_i;
         for (int water_j = water_i + 1; water_j < ctx.n_waters; ++water_j) {
             const int base_j = ctx.n_atoms_solute + 3 * water_j;
-            double oxygen_vdw_a = 0.0;
-            double oxygen_vdw_b = 0.0;
+            real_t oxygen_vdw_a = 0.0;
+            real_t oxygen_vdw_b = 0.0;
             calc_oxygen_vdw_parameters(ctx, base_i, base_j, &oxygen_vdw_a, &oxygen_vdw_b);
             for (int atom_i = 0; atom_i < 3; ++atom_i) {
                 for (int atom_j = 0; atom_j < 3; ++atom_j) {
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index 7c49cffb..dcd044ce 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -48,14 +48,14 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     atomicAdd(energy_sum, energy);
 
     coord_t di = {
-        f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length)),
-        f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length)),
-        f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length))};
+        static_cast<real_t>(f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length))),
+        static_cast<real_t>(f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length))),
+        static_cast<real_t>(f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length)))};
 
     coord_t dk = {
-        f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length)),
-        f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length)),
-        f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length))};
+        static_cast<real_t>(f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length))),
+        static_cast<real_t>(f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length))),
+        static_cast<real_t>(f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length)))};
 
     atomicAdd(&dvelocities[i].x, dv * di.x);
     atomicAdd(&dvelocities[i].y, dv * di.y);
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index fa404ee7..a33bb695 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -29,12 +29,12 @@ __device__ __forceinline__ int unified_parameter_index(
 __device__ void calculate_nonbonded_14_pair(
     const coord_t& x,
     const coord_t& y,
-    double x_charge,
-    double y_charge,
-    double x_aii,
-    double y_aii,
-    double x_bii,
-    double y_bii,
+    real_t x_charge,
+    real_t y_charge,
+    real_t x_aii,
+    real_t y_aii,
+    real_t x_bii,
+    real_t y_bii,
     double coulomb_constant,
     double scaling,
     int vdw_rule,
@@ -42,15 +42,17 @@ __device__ void calculate_nonbonded_14_pair(
     double& evdw,
     double& ecoul,
     double& dv) {
-    const double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
-    const double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
-    const double r2 = r * r;
-    const double r6 = r2 * r2 * r2;
+    const real_t dx = x.x - y.x;
+    const real_t dy = x.y - y.y;
+    const real_t dz = x.z - y.z;
+    const real_t r = rsqrt(dx * dx + dy * dy + dz * dz);
+    const real_t r2 = r * r;
+    const real_t r6 = r2 * r2 * r2;
 
     ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda;
 
-    double v_a = 0.0;
-    double v_b = 0.0;
+    real_t v_a = 0.0;
+    real_t v_b = 0.0;
     if (vdw_rule == VDW_GEOMETRIC) {
         calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b);
     } else {
@@ -124,13 +126,15 @@ __global__ void calc_nonbonded_14_force_kernel(
         ecoul,
         dv);
 
-    const double3 d = {rj.x - ri.x, rj.y - ri.y, rj.z - ri.z};
-    atomicAdd(&d_dvelocities[ai].x, -dv * d.x);
-    atomicAdd(&d_dvelocities[ai].y, -dv * d.y);
-    atomicAdd(&d_dvelocities[ai].z, -dv * d.z);
-    atomicAdd(&d_dvelocities[aj].x, dv * d.x);
-    atomicAdd(&d_dvelocities[aj].y, dv * d.y);
-    atomicAdd(&d_dvelocities[aj].z, dv * d.z);
+    const real_t dx = rj.x - ri.x;
+    const real_t dy = rj.y - ri.y;
+    const real_t dz = rj.z - ri.z;
+    atomicAdd(&d_dvelocities[ai].x, -dv * dx);
+    atomicAdd(&d_dvelocities[ai].y, -dv * dy);
+    atomicAdd(&d_dvelocities[ai].z, -dv * dz);
+    atomicAdd(&d_dvelocities[aj].x, dv * dx);
+    atomicAdd(&d_dvelocities[aj].y, dv * dy);
+    atomicAdd(&d_dvelocities[aj].z, dv * dz);
 
     atomicAdd(&evdw_totals[mode], evdw);
     atomicAdd(&ecoul_totals[mode], ecoul);
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index 432a7137..097a3550 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -19,7 +19,13 @@ __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     y += x;
 }
 
-__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
+template <typename T>
+__device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffffffffu) {
+    return __shfl_sync(mask, v, srcLane);
+}
+
+template <>
+__device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) {
     int2 a = *reinterpret_cast<int2*>(&v);
     a.x = __shfl_sync(mask, a.x, srcLane);
     a.y = __shfl_sync(mask, a.y, srcLane);
@@ -27,9 +33,9 @@ __device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0x
 }
 
 __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
-    v.x = shfl(v.x, srcLane, mask);
-    v.y = shfl(v.y, srcLane, mask);
-    v.z = shfl(v.z, srcLane, mask);
+    v.x = shfl_value(v.x, srcLane, mask);
+    v.y = shfl_value(v.y, srcLane, mask);
+    v.z = shfl_value(v.z, srcLane, mask);
     return v;
 }
 
@@ -37,7 +43,7 @@ __device__ void calculate_unforce_bound(
     const coord_t& x,
     const coord_t& y,
 
-    const double charge_product,
+    const real_t charge_product,
     const vdw_pair_param_t& pair_param,
 
     const double coulomb_constant,
@@ -48,10 +54,12 @@ __device__ void calculate_unforce_bound(
     double& evdw,
     double& ecoul,
     double& dv) {
-    double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
-    double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
-    double r2 = r * r;
-    double r6 = r2 * r2 * r2;
+    const real_t dx = x.x - y.x;
+    const real_t dy = x.y - y.y;
+    const real_t dz = x.z - y.z;
+    const real_t r = rsqrt(dx * dx + dy * dy + dz * dz);
+    const real_t r2 = r * r;
+    const real_t r6 = r2 * r2 * r2;
     // double v_a = r6 * r6;
     // double v_b = r6;
     // ecoul = r;
@@ -60,8 +68,8 @@ __device__ void calculate_unforce_bound(
 
     ecoul = scaling * coulomb_constant * charge_product * r * lambda;
 
-    double v_a = pair_param.a * r6 * r6 * lambda;
-    double v_b = pair_param.b * r6 * lambda;
+    const real_t v_a = pair_param.a * r6 * r6 * static_cast<real_t>(lambda);
+    const real_t v_b = pair_param.b * r6 * static_cast<real_t>(lambda);
     evdw = v_a - v_b;
     dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
 }
@@ -72,7 +80,7 @@ __global__ void calc_nonbonded_force_kernel(
 
     const int* x_charges_types,
     const int* y_charges_types,
-    const double* charge_pair_products,
+    const real_t* charge_pair_products,
 
     const int* x_atypes_types,
     const int* y_atypes_types,
@@ -139,7 +147,7 @@ __global__ void calc_nonbonded_force_kernel(
     int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1;
     int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1;
 
-    coord_t invalid = {-1e9, -1e9, -1e9};
+    coord_t invalid = {static_cast<real_t>(-1e9), static_cast<real_t>(-1e9), static_cast<real_t>(-1e9)};
     coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid;
     coord_t y_coord = (y_atom_idx >= 0) ? d_coords[y_atom_idx] : invalid;
 
@@ -194,9 +202,9 @@ __global__ void calc_nonbonded_force_kernel(
         y_charge_type_idx = __shfl_sync(mask, y_charge_type_idx, src);
         y_catype_type_idx = __shfl_sync(mask, y_catype_type_idx, src);
 
-        y_force.x = shfl(y_force.x, src, mask);
-        y_force.y = shfl(y_force.y, src, mask);
-        y_force.z = shfl(y_force.z, src, mask);
+        y_force.x = shfl_value(y_force.x, src, mask);
+        y_force.y = shfl_value(y_force.y, src, mask);
+        y_force.z = shfl_value(y_force.z, src, mask);
     };
 
     if (disable_water_h_lj) {
@@ -214,7 +222,7 @@ __global__ void calc_nonbonded_force_kernel(
     for (int i = 0; i < 32; i++) {
         if (is_valid()) {
             double scaling = 1.0;
-            double charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
+            real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
             vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx];
 
             // todo: Now the idx is wrong, should optimize it later
@@ -242,14 +250,16 @@ __global__ void calc_nonbonded_force_kernel(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
 
-            double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z};
-            y_force.x -= dv * d.x;
-            y_force.y -= dv * d.y;
-            y_force.z -= dv * d.z;
+            const real_t dx = x_coord.x - y_coord.x;
+            const real_t dy = x_coord.y - y_coord.y;
+            const real_t dz = x_coord.z - y_coord.z;
+            y_force.x -= dv * dx;
+            y_force.y -= dv * dy;
+            y_force.z -= dv * dz;
 
-            x_force.x += dv * d.x;
-            x_force.y += dv * d.y;
-            x_force.z += dv * d.z;
+            x_force.x += dv * dx;
+            x_force.y += dv * dy;
+            x_force.z += dv * dz;
         }
         do_shuffle();
     }

From e9406befed0cfa6b6779cd9a5e7dde24a8fb0a0c Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 09:28:42 +0200
Subject: [PATCH 02/20] add benchmark script

---
 benchmark-qgpu/benchmark_correctness.py    | 417 ++++++++++++++++
 benchmark-qgpu/benchmark_nsday.py          | 427 +++++++++++++++++
 benchmark-qgpu/benchmark_system_scaling.py | 413 ++++++++++++++++
 benchmark-qgpu/benchmark_test.py           | 527 +++++++++++++++++++++
 test/runTEST.py                            | 261 ++++++----
 5 files changed, 1950 insertions(+), 95 deletions(-)
 create mode 100644 benchmark-qgpu/benchmark_correctness.py
 create mode 100644 benchmark-qgpu/benchmark_nsday.py
 create mode 100644 benchmark-qgpu/benchmark_system_scaling.py
 create mode 100644 benchmark-qgpu/benchmark_test.py

diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py
new file mode 100644
index 00000000..07f25046
--- /dev/null
+++ b/benchmark-qgpu/benchmark_correctness.py
@@ -0,0 +1,417 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import io
+import json
+import math
+import os
+import shutil
+import sys
+from contextlib import redirect_stdout
+from datetime import datetime
+from pathlib import Path
+from statistics import mean
+
+os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib")
+
+import matplotlib
+
+matplotlib.use("Agg")
+from matplotlib import pyplot as plt
+
+from benchmark_test import (
+    ROOT,
+    command_text,
+    prepare_qgpu_input,
+    prepare_restart_with_qdyn_test,
+    resolve_fortran_bin,
+    resolve_qgpu_bin,
+    resolve_test_data,
+    run_timed,
+    write_md_input,
+)
+
+sys.path.insert(0, str(ROOT / "src" / "Qgpu"))
+
+import compare  # noqa: E402
+import energy as ENERGY  # noqa: E402
+
+
+def default_collect_out(test_name):  # timestamped results dir; name sanitised the same way as benchmark_nsday.py
+    safe_name = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in test_name)
+    return ROOT / "benchmark-qgpu" / "results" / f"{datetime.now():%Y%m%d_%H%M%S}_{safe_name}_correctness"
+
+
+def run_qgpu_once(qgpu_bin, prepared_data_dir, run_dir):
+    if run_dir.exists():
+        shutil.rmtree(run_dir)
+    run_dir.mkdir(parents=True)
+    data_dir = run_dir / prepared_data_dir.name
+    shutil.copytree(prepared_data_dir, data_dir)
+
+    stdout_path = run_dir / "qgpu.log"
+    stderr_path = run_dir / "qgpu.err"
+    args = [str(qgpu_bin), "--gpu", str(data_dir)]
+    return_code, wall_seconds = run_timed(args, ROOT, stdout_path, stderr_path)
+    if return_code != 0:
+        raise RuntimeError(
+            "QGPU correctness run failed. "
+            f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}"
+        )
+    return data_dir, {
+        "command": command_text(args),
+        "return_code": return_code,
+        "wall_seconds": wall_seconds,
+        "stdout": str(stdout_path),
+        "stderr": str(stderr_path),
+    }
+
+
+def load_qgpu_energy(qgpu_data_dir):
+    energy_path = Path(qgpu_data_dir) / "output" / "energies.csv"
+    if not energy_path.exists():
+        raise FileNotFoundError(f"QGPU energy file not found: {energy_path}")
+    return ENERGY.Read_Energy(str(energy_path), 0).QDYN(), energy_path
+
+
+def load_fortran_energy(fortran_dir):
+    q_data_path = Path(fortran_dir) / "Q_data.json"
+    if not q_data_path.exists():
+        raise FileNotFoundError(f"Fortran energy JSON not found: {q_data_path}")
+    with open(q_data_path, encoding="utf-8") as json_f:
+        return json.load(json_f), q_data_path
+
+
+def build_correctness_rows(fortran_data, qgpu_data, tolerance):
+    compare.ENERGY_TOLERANCE = tolerance
+    rows = []
+    frames = sorted(int(key) for key in fortran_data.keys() if key.isdigit())
+    for frame in frames:
+        if frame >= len(qgpu_data):
+            continue
+        with redirect_stdout(io.StringIO()):
+            passed, fortran_values, qgpu_values = compare.compare_energies(
+                fortran_data[str(frame)],
+                qgpu_data[frame],
+            )
+        for term, fortran_value, qgpu_value in zip(compare.header, fortran_values, qgpu_values):
+            if math.isnan(fortran_value) or math.isnan(qgpu_value):
+                continue
+            abs_error = abs(fortran_value - qgpu_value)
+            rel_error = abs_error / abs(fortran_value) if fortran_value != 0 else ""
+            rows.append(
+                {
+                    "frame": frame,
+                    "term": term,
+                    "fortran": fortran_value,
+                    "qgpu": qgpu_value,
+                    "abs_error": abs_error,
+                    "rel_error": rel_error,
+                    "passed_tolerance": abs_error <= tolerance,
+                    "frame_passed": passed,
+                }
+            )
+    if not rows:
+        raise RuntimeError("No comparable energy rows were produced.")
+    return rows
+
+
+def summarize_rows(rows, tolerance):
+    abs_errors = [float(row["abs_error"]) for row in rows]
+    by_term = {}
+    for row in rows:
+        by_term.setdefault(row["term"], []).append(float(row["abs_error"]))
+
+    term_summary = []
+    for term, values in sorted(by_term.items()):
+        term_summary.append(
+            {
+                "term": term,
+                "max_abs_error": max(values),
+                "mean_abs_error": mean(values),
+                "rmse": math.sqrt(mean([value * value for value in values])),
+            }
+        )
+
+    return {
+        "tolerance": tolerance,
+        "frames": sorted({int(row["frame"]) for row in rows}),
+        "terms": len(by_term),
+        "rows": len(rows),
+        "max_abs_error": max(abs_errors),
+        "mean_abs_error": mean(abs_errors),
+        "rmse": math.sqrt(mean([value * value for value in abs_errors])),
+        "passed": all(float(row["abs_error"]) <= tolerance for row in rows),
+        "term_summary": term_summary,
+    }
+
+
+def write_outputs(rows, summary, out_dir, metadata):
+    terms_csv = out_dir / "correctness_terms.csv"
+    summary_json = out_dir / "correctness_summary.json"
+
+    with open(terms_csv, "w", newline="", encoding="utf-8") as csv_f:
+        fieldnames = [
+            "frame",
+            "term",
+            "fortran",
+            "qgpu",
+            "abs_error",
+            "rel_error",
+            "passed_tolerance",
+            "frame_passed",
+        ]
+        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+    payload = {
+        "created_at": datetime.now().isoformat(timespec="seconds"),
+        "metadata": metadata,
+        "summary": summary,
+    }
+    with open(summary_json, "w", encoding="utf-8") as json_f:
+        json.dump(payload, json_f, indent=2)
+
+    return terms_csv, summary_json
+
+
+def collect(args):
+    out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out(args.test)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
+    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
+    data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
+
+    fortran_dir = out_dir / "fortran_reference"
+    prep_dir = out_dir / "qgpu_prepare"
+    qgpu_run_dir = out_dir / "qgpu_run"
+    fortran_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Preparing Fortran reference for {args.test}")
+    write_md_input(data, fortran_dir)
+    prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+
+    print("Preparing QGPU input")
+    prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
+
+    print("Running QGPU correctness simulation")
+    qgpu_data_dir, qgpu_run = run_qgpu_once(qgpu_bin, prepared_data_dir, qgpu_run_dir)
+
+    fortran_data, fortran_energy_path = load_fortran_energy(fortran_dir)
+    qgpu_data, qgpu_energy_path = load_qgpu_energy(qgpu_data_dir)
+    rows = build_correctness_rows(fortran_data, qgpu_data, args.tolerance)
+    summary = summarize_rows(rows, args.tolerance)
+
+    terms_csv, summary_json = write_outputs(
+        rows,
+        summary,
+        out_dir,
+        {
+            "test": args.test,
+            "steps": args.steps,
+            "lambda": args.lambda_name,
+            "shake": args.shake,
+            "qgpu_bin": str(qgpu_bin),
+            "prep_fortran_bin": str(prep_fortran_bin),
+            "fortran_energy": str(fortran_energy_path),
+            "qgpu_energy": str(qgpu_energy_path),
+            "qgpu_run": qgpu_run,
+        },
+    )
+
+    print(f"Terms CSV: {terms_csv}")
+    print(f"Summary JSON: {summary_json}")
+    print(
+        f"max |delta E| = {summary['max_abs_error']:.6g} kcal/mol; "
+        f"RMSE = {summary['rmse']:.6g}; passed = {summary['passed']}"
+    )
+    return 0
+
+
+def load_rows(csv_path):
+    rows = []
+    with open(csv_path, newline="", encoding="utf-8") as csv_f:
+        reader = csv.DictReader(csv_f)
+        for row in reader:
+            row["frame"] = int(row["frame"])
+            row["fortran"] = float(row["fortran"])
+            row["qgpu"] = float(row["qgpu"])
+            row["abs_error"] = float(row["abs_error"])
+            rows.append(row)
+    if not rows:
+        raise RuntimeError(f"No rows found in {csv_path}")
+    return rows
+
+
+def select_term_rows(rows, term):
+    selected = [row for row in rows if row["term"] == term]
+    if not selected:
+        terms = ", ".join(sorted({row["term"] for row in rows}))
+        raise ValueError(f"Term '{term}' not found. Available terms: {terms}")
+    return sorted(selected, key=lambda row: row["frame"])
+
+
+def plot(args):
+    rows = load_rows(Path(args.csv).expanduser().resolve())
+    selected = select_term_rows(rows, args.term)
+
+    frames = [row["frame"] for row in selected]
+    fortran_values = [row["fortran"] for row in selected]
+    qgpu_values = [row["qgpu"] for row in selected]
+    abs_errors = [row["abs_error"] for row in selected]
+    rel_errors_pct = [
+        (row["abs_error"] / abs(row["fortran"]) * 100.0) if row["fortran"] != 0 else 0.0
+        for row in selected
+    ]
+    max_abs_error = max(abs_errors)
+    mean_abs_error = mean(abs_errors)
+    rmse = math.sqrt(mean([value * value for value in abs_errors]))
+    max_rel_error = max(rel_errors_pct)
+    mean_rel_error = mean(rel_errors_pct)
+
+    if args.error_mode == "relative":
+        plotted_errors = rel_errors_pct
+        error_ylabel = "Relative error (%)"
+        tolerance = args.tolerance
+        tolerance_label = "rel. tolerance"
+    else:
+        plotted_errors = abs_errors
+        error_ylabel = "|delta E|"
+        tolerance = args.tolerance
+        tolerance_label = "tolerance"
+
+    out_path = Path(args.out).expanduser().resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    fig = plt.figure(figsize=(9.8, 4.2))
+    grid = fig.add_gridspec(2, 2, width_ratios=[4.2, 1.45], height_ratios=[2.3, 1.3])
+    ax_energy = fig.add_subplot(grid[0, 0])
+    ax_error = fig.add_subplot(grid[1, 0], sharex=ax_energy)
+    ax_panel = fig.add_subplot(grid[:, 1])
+
+    ax_energy.plot(frames, fortran_values, color="#4a4a4a", linewidth=1.8, label="Fortran")
+    ax_energy.plot(frames, qgpu_values, color="#0b71c8", linestyle="--", linewidth=1.6, label="QGPU")
+    ax_energy.set_title(args.title, loc="left", fontsize=13, weight="bold", color="#113b5f")
+    ax_energy.set_ylabel(f"{args.term} (kcal/mol)")
+    ax_energy.grid(axis="y", color="#e5e8ee", linewidth=0.8)
+    ax_energy.legend(frameon=False, loc="best", fontsize=8)
+    ax_energy.spines["top"].set_visible(False)
+    ax_energy.spines["right"].set_visible(False)
+
+    ax_error.plot(frames, plotted_errors, color="#d62728", linewidth=1.6)
+    ax_error.fill_between(frames, plotted_errors, color="#d62728", alpha=0.13)
+    if tolerance is not None:
+        ax_error.axhline(tolerance, color="#777777", linestyle=":", linewidth=1.0, label=tolerance_label)
+        ax_error.legend(frameon=False, loc="best", fontsize=8)
+    ax_error.set_xlabel("MD step")
+    ax_error.set_ylabel(error_ylabel)
+    ax_error.grid(axis="y", color="#e5e8ee", linewidth=0.8)
+    ax_error.spines["top"].set_visible(False)
+    ax_error.spines["right"].set_visible(False)
+
+    ax_panel.set_facecolor("#eef5fd")
+    for spine in ax_panel.spines.values():
+        spine.set_color("#8ab9ef")
+    ax_panel.set_xticks([])
+    ax_panel.set_yticks([])
+    if args.error_mode == "relative":
+        ax_panel.text(0.5, 0.84, "Consistency", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
+        ax_panel.text(0.5, 0.64, f"{max_rel_error:.3f}%", ha="center", va="center", fontsize=24, weight="bold", color="#003c7f")
+        ax_panel.text(0.5, 0.50, "max rel. error", ha="center", va="center", fontsize=10, color="#0b3970")
+        ax_panel.axhline(0.36, xmin=0.15, xmax=0.85, color="#8ab9ef", linewidth=0.8)
+        ax_panel.text(0.5, 0.25, f"mean {mean_rel_error:.3f}%", ha="center", va="center", fontsize=11, weight="bold", color="#0b3970")
+        ax_panel.text(0.5, 0.13, f"abs RMSE {rmse:.2e}", ha="center", va="center", fontsize=9, color="#0b3970")
+    else:
+        ax_panel.text(0.5, 0.82, "Agreement", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
+        ax_panel.text(0.5, 0.62, f"{max_abs_error:.2e}", ha="center", va="center", fontsize=22, weight="bold", color="#003c7f")
+        ax_panel.text(0.5, 0.48, "max |delta E|", ha="center", va="center", fontsize=10, color="#0b3970")
+        ax_panel.axhline(0.34, xmin=0.15, xmax=0.85, color="#8ab9ef", linewidth=0.8)
+        ax_panel.text(0.5, 0.23, f"RMSE {rmse:.2e}", ha="center", va="center", fontsize=11, weight="bold", color="#0b3970")
+        ax_panel.text(0.5, 0.12, f"mean {mean_abs_error:.2e}", ha="center", va="center", fontsize=10, color="#0b3970")
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=220)
+    plt.close(fig)
+    print(f"Plot written to: {out_path}")
+    return 0
+
+
+def positive_int(value):  # argparse ``type=`` callable: parse an int and require it to be >= 1
+    parsed = int(value)  # a ValueError raised here propagates to the caller
+    if parsed < 1:
+        raise argparse.ArgumentTypeError("must be >= 1")
+    return parsed
+
+
+def nonnegative_float(value):  # argparse ``type=`` callable: parse a float that is >= 0 and not NaN
+    parsed = float(value)
+    if not parsed >= 0:  # negated form on purpose: NaN fails ``>= 0`` but would pass ``< 0``
+        raise argparse.ArgumentTypeError("must be >= 0")
+    return parsed
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Collect and plot Fortran vs QGPU energy correctness.")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    collect_parser = subparsers.add_parser("collect", help="Run a correctness benchmark and write CSV data.")
+    collect_parser.add_argument("--test", required=True, help="runTEST.py test name.")
+    collect_parser.add_argument("--steps", type=positive_int, required=True, help="MD steps.")
+    collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
+    collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
+    collect_parser.add_argument("--out", help="Output directory.")
+    collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")
+    collect_parser.add_argument(
+        "--prep-fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
+        help="Path to qdyn_test used to generate Fortran reference data.",
+    )
+    collect_parser.add_argument(
+        "--tolerance",
+        type=nonnegative_float,
+        default=1e-3,
+        help="Absolute energy tolerance in kcal/mol for pass/fail summary.",
+    )
+
+    plot_parser = subparsers.add_parser("plot", help="Plot correctness from correctness_terms.csv.")
+    plot_parser.add_argument("csv", help="correctness_terms.csv from collect.")
+    plot_parser.add_argument("--out", required=True, help="Output PNG path.")
+    plot_parser.add_argument("--term", default="total-Utot", help="Energy term to plot.")
+    plot_parser.add_argument(
+        "--title",
+        default="Long-Run Energy Consistency",
+        help="Plot title.",
+    )
+    plot_parser.add_argument(
+        "--error-mode",
+        choices=["absolute", "relative"],
+        default="absolute",
+        help="Plot absolute kcal/mol error or relative percent error.",
+    )
+    plot_parser.add_argument(
+        "--tolerance",
+        type=nonnegative_float,
+        default=None,
+        help="Optional horizontal tolerance line on the error panel. Units follow --error-mode.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    if args.command == "collect":
+        return collect(args)
+    if args.command == "plot":
+        return plot(args)
+    raise SystemExit(f"Unknown command: {args.command}")
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except (FileNotFoundError, RuntimeError, ValueError) as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        raise SystemExit(1)
diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
new file mode 100644
index 00000000..d62ee459
--- /dev/null
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from statistics import median
+
+os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib")
+
+import matplotlib
+
+matplotlib.use("Agg")
+from matplotlib import pyplot as plt
+
+from benchmark_test import (
+    ROOT,
+    TIME_STEP_NS,
+    command_text,
+    prepare_qgpu_input,
+    prepare_restart_with_qdyn_test,
+    resolve_fortran_bin,
+    resolve_qgpu_bin,
+    resolve_test_data,
+    write_md_input,
+)
+
+
+def read_steps_from_md_csv(data_dir):  # return the integer step count stored in <data_dir>/md.csv
+    md_path = Path(data_dir) / "md.csv"
+    if not md_path.exists():
+        raise FileNotFoundError(f"md.csv not found: {md_path}")
+    with open(md_path, encoding="utf-8") as md_f:
+        for line in md_f:
+            if line.startswith("steps;"):  # first "steps;<value>" line wins
+                return int(line.strip().split(";", 1)[1])  # text after the first ';' parsed as int
+    raise RuntimeError(f"Could not find steps in {md_path}")
+
+
+def default_collect_out(label):
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    safe_label = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in label)
+    return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{safe_label}_nsday"
+
+
+def prepare_from_test(args, out_dir):
+    data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
+    fortran_dir = out_dir / "prepare" / args.test / "fortran"
+    prep_dir = out_dir / "prepare" / args.test / "qgpu_prepare"
+    fortran_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Preparing QGPU input for {args.test} in {out_dir}")
+    write_md_input(data, fortran_dir)
+    prepare_restart_with_qdyn_test(data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir)
+    return prepare_qgpu_input(data, fortran_dir, prep_dir)
+
+
+def resolve_collect_data_dir(args, out_dir):
+    if args.data_dir:
+        data_dir = Path(args.data_dir).expanduser().resolve()
+        if not data_dir.is_dir():
+            raise FileNotFoundError(f"data dir not found: {data_dir}")
+        steps = args.steps if args.steps is not None else read_steps_from_md_csv(data_dir)
+        return data_dir, steps
+
+    if not args.test:
+        raise SystemExit("collect requires --test or --data-dir.")
+    if args.steps is None:
+        raise SystemExit("collect with --test requires --steps.")
+    data_dir = prepare_from_test(args, out_dir)
+    return data_dir, args.steps
+
+
+def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, steps, label, repeat):
+    if run_dir.exists():
+        shutil.rmtree(run_dir)
+    run_dir.mkdir(parents=True)
+
+    processes = []
+    process_rows = []
+    command_template = None
+    batch_start = time.perf_counter()
+    for index in range(1, concurrency + 1):
+        proc_dir = run_dir / f"proc_{index:03d}"
+        data_dir = proc_dir / prepared_data_dir.name
+        proc_dir.mkdir(parents=True)
+        shutil.copytree(prepared_data_dir, data_dir)
+
+        stdout_path = proc_dir / "qgpu.log"
+        stderr_path = proc_dir / "qgpu.err"
+        args = [str(qgpu_bin), "--gpu", str(data_dir)]
+        command_template = command_text([str(qgpu_bin), "--gpu", "<data_dir>"])
+        stdout_f = open(stdout_path, "w", encoding="utf-8")
+        stderr_f = open(stderr_path, "w", encoding="utf-8")
+        proc_start = time.perf_counter()
+        process = subprocess.Popen(args, cwd=ROOT, stdout=stdout_f, stderr=stderr_f)
+        processes.append(
+            {
+                "index": index,
+                "process": process,
+                "stdout_file": stdout_f,
+                "stderr_file": stderr_f,
+                "stdout": stdout_path,
+                "stderr": stderr_path,
+                "start": proc_start,
+                "command": command_text(args),
+            }
+        )
+
+    remaining = set(range(len(processes)))
+    while remaining:
+        for item_index in list(remaining):
+            item = processes[item_index]
+            return_code = item["process"].poll()
+            if return_code is None:
+                continue
+            item["return_code"] = return_code
+            item["end"] = time.perf_counter()
+            item["stdout_file"].close()
+            item["stderr_file"].close()
+            remaining.remove(item_index)
+        if remaining:
+            time.sleep(0.01)
+
+    for item in processes:
+        wall_seconds = item["end"] - item["start"]
+        process_rows.append(
+            {
+                "label": label,
+                "concurrency": concurrency,
+                "repeat": repeat,
+                "process_index": item["index"],
+                "return_code": item["return_code"],
+                "process_wall_seconds": wall_seconds,
+                "process_ns_per_day": steps * TIME_STEP_NS * 86400 / wall_seconds if wall_seconds > 0 else "",
+                "stdout": str(item["stdout"]),
+                "stderr": str(item["stderr"]),
+                "command": item["command"],
+            }
+        )
+
+    batch_wall_seconds = time.perf_counter() - batch_start
+    failed = sum(1 for row in process_rows if row["return_code"] != 0)
+    total_ns_per_day = concurrency * steps * TIME_STEP_NS * 86400 / batch_wall_seconds
+    mean_process_ns_per_day = (
+        sum(float(row["process_ns_per_day"]) for row in process_rows if row["process_ns_per_day"] != "")
+        / len(process_rows)
+    )
+    return {
+        "label": label,
+        "concurrency": concurrency,
+        "repeat": repeat,
+        "steps": steps,
+        "batch_wall_seconds": batch_wall_seconds,
+        "total_ns_per_day": total_ns_per_day,
+        "mean_process_ns_per_day": mean_process_ns_per_day,
+        "failed_processes": failed,
+        "command": command_template,
+    }, process_rows
+
+
+def write_collect_outputs(batch_rows, process_rows, out_dir, meta):
+    summary_csv = out_dir / "nsday_summary.csv"
+    process_csv = out_dir / "nsday_processes.csv"
+    meta_json = out_dir / "nsday_meta.json"
+
+    with open(summary_csv, "w", newline="", encoding="utf-8") as csv_f:
+        fieldnames = [
+            "label",
+            "concurrency",
+            "repeat",
+            "steps",
+            "batch_wall_seconds",
+            "total_ns_per_day",
+            "mean_process_ns_per_day",
+            "failed_processes",
+            "command",
+        ]
+        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(batch_rows)
+
+    with open(process_csv, "w", newline="", encoding="utf-8") as csv_f:
+        fieldnames = [
+            "label",
+            "concurrency",
+            "repeat",
+            "process_index",
+            "return_code",
+            "process_wall_seconds",
+            "process_ns_per_day",
+            "stdout",
+            "stderr",
+            "command",
+        ]
+        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(process_rows)
+
+    with open(meta_json, "w", encoding="utf-8") as json_f:
+        json.dump(meta, json_f, indent=2)
+
+    return summary_csv, process_csv, meta_json
+
+
+def collect(args):
+    """Run every requested concurrency/repeat batch, write the nsday CSV/JSON outputs and return 0."""
+    label = args.label or args.test or (Path(args.data_dir).name if args.data_dir else "nsday")  # never Path(None)
+    out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out(label)
+    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
+    prepared_data_dir, steps = resolve_collect_data_dir(args, out_dir)  # exits when neither --test nor --data-dir
+    out_dir.mkdir(parents=True, exist_ok=True)  # created only after the arguments are known to be usable
+
+    batch_rows, process_rows = [], []
+    for concurrency in args.concurrency:
+        for repeat in range(1, args.repeat + 1):
+            run_dir = out_dir / "runs" / f"c{concurrency:03d}" / f"repeat_{repeat:03d}"
+            print(f"Running {label}: concurrency={concurrency}, repeat={repeat}")
+            batch_row, rows = run_concurrency_batch(
+                qgpu_bin=qgpu_bin,
+                prepared_data_dir=prepared_data_dir,
+                run_dir=run_dir,
+                concurrency=concurrency,
+                steps=steps,
+                label=label,
+                repeat=repeat,
+            )
+            batch_rows.append(batch_row)
+            process_rows.extend(rows)
+            if batch_row["failed_processes"]:
+                summary_csv, process_csv, meta_json = write_collect_outputs(
+                    batch_rows,
+                    process_rows,
+                    out_dir,
+                    {
+                        "created_at": datetime.now().isoformat(timespec="seconds"),
+                        "label": label,
+                        "qgpu_bin": str(qgpu_bin),
+                        "prepared_data_dir": str(prepared_data_dir),
+                        "steps": steps,
+                    },
+                )
+                raise RuntimeError(
+                    f"{batch_row['failed_processes']} process(es) failed at concurrency "
+                    f"{concurrency}, repeat {repeat}. Summary: {summary_csv}; processes: {process_csv}; meta: {meta_json}"
+                )
+            if args.pause_seconds > 0:
+                time.sleep(args.pause_seconds)
+
+    summary_csv, process_csv, meta_json = write_collect_outputs(
+        batch_rows,
+        process_rows,
+        out_dir,
+        {
+            "created_at": datetime.now().isoformat(timespec="seconds"),
+            "label": label,
+            "test": args.test,
+            "data_dir": str(prepared_data_dir),
+            "qgpu_bin": str(qgpu_bin),
+            "steps": steps,
+            "concurrency": args.concurrency,
+            "repeat": args.repeat,
+        },
+    )
+    print(f"Summary CSV: {summary_csv}")
+    print(f"Process CSV: {process_csv}")
+    print(f"Metadata JSON: {meta_json}")
+    return 0
+
+
+def load_plot_series(csv_paths, metric):
+    """Read summary CSVs and return one ``{"label", "xs", "ys"}`` dict per label.
+
+    Rows with non-zero ``failed_processes`` are skipped; ``ys`` is the median of
+    ``metric`` per sorted concurrency level in ``xs``. Raises RuntimeError if none remain.
+    """
+    samples = {}  # label -> concurrency -> list of metric values
+    for source in csv_paths:
+        with open(source, newline="", encoding="utf-8") as handle:
+            for record in csv.DictReader(handle):
+                if int(record.get("failed_processes") or 0):
+                    continue
+                per_level = samples.setdefault(record["label"], {})
+                per_level.setdefault(int(record["concurrency"]), []).append(float(record[metric]))
+    result = []
+    for name in sorted(samples):
+        levels = sorted(samples[name])
+        result.append({"label": name, "xs": levels, "ys": [median(samples[name][lv]) for lv in levels]})
+    if not result:
+        raise RuntimeError("No successful rows found in input CSV file(s).")
+    return result
+
+
+def plot(args):  # "plot" sub-command: draw the chosen metric against concurrency and save the figure; returns 0
+    metric = args.metric
+    series = load_plot_series([Path(path).expanduser().resolve() for path in args.csv], metric)
+    out_path = Path(args.out).expanduser().resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    fig, (ax, panel) = plt.subplots(  # ax: line chart; panel: headline-number box on the right
+        1,
+        2,
+        figsize=(9.5, 3.2),
+        gridspec_kw={"width_ratios": [4.4, 1.55]},
+    )
+    palette = ["#1f77b4", "#43a047", "#f57c00", "#7b1fa2", "#00838f"]
+    all_points = []  # (value, label, concurrency) for every plotted point
+    for index, item in enumerate(series):
+        color = palette[index % len(palette)]  # colours repeat once there are more series than palette entries
+        ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"])
+        for x, y in zip(item["xs"], item["ys"]):
+            all_points.append((y, item["label"], x))
+            ax.text(x, y, f"{y:.1f}", ha="center", va="bottom", fontsize=8, weight="bold", color="#253142")
+
+    ax.set_title(args.title, loc="left", fontsize=13, weight="bold", color="#0f5f18")
+    ax.text(0.0, 1.02, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142")
+    ax.set_xlabel("Concurrent Simulations")
+    ax.set_ylabel("Throughput (ns/day)")  # NOTE(review): label stays the same whichever metric is plotted
+    ax.grid(axis="y", color="#e3e7ed", linewidth=0.8)
+    ax.set_axisbelow(True)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    ax.legend(frameon=False, loc="upper left", fontsize=8)
+
+    best_points = sorted(all_points, reverse=True)  # tuple ordering: largest value first
+    best = best_points[0]
+    second = None  # highest point whose label differs from best's, if any
+    seen_labels = {best[1]}
+    for point in best_points[1:]:
+        if point[1] not in seen_labels:
+            second = point
+            break
+
+    panel.set_facecolor("#edf7eb")
+    for spine in panel.spines.values():
+        spine.set_color("#a3d39b")
+    panel.set_xticks([])
+    panel.set_yticks([])
+    panel.text(0.5, 0.80, "Up to", ha="center", va="center", fontsize=11, weight="bold", color="#14751c")
+    panel.text(0.5, 0.55, f"{best[0]:.1f}", ha="center", va="center", fontsize=30, weight="bold", color="#14751c")
+    panel.text(0.5, 0.35, "ns/day", ha="center", va="center", fontsize=13, weight="bold", color="#14751c")
+    panel.text(0.5, 0.20, f"{best[1]}", ha="center", va="center", fontsize=9, color="#253142")
+    if second is not None:  # runner-up line is drawn only when a second label exists
+        panel.axhline(0.12, xmin=0.12, xmax=0.88, color="#7fbf79", linewidth=0.8)
+        panel.text(0.5, 0.05, f"{second[0]:.1f} ns/day", ha="center", va="bottom", fontsize=10, weight="bold", color="#14751c")
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=220)
+    plt.close(fig)  # release the figure once it has been written
+    print(f"Plot written to: {out_path}")
+    return 0
+
+
+def positive_int(value):  # argparse type= callable: int(value), rejected when below 1
+    number = int(value)
+    if number >= 1:
+        return number
+    raise argparse.ArgumentTypeError("must be >= 1")
+
+
+def parse_args():  # build the CLI with its "collect" and "plot" sub-commands and return the parsed namespace
+    parser = argparse.ArgumentParser(description="Collect and plot QGPU concurrency throughput in ns/day.")
+    subparsers = parser.add_subparsers(dest="command", required=True)  # chosen sub-command name lands in args.command
+
+    collect_parser = subparsers.add_parser("collect", help="Run QGPU concurrency benchmark and write CSV data.")
+    collect_parser.add_argument("--test", help="runTEST.py test name to prepare and benchmark.")
+    collect_parser.add_argument("--data-dir", help="Existing prepared QGPU input directory containing md.csv.")
+    collect_parser.add_argument("--steps", type=positive_int, help="MD steps. Required with --test; optional with --data-dir.")
+    collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")  # "lambda" is a Python keyword, hence dest=
+    collect_parser.add_argument("--shake", action="store_true", help="Enable shake when preparing from --test.")
+    collect_parser.add_argument(
+        "--concurrency",
+        type=positive_int,
+        nargs="+",
+        default=[1, 2, 4, 8],
+        help="Concurrent QGPU simulations to run.",
+    )
+    collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per concurrency level.")
+    collect_parser.add_argument("--label", help="Series label written into the CSV, e.g. 'A100 (thrombin)'.")
+    collect_parser.add_argument("--out", help="Output directory.")
+    collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")
+    collect_parser.add_argument(
+        "--prep-fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
+        help="Path to qdyn_test used only when preparing from --test.",
+    )
+    collect_parser.add_argument("--pause-seconds", type=float, default=0.0, help="Pause between batches.")
+
+    plot_parser = subparsers.add_parser("plot", help="Plot ns/day vs concurrency from one or more CSV files.")
+    plot_parser.add_argument("csv", nargs="+", help="One or more nsday_summary.csv files from collect.")
+    plot_parser.add_argument("--out", required=True, help="Output PNG path.")
+    plot_parser.add_argument(
+        "--metric",
+        choices=["total_ns_per_day", "mean_process_ns_per_day"],  # CSV column read for the y values
+        default="total_ns_per_day",
+        help="Y-axis metric.",
+    )
+    plot_parser.add_argument("--title", default="Multi-System Concurrency (MPS)", help="Plot title.")
+    plot_parser.add_argument(
+        "--subtitle",
+        default="Total simulation throughput at different concurrency levels",
+        help="Plot subtitle.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    """Dispatch to the handler of the selected sub-command and return its result."""
+    args = parse_args()
+    handlers = {"collect": collect, "plot": plot}
+    if args.command not in handlers:
+        raise SystemExit(f"Unknown command: {args.command}")
+    return handlers[args.command](args)
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())  # main()'s return value becomes the process exit status
+    except (FileNotFoundError, RuntimeError, ValueError) as exc:  # reported as a one-line error on stderr
+        print(f"ERROR: {exc}", file=sys.stderr)
+        raise SystemExit(1)
diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py
new file mode 100644
index 00000000..d481244b
--- /dev/null
+++ b/benchmark-qgpu/benchmark_system_scaling.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import json
+import math
+import os
+import sys
+from datetime import datetime
+from pathlib import Path
+from statistics import median
+
+os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib")
+
+import matplotlib
+
+matplotlib.use("Agg")
+from matplotlib import pyplot as plt
+
+from benchmark_test import (
+    ROOT,
+    ns_per_day,
+    prepare_qgpu_input,
+    prepare_restart_with_qdyn_test,
+    resolve_fortran_bin,
+    resolve_qgpu_bin,
+    resolve_test_data,
+    run_fortran_repeats,
+    run_qgpu_repeats,
+    write_md_input,
+)
+
+
+def default_collect_out():  # timestamped results directory used when --out is not given
+    run_tag = f"{datetime.now():%Y%m%d_%H%M%S}_system_scaling"
+    return ROOT.joinpath("benchmark-qgpu", "results", run_tag)
+
+
+def count_atoms(prepared_data_dir):  # -> int, parsed from the first line of <dir>/coords.csv
+    csv_file = Path(prepared_data_dir, "coords.csv")
+    if csv_file.exists():
+        with csv_file.open(encoding="utf-8") as stream:
+            return int(next(stream, "").strip())
+    raise FileNotFoundError(f"coords.csv not found: {csv_file}")
+
+
+def successful_times(records):  # wall-clock seconds (as float) of the records whose return_code is 0
+    return [float(rec["wall_seconds"]) for rec in records if not int(rec["return_code"])]
+
+
+def write_raw_records(records, out_dir):
+    """Dump the per-run ``records`` to ``system_scaling_raw.csv`` inside ``out_dir``.
+
+    ``records`` is an iterable of dicts keyed by the column names below and
+    ``out_dir`` must support the ``/`` operator (e.g. ``pathlib.Path``). The file
+    is rewritten from scratch on every call. Returns the path that was written.
+    """
+    columns = [
+        "test", "runner", "repeat", "command", "return_code",
+        "wall_seconds", "steps", "ns_per_day", "stdout", "stderr",
+    ]
+
+    target = out_dir / "system_scaling_raw.csv"
+    with open(target, "w", newline="", encoding="utf-8") as handle:
+        sheet = csv.DictWriter(handle, fieldnames=columns)
+        sheet.writeheader()
+        for record in records:
+            sheet.writerow(record)
+    return target
+
+
+def write_summary(rows, out_dir, metadata):
+    """Persist the per-system summary table and its run metadata.
+
+    ``rows`` goes to ``system_scaling.csv`` and ``metadata`` is dumped as JSON to
+    ``system_scaling_meta.json``, both inside ``out_dir``. Existing files are
+    overwritten. Returns ``(summary_csv, meta_json)``.
+    """
+    columns = [
+        "test", "atoms", "steps",
+        "fortran_wall_median_s", "qgpu_wall_median_s",
+        "fortran_ns_per_day", "qgpu_ns_per_day",
+        "speedup_x", "fortran_repeats", "qgpu_repeats",
+    ]
+
+    summary_csv = out_dir / "system_scaling.csv"
+    with open(summary_csv, "w", newline="", encoding="utf-8") as table_f:
+        table = csv.DictWriter(table_f, fieldnames=columns)
+        table.writeheader()
+        for row in rows:
+            table.writerow(row)
+
+    meta_json = out_dir / "system_scaling_meta.json"
+    with open(meta_json, "w", encoding="utf-8") as meta_f:
+        json.dump(metadata, meta_f, indent=2)
+    return summary_csv, meta_json
+
+
+def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin):  # -> (summary row or None, raw records)
+    test_dir = out_dir / test_name
+    fortran_dir = test_dir / "fortran"
+    prep_dir = test_dir / "qgpu_prepare"
+    qgpu_runs_dir = test_dir / "qgpu_runs"
+    fortran_dir.mkdir(parents=True, exist_ok=True)
+
+    data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake)
+    print(f"Preparing {test_name}")
+    write_md_input(data, fortran_dir)
+
+    print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))")
+    fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps)
+    if not fortran_ok:  # no successful Fortran run: the QGPU stage is skipped for this test
+        return None, fortran_records
+
+    print(f"Preparing QGPU input for {test_name}")
+    prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+    prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
+    atoms = count_atoms(prepared_data_dir)
+
+    print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
+    qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
+
+    fortran_times = successful_times(fortran_records)
+    qgpu_times = successful_times(qgpu_records)
+    if not fortran_times or not qgpu_times:  # a row needs at least one successful run from each runner
+        return None, [*fortran_records, *qgpu_records]
+
+    fortran_wall = median(fortran_times)  # medians are taken over successful runs only
+    qgpu_wall = median(qgpu_times)
+    row = {
+        "test": test_name,
+        "atoms": atoms,
+        "steps": args.steps,
+        "fortran_wall_median_s": fortran_wall,
+        "qgpu_wall_median_s": qgpu_wall,
+        "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall),
+        "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall),
+        "speedup_x": fortran_wall / qgpu_wall if qgpu_wall > 0 else "",  # empty cell when the QGPU median is not positive
+        "fortran_repeats": len(fortran_records),  # counts attempted runs, failed ones included
+        "qgpu_repeats": len(qgpu_records),
+    }
+    return row, [*fortran_records, *qgpu_records]
+
+
+def collect(args):
+    """Benchmark every requested test with Fortran qdyn and QGPU and write the results.
+
+    ``args`` is the parsed "collect" namespace (``test``, ``steps``, ``repeat``,
+    ``out``, ``fortran_bin``, ``prep_fortran_bin``, ``qgpu_bin``, plus whatever
+    ``collect_one_test`` reads from it).
+
+    The raw per-run CSV and the summary are rewritten after each test so partial
+    results survive an interruption. Raises RuntimeError when any run exited
+    with a non-zero return code; returns 0 otherwise.
+    """
+    out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    fortran_bin = resolve_fortran_bin(args.fortran_bin)
+    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
+    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
+
+    def metadata():
+        # Built on demand so "created_at" reflects the moment each file is written.
+        return {
+            "created_at": datetime.now().isoformat(timespec="seconds"),
+            "tests": args.test,
+            "steps": args.steps,
+            "repeat": args.repeat,
+            "fortran_bin": str(fortran_bin),
+            "prep_fortran_bin": str(prep_fortran_bin),
+            "qgpu_bin": str(qgpu_bin),
+        }
+
+    rows = []
+    raw_records = []
+    try:
+        for test_name in args.test:
+            row, records = collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin)
+            raw_records.extend(records)
+            write_raw_records(raw_records, out_dir)
+            if row is not None:
+                rows.append(row)
+                write_summary(rows, out_dir, metadata())
+    finally:
+        # Runs on success and on error so the raw CSV always reflects what was collected.
+        raw_path = write_raw_records(raw_records, out_dir)
+
+    failures = [record for record in raw_records if int(record["return_code"]) != 0]
+    if failures:
+        first = failures[0]
+        raise RuntimeError(
+            f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. "
+            f"Logs: stdout={first['stdout']} stderr={first['stderr']}; raw CSV: {raw_path}"
+        )
+
+    # Final summary, reached only when every recorded run succeeded.
+    summary_csv, meta_json = write_summary(rows, out_dir, metadata())
+    print(f"Summary CSV: {summary_csv}")
+    print(f"Raw CSV: {raw_path}")
+    print(f"Metadata JSON: {meta_json}")
+    return 0
+
+
+def load_rows(csv_path):
+    """Load ``system_scaling.csv`` rows, converting the numeric columns to float.
+
+    Raises RuntimeError for a file without data rows and ValueError (naming the
+    file, column and test) for a blank, missing or non-numeric cell, e.g. the
+    empty ``speedup_x`` that ``collect_one_test`` writes for a non-positive wall time.
+    """
+    numeric = ("atoms", "steps", "fortran_wall_median_s", "qgpu_wall_median_s",
+               "fortran_ns_per_day", "qgpu_ns_per_day", "speedup_x")
+    with open(csv_path, newline="", encoding="utf-8") as csv_f:
+        rows = [dict(row) for row in csv.DictReader(csv_f)]
+    if not rows:
+        raise RuntimeError(f"No rows found in {csv_path}")
+    for row in rows:
+        for key in numeric:
+            try:
+                row[key] = float(row[key])
+            except (TypeError, ValueError):
+                raise ValueError(f"{csv_path}: invalid {key} value {row[key]!r} for test {row.get('test')!r}") from None
+    return rows
+
+
+def fmt_atoms(atoms):  # label text: "<n> atoms" below 1000, otherwise "<n/1000 to 1 decimal>k atoms"
+    count = int(atoms)
+    if count < 1000:
+        return f"{count} atoms"
+    return f"{count / 1000:.1f}k atoms"
+
+
+def annotate_bars(ax, bars, formatter):
+    """Label each bar in ``bars`` with ``formatter(height)``.
+
+    The text is horizontally centred on the bar and bottom-anchored at the
+    bar's height, drawn on ``ax`` in bold 8-point type. Each bar only needs
+    ``get_x()``, ``get_width()`` and ``get_height()``.
+    """
+    text_style = {"ha": "center", "va": "bottom", "fontsize": 8, "weight": "bold"}
+    for rect in bars:
+        top = rect.get_height()
+        centre = rect.get_x() + rect.get_width() / 2
+        ax.text(centre, top, formatter(top), **text_style)
+
+
+def plot_speedup(rows, out_path, title):  # bar chart of speedup per test plus a "Best" side panel, saved to out_path
+    labels = [row["test"] for row in rows]
+    speedups = [row["speedup_x"] for row in rows]
+    atoms = [row["atoms"] for row in rows]
+
+    fig, (ax, panel) = plt.subplots(  # ax: bar chart; panel: headline-number box on the right
+        1,
+        2,
+        figsize=(9.2, 3.3),
+        gridspec_kw={"width_ratios": [4.3, 1.55]},
+    )
+    x = range(len(rows))
+    bars = ax.bar(x, speedups, color="#0b71c8", width=0.62)
+    annotate_bars(ax, bars, lambda value: f"{value:.1f}x")
+    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
+    ax.set_ylabel("Speedup vs Fortran (x)")
+    ax.set_xticks(list(x))
+    ax.set_xticklabels(labels)
+    ax.grid(axis="y", color="#e5e8ee", linewidth=0.8)
+    ax.set_axisbelow(True)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    for xpos, atom_count in zip(x, atoms):  # atom-count captions under each bar (y given in axes coordinates)
+        ax.text(xpos, -0.08, fmt_atoms(atom_count), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8)
+
+    best = max(rows, key=lambda row: row["speedup_x"])  # row with the largest speedup feeds the side panel
+    panel.set_facecolor("#eef5fd")
+    for spine in panel.spines.values():
+        spine.set_color("#8ab9ef")
+    panel.set_xticks([])
+    panel.set_yticks([])
+    panel.text(0.5, 0.80, "Best", ha="center", va="center", fontsize=12, weight="bold", color="#0b3970")
+    panel.text(0.5, 0.55, f"{best['speedup_x']:.1f}x", ha="center", va="center", fontsize=30, weight="bold", color="#003c7f")
+    panel.text(0.5, 0.35, "speedup", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
+    panel.text(0.5, 0.18, best["test"], ha="center", va="center", fontsize=10, color="#0b3970")
+    panel.text(0.5, 0.08, fmt_atoms(best["atoms"]), ha="center", va="center", fontsize=9, color="#0b3970")
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=220)
+    plt.close(fig)  # release the figure once it has been written
+
+
+def plot_nsday(rows, out_path, title):  # grouped bars of Fortran vs QGPU ns/day per test, saved to out_path
+    labels = [row["test"] for row in rows]
+    x = list(range(len(rows)))
+    width = 0.34  # width of each bar; the pair is centred on the integer x position
+
+    fig, ax = plt.subplots(figsize=(8.6, 3.5))
+    fortran = [row["fortran_ns_per_day"] for row in rows]
+    qgpu = [row["qgpu_ns_per_day"] for row in rows]
+    bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b")
+    bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8")
+    annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}")
+    annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}")
+    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
+    ax.set_ylabel("ns/day")
+    ax.set_xticks(x)
+    ax.set_xticklabels(labels)
+    ax.grid(axis="y", color="#e5e8ee", linewidth=0.8)
+    ax.set_axisbelow(True)
+    ax.legend(frameon=False, loc="best")
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    for xpos, row in zip(x, rows):  # atom-count captions under each group (y given in axes coordinates)
+        ax.text(xpos, -0.08, fmt_atoms(row["atoms"]), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8)
+
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=220)
+    plt.close(fig)  # release the figure once it has been written
+
+
+def plot_atoms(rows, out_path, title):  # line plot of speedup against atom count, one labelled point per test
+    fig, ax = plt.subplots(figsize=(6.5, 3.8))
+    xs = [row["atoms"] for row in rows]
+    ys = [row["speedup_x"] for row in rows]
+    ax.plot(xs, ys, color="#0b71c8", marker="o", linewidth=1.8)  # points are joined in the order rows arrive
+    for row in rows:
+        ax.text(row["atoms"], row["speedup_x"], f" {row['test']} ({row['speedup_x']:.1f}x)", va="center", fontsize=8)
+    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
+    ax.set_xlabel("Atoms")
+    ax.set_ylabel("Speedup vs Fortran (x)")
+    ax.grid(True, color="#e5e8ee", linewidth=0.8)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=220)
+    plt.close(fig)  # release the figure once it has been written
+
+
+def plot(args):
+    """Render the plot selected by ``args.metric`` from ``args.csv`` into ``args.out``.
+
+    Rows are ordered by atom count or test name according to ``args.sort``; returns 0.
+    """
+    rows = load_rows(Path(args.csv).expanduser().resolve())
+    sort_field = "atoms" if args.sort == "atoms" else "test"
+    rows.sort(key=lambda row: row[sort_field])
+    out_path = Path(args.out).expanduser().resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    renderers = {"speedup": plot_speedup, "nsday": plot_nsday, "atoms": plot_atoms}
+    if args.metric not in renderers:
+        raise SystemExit(f"Unknown metric: {args.metric}")
+    renderers[args.metric](rows, out_path, args.title)
+    print(f"Plot written to: {out_path}")
+    return 0
+
+
+def positive_int(value):  # argparse type= callable: int(value), rejected when below 1
+    number = int(value)
+    if number >= 1:
+        return number
+    raise argparse.ArgumentTypeError("must be >= 1")
+
+
+def parse_args():  # build the CLI with its "collect" and "plot" sub-commands and return the parsed namespace
+    parser = argparse.ArgumentParser(description="Collect and plot QGPU scaling across molecular systems.")
+    subparsers = parser.add_subparsers(dest="command", required=True)  # chosen sub-command name lands in args.command
+
+    collect_parser = subparsers.add_parser("collect", help="Run Fortran/QGPU benchmark for multiple tests.")
+    collect_parser.add_argument("--test", nargs="+", required=True, help="runTEST.py test names.")
+    collect_parser.add_argument("--steps", type=positive_int, required=True, help="MD steps.")
+    collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")  # "lambda" is a Python keyword, hence dest=
+    collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
+    collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.")
+    collect_parser.add_argument("--out", help="Output directory.")
+    collect_parser.add_argument(
+        "--fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"),
+        help="Path to production Fortran qdyn binary.",
+    )
+    collect_parser.add_argument(
+        "--prep-fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
+        help="Path to qdyn_test used only to prepare QGPU restart CSVs.",
+    )
+    collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")
+
+    plot_parser = subparsers.add_parser("plot", help="Plot system scaling from system_scaling.csv.")
+    plot_parser.add_argument("csv", help="system_scaling.csv from collect.")
+    plot_parser.add_argument("--out", required=True, help="Output PNG path.")
+    plot_parser.add_argument(
+        "--metric",
+        choices=["speedup", "nsday", "atoms"],  # each choice maps to one plot_* function
+        default="speedup",
+        help="Plot style.",
+    )
+    plot_parser.add_argument("--sort", choices=["atoms", "test"], default="atoms", help="System order.")
+    plot_parser.add_argument("--title", default="Performance Across Molecular Systems", help="Plot title.")
+    return parser.parse_args()
+
+
+def main():
+    """Dispatch to the handler of the selected sub-command and return its result."""
+    args = parse_args()
+    handlers = {"collect": collect, "plot": plot}
+    if args.command not in handlers:
+        raise SystemExit(f"Unknown command: {args.command}")
+    return handlers[args.command](args)
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())  # main()'s return value becomes the process exit status
+    except (FileNotFoundError, RuntimeError, ValueError) as exc:  # reported as a one-line error on stderr
+        print(f"ERROR: {exc}", file=sys.stderr)
+        raise SystemExit(1)
diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py
new file mode 100644
index 00000000..a30695f0
--- /dev/null
+++ b/benchmark-qgpu/benchmark_test.py
@@ -0,0 +1,527 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import json
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+import time
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+from statistics import median
+
+os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib")
+
+import matplotlib
+
+matplotlib.use("Agg")
+from matplotlib import pyplot as plt
+
+
+ROOT = Path(__file__).resolve().parents[1]
+TIME_STEP_NS = 2e-6
+
+sys.path.insert(0, str(ROOT / "test"))
+sys.path.insert(0, str(ROOT / "src" / "qligfep-newbin-unfinished"))
+
+import runTEST  # noqa: E402
+import qdyn as qdyn_prepare  # noqa: E402
+
+
+@contextmanager
+def pushd(path):  # temporarily chdir into `path`; the previous cwd is restored on exit, even on error
+    origin = os.getcwd()
+    os.chdir(path)
+    try:
+        yield
+    finally:
+        os.chdir(origin)
+
+
+def abs_path(path):  # None passes through; otherwise "~" is expanded and the path resolved to an absolute str
+    if path is not None:
+        return str(Path(path).expanduser().resolve())
+    return None
+
+
+def command_text(args):  # shell-quoted, space-joined rendering of an argv sequence, for display
+    return " ".join(map(shlex.quote, map(str, args)))
+
+
+def resolve_qgpu_bin(path):
+    """Return ``path`` (made absolute) or the first existing default binary; raise FileNotFoundError if none exists."""
+    if path:
+        explicit = Path(path).expanduser()
+        explicit = explicit if explicit.is_absolute() else (Path.cwd() / explicit).resolve()
+        if explicit.exists():
+            return explicit
+        raise FileNotFoundError(f"QGPU binary not found: {explicit}")
+    fallbacks = [ROOT / "bin" / "qdyn", ROOT / "src" / "core" / "qdyn"]
+    found = next((option for option in fallbacks if option.exists()), None)
+    if found is None:
+        raise FileNotFoundError(
+            "QGPU binary not found. Expected bin/qdyn or src/core/qdyn, "
+            "or pass --qgpu-bin."
+        )
+    return found
+
+
+def resolve_fortran_bin(path):  # absolute Path to an existing Fortran binary, else FileNotFoundError
+    binary = Path(path).expanduser()
+    if not binary.is_absolute():
+        binary = (Path.cwd() / binary).resolve()
+    if binary.exists():
+        return binary
+    raise FileNotFoundError(f"Fortran binary not found: {binary}")
+
+
+def resolve_test_data(test_name, steps, lambda_name, shake):  # build the settings dict handed to the runTEST/qdyn_prepare helpers
+    testinfo = runTEST.get_default_testinfo()
+    if test_name not in testinfo:
+        available = ", ".join(sorted(testinfo))
+        raise ValueError(f"Unknown test '{test_name}'. Available tests: {available}")
+
+    topdir = ROOT / "test" / "data" / "topology"
+    inputdir = ROOT / "test" / "data" / "inputs"
+    info = testinfo[test_name]  # indexed below as: [0] topology file, [2] FEP file, [3] restraints file
+    topfile = info[0]
+    if len(info) >= 3 and lambda_name is not None:  # insert "_<lambda>" before the topology file extension
+        stem, suffix = topfile.rsplit(".", 1)
+        topfile = f"{stem}_{lambda_name}.{suffix}"
+
+    data = {
+        "avg": False,
+        "curtest": None,
+        "fep_path": None,
+        "inputdir": str(inputdir),
+        "lambda": lambda_name,
+        "plot": False,
+        "restraints_path": None,
+        "shake": shake,
+        "test": test_name,
+        "testinfo": testinfo,
+        "timestep": str(steps),  # NOTE(review): despite the key name this carries the step count — confirm against runTEST
+        "topdir": str(topdir),
+        "topfile": topfile,
+        "topology_path": str(topdir / topfile),
+        "verbose": False,
+    }
+    if len(info) >= 3:
+        data["fep_path"] = str(inputdir / info[2])
+    if len(info) >= 4:
+        data["restraints_path"] = str(inputdir / info[3])
+
+    required = [Path(data["topology_path"])]  # every referenced input must exist before anything is run
+    if data["fep_path"] is not None:
+        required.append(Path(data["fep_path"]))
+    if data["restraints_path"] is not None:
+        required.append(Path(data["restraints_path"]))
+    missing = [str(path) for path in required if not path.exists()]
+    if missing:
+        raise FileNotFoundError("Required input file(s) not found: " + ", ".join(missing))
+
+    return data
+
+
+def run_timed(args, cwd, stdout_path, stderr_path):
+    """Run ``args`` in ``cwd`` with output redirected to the two log files; return ``(returncode, wall_seconds)``."""
+    with open(stdout_path, "w", encoding="utf-8") as stdout_f, open(
+        stderr_path, "w", encoding="utf-8"
+    ) as stderr_f:
+        start = time.perf_counter()  # time only the child process, not log-file setup/teardown
+        completed = subprocess.run(args, cwd=cwd, stdout=stdout_f, stderr=stderr_f)
+        return completed.returncode, time.perf_counter() - start
+
+
+def ns_per_day(steps, wall_seconds, time_step_ns=None):
+    """Return ns/day for ``steps`` steps of ``time_step_ns`` ns each (default ``TIME_STEP_NS``); None if ``wall_seconds`` <= 0."""
+    step_ns = TIME_STEP_NS if time_step_ns is None else time_step_ns
+    return None if wall_seconds <= 0 else steps * step_ns * 86400 / wall_seconds
+
+
+def write_md_input(data, fortran_dir):  # stores fortran_dir in data["curtest"], then calls runTEST.create_MD_input from inside it
+    data["curtest"] = str(fortran_dir)
+    with pushd(fortran_dir):  # the helper runs with fortran_dir as the current working directory
+        runTEST.create_MD_input(data)
+
+
+def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps):
+    """Run ``fortran_bin eq1.inp`` in ``fortran_dir`` ``repeat`` times, timing each run.
+
+    Returns ``(records, saw_success)``: one record dict per run and whether at
+    least one run exited with code 0. Logs are ``fortran.log``/``fortran.err``
+    for a single repeat, otherwise ``fortran_<i>.log``/``fortran_<i>.err``.
+    """
+    command = [str(fortran_bin), "eq1.inp"]
+    records = []
+    for attempt in range(1, repeat + 1):
+        tag = "fortran" if repeat == 1 else f"fortran_{attempt}"
+        log_path = fortran_dir / f"{tag}.log"
+        err_path = fortran_dir / f"{tag}.err"
+        status, elapsed = run_timed(command, fortran_dir, log_path, err_path)
+        records.append(
+            {
+                "test": data["test"],
+                "runner": "fortran",
+                "repeat": attempt,
+                "command": command_text(command),
+                "return_code": status,
+                "wall_seconds": elapsed,
+                "steps": steps,
+                "ns_per_day": ns_per_day(steps, elapsed),
+                "stdout": str(log_path),
+                "stderr": str(err_path),
+            }
+        )
+    return records, any(entry["return_code"] == 0 for entry in records)
+
+
+def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir):  # run prep_fortran_bin on eq1.inp, then hand off to runTEST.Parse_Q6_data
+    stdout_path = fortran_dir / "restart_prep_qdyn_test.log"
+    stderr_path = fortran_dir / "restart_prep_qdyn_test.err"
+    args = [str(prep_fortran_bin), "eq1.inp"]
+    return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path)  # wall time of the preparation run is discarded
+    if return_code != 0:
+        raise RuntimeError(
+            "QGPU restart preparation failed. "
+            f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}"
+        )
+
+    shutil.copyfile(stdout_path, fortran_dir / "eq1.log")  # presumably the log name Parse_Q6_data reads — TODO confirm
+    with pushd(fortran_dir):
+        runTEST.Parse_Q6_data(data)
+
+
+def prepare_qgpu_input(data, fortran_dir, prep_dir):  # build the QGPU input tree under prep_dir; returns the prepared data directory
+    prep_dir.mkdir(parents=True, exist_ok=True)
+    restart_dir = prep_dir / "restart"
+    restart_dir.mkdir(exist_ok=True)
+    shutil.copyfile(fortran_dir / "coords.csv", restart_dir / "coords.csv")  # restart inputs are taken from fortran_dir
+    shutil.copyfile(fortran_dir / "velocities.csv", restart_dir / "velocities.csv")
+
+    top_stem = Path(data["topfile"]).stem
+    wd_rel = f"TEST/{top_stem}"  # relative working dir; the qdyn_prepare calls below run with cwd = prep_dir
+    with pushd(prep_dir):
+        qdyn_prepare.Create_Environment(top=data["topology_path"], wd=wd_rel)
+        qdyn_prepare.Prepare_Topology(top=data["topology_path"], wd=wd_rel)
+        qdyn_prepare.Prepare_MD(top=data["topology_path"], md=str(fortran_dir / "eq1.inp"), wd=wd_rel)
+        qdyn_prepare.Prepare_FEP(
+            fepfile=data["fep_path"],  # NOTE(review): may be None if the test has no FEP file — confirm Prepare_FEP accepts it
+            wd=wd_rel,
+            top=data["topology_path"],
+        )
+        qdyn_prepare.Read_Restart(restart=str(restart_dir), wd=wd_rel, top=data["topology_path"])
+
+    prepared_data_dir = prep_dir / wd_rel
+    if not (prepared_data_dir / "md.csv").exists():  # sanity check that preparation produced md.csv
+        raise RuntimeError(f"Prepared QGPU data is missing md.csv: {prepared_data_dir}")
+    return prepared_data_dir
+
+
+def run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, repeat, steps):  # -> list of per-run record dicts
+    qgpu_runs_dir.mkdir(parents=True, exist_ok=True)
+    records = []
+
+    for index in range(1, repeat + 1):
+        run_dir = qgpu_runs_dir / f"repeat_{index:03d}"
+        data_dir = run_dir / prepared_data_dir.name
+        if run_dir.exists():  # wipe leftovers so every repeat starts from a clean directory
+            shutil.rmtree(run_dir)
+        run_dir.mkdir(parents=True)
+        shutil.copytree(prepared_data_dir, data_dir)  # each repeat works on its own copy of the prepared input
+
+        stdout_path = run_dir / "qgpu.log"
+        stderr_path = run_dir / "qgpu.err"
+        args = [str(qgpu_bin), "--gpu", str(data_dir)]
+        return_code, wall_seconds = run_timed(args, ROOT, stdout_path, stderr_path)  # executed with ROOT as the working directory
+        records.append(
+            {
+                "test": data["test"],
+                "runner": "qgpu",
+                "repeat": index,
+                "command": command_text(args),
+                "return_code": return_code,
+                "wall_seconds": wall_seconds,
+                "steps": steps,
+                "ns_per_day": ns_per_day(steps, wall_seconds),
+                "stdout": str(stdout_path),
+                "stderr": str(stderr_path),
+            }
+        )
+
+    return records
+
+
+def write_summary_csv(records, out_dir):
+    """Write the per-run ``records`` to ``summary.csv`` inside ``out_dir``.
+
+    ``records`` is an iterable of dicts keyed by the column names below and
+    ``out_dir`` must support the ``/`` operator (e.g. ``pathlib.Path``). The file
+    is rewritten from scratch on every call. Returns the path that was written.
+    """
+    columns = [
+        "test", "runner", "repeat", "command", "return_code",
+        "wall_seconds", "steps", "ns_per_day", "stdout", "stderr",
+    ]
+
+    target = out_dir / "summary.csv"
+    with open(target, "w", newline="", encoding="utf-8") as handle:
+        sheet = csv.DictWriter(handle, fieldnames=columns)
+        sheet.writeheader()
+        for record in records:
+            sheet.writerow(record)
+    return target
+
+
+def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):  # build the summary dict: CLI args, binaries, per-test medians
+    by_test = {}  # test -> runner -> list of records
+    for record in records:
+        by_test.setdefault(record["test"], {}).setdefault(record["runner"], []).append(record)
+
+    tests = []
+    for test_name in sorted(by_test):
+        fortran_records = by_test[test_name].get("fortran", [])
+        qgpu_records = by_test[test_name].get("qgpu", [])
+        fortran_ok = [r["wall_seconds"] for r in fortran_records if r["return_code"] == 0]
+        qgpu_ok = [r["wall_seconds"] for r in qgpu_records if r["return_code"] == 0]
+        if not fortran_ok or not qgpu_ok:  # tests lacking a successful run from either runner are left out
+            continue
+        fortran_median = median(fortran_ok)
+        qgpu_median = median(qgpu_ok)
+        speedup = fortran_median / qgpu_median if qgpu_median > 0 else None
+        tests.append(
+            {
+                "test": test_name,
+                "fortran_median_seconds": fortran_median,
+                "qgpu_median_seconds": qgpu_median,
+                "speedup_x": speedup,
+                "improvement_pct": (speedup - 1) * 100 if speedup is not None else None,  # speedup as a percentage gain
+                "fortran_repeats": len(fortran_records),  # counts attempted runs, failed ones included
+                "qgpu_repeats": len(qgpu_records),
+            }
+        )
+
+    return {
+        "created_at": datetime.now().isoformat(timespec="seconds"),
+        "args": {
+            "test": args.test,
+            "steps": args.steps,
+            "lambda": args.lambda_name,
+            "shake": args.shake,
+            "repeat": args.repeat,
+        },
+        "binaries": {
+            "fortran": str(fortran_bin),
+            "restart_prep_fortran": str(prep_fortran_bin),
+            "qgpu": str(qgpu_bin),
+        },
+        "tests": tests,
+    }
+
+
+def write_summary_json(summary, out_dir):  # Dump the summary dict to out_dir/summary.json; returns that path.
+    json_path = out_dir / "summary.json"
+    with open(json_path, "w", encoding="utf-8") as json_f:
+        json.dump(summary, json_f, indent=2)  # indent=2 keeps the file human-readable
+    return json_path
+
+
+def plot_speedup(summary, out_dir):  # Draw out_dir/speedup.png from summary["tests"]; returns its path, or None if no tests.
+    tests = summary["tests"]
+    if not tests:
+        return None
+
+    fig_width = max(8.0, 2.0 + len(tests) * 1.2)  # widen the figure as more tests are plotted
+    fig, (ax, panel) = plt.subplots(
+        1,
+        2,
+        figsize=(fig_width, 3.0),
+        gridspec_kw={"width_ratios": [3.6, 1.8]},
+    )
+
+    x_positions = list(range(len(tests)))
+    width = 0.34
+    fortran_times = [item["fortran_median_seconds"] for item in tests]
+    qgpu_times = [item["qgpu_median_seconds"] for item in tests]
+    labels = [item["test"] for item in tests]
+
+    ax.bar([x - width / 2 for x in x_positions], fortran_times, width, label="Fortran", color="#9b9b9b")
+    ax.bar([x + width / 2 for x in x_positions], qgpu_times, width, label="QGPU", color="#0b71c8")
+    ax.set_title("Execution Time (s)", fontsize=11, weight="bold")
+    ax.set_ylabel("Time (s)")
+    ax.set_xticks(x_positions)
+    ax.set_xticklabels(labels, rotation=0 if len(labels) <= 3 else 30, ha="center")  # tilt labels once there are more than 3
+    ax.legend(frameon=False, loc="upper right")
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    ax.grid(axis="y", color="#e7e7e7", linewidth=0.8)
+    ax.set_axisbelow(True)
+
+    for x, value in zip([x - width / 2 for x in x_positions], fortran_times):  # value labels on top of each bar
+        ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold")
+    for x, value in zip([x + width / 2 for x in x_positions], qgpu_times):
+        ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold")
+
+    if len(tests) == 1:  # single test: dashed arrow pointing from the Fortran bar towards the QGPU bar
+        ymax = max(fortran_times[0], qgpu_times[0])
+        ax.annotate(
+            "",
+            xy=(x_positions[0] + width / 2, qgpu_times[0] + ymax * 0.15),
+            xytext=(x_positions[0] - width / 2, fortran_times[0] * 0.85),
+            arrowprops={"arrowstyle": "->", "linestyle": "--", "color": "#0b71c8", "lw": 1.2},
+        )
+
+    best = max(tests, key=lambda item: item["speedup_x"] or 0)  # a None speedup ranks as 0
+    panel.set_facecolor("#eef5fd")
+    for spine in panel.spines.values():
+        spine.set_color("#8ab9ef")
+    panel.set_xticks([])
+    panel.set_yticks([])
+    panel.text(0.5, 0.82, "Up to", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
+    panel.text(
+        0.5,
+        0.52,
+        f"{best['speedup_x']:.1f}x" if best["speedup_x"] is not None else "n/a",  # speedup_x may be None; ':.1f' would raise
+        ha="center",
+        va="center",
+        fontsize=32,
+        weight="bold",
+        color="#003c7f",
+    )
+    panel.text(0.5, 0.28, "speedup", ha="center", va="center", fontsize=14, weight="bold", color="#0b3970")
+    panel.text(0.5, 0.12, "(vs. Fortran)", ha="center", va="center", fontsize=10, color="#0b3970")
+
+    fig.tight_layout()
+    png_path = out_dir / "speedup.png"
+    fig.savefig(png_path, dpi=200)
+    plt.close(fig)  # release the figure so repeated calls do not accumulate open figures
+    return png_path
+
+
+def default_out_dir(test_names):  # Default results directory: ROOT/benchmark-qgpu/results/<timestamp>_<label>.
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # local time, second resolution
+    label = test_names[0] if len(test_names) == 1 else "multi"  # the test name for one test, otherwise "multi"
+    return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{label}"
+
+
+def parse_args():  # Parse the benchmark command line; cross-option checks live in validate_args().
+    parser = argparse.ArgumentParser(description="Benchmark Fortran vs QGPU for runTEST.py test cases.")
+    parser.add_argument("--test", nargs="+", help="Test name(s) from test/runTEST.py.")
+    parser.add_argument("--list-tests", action="store_true", help="List available tests and exit.")
+    parser.add_argument("--steps", type=int, help="MD steps to write into eq1.inp.")
+    parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")  # dest renamed: `lambda` is a Python keyword
+    parser.add_argument("--shake", action="store_true", help="Enable shake in generated MD input.")
+    parser.add_argument("--repeat", type=int, default=1, help="Number of repeats for each runner.")
+    parser.add_argument("--out", default=None, help="Output directory.")
+    parser.add_argument(
+        "--fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"),
+        help="Path to production Fortran qdyn binary used for timed Fortran runs.",
+    )
+    parser.add_argument(
+        "--prep-fortran-bin",
+        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
+        help="Path to qdyn_test binary used only to prepare QGPU restart CSVs.",
+    )
+    parser.add_argument("--qgpu-bin", default=None, help="Path to QGPU qdyn binary.")  # None is passed on to resolve_qgpu_bin()
+    return parser.parse_args()
+
+
+def validate_args(args):  # Exit via SystemExit with a message when required options are missing or out of range.
+    if args.list_tests:
+        return  # --list-tests needs no other option
+    if not args.test:
+        raise SystemExit("--test is required unless --list-tests is used.")
+    if args.steps is None:
+        raise SystemExit("--steps is required unless --list-tests is used.")
+    if args.steps < 1:
+        raise SystemExit("--steps must be >= 1.")
+    if args.repeat < 1:
+        raise SystemExit("--repeat must be >= 1.")
+
+
+def main():  # Run the Fortran and QGPU benchmarks for each requested test and write CSV/JSON/PNG reports; returns 0.
+    args = parse_args()
+    validate_args(args)
+
+    testinfo = runTEST.get_default_testinfo()
+    if args.list_tests:
+        for test_name in sorted(testinfo):
+            print(test_name)
+        return 0
+
+    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
+    fortran_bin = resolve_fortran_bin(args.fortran_bin)
+    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
+    out_dir = Path(args.out).expanduser().resolve() if args.out else default_out_dir(args.test)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    all_records = []
+    try:
+        for test_name in args.test:
+            test_dir = out_dir / test_name
+            fortran_dir = test_dir / "fortran"
+            prep_dir = test_dir / "qgpu_prepare"
+            qgpu_runs_dir = test_dir / "qgpu_runs"
+            fortran_dir.mkdir(parents=True, exist_ok=True)
+
+            data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake)
+            print(f"Preparing Fortran input for {test_name} in {fortran_dir}")
+            write_md_input(data, fortran_dir)
+
+            print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))")
+            fortran_records, fortran_ok = run_fortran_repeats(
+                data, fortran_bin, fortran_dir, args.repeat, args.steps
+            )
+            all_records.extend(fortran_records)
+            if not fortran_ok:
+                continue  # skip the QGPU side of this test; the failure is raised after the loop
+
+            print(f"Preparing QGPU restart with qdyn_test for {test_name}")
+            prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+
+            print(f"Preparing QGPU CSV input for {test_name}")
+            prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
+
+            print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
+            all_records.extend(
+                run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
+            )
+    finally:
+        write_summary_csv(all_records, out_dir)  # persist the rows collected so far even if a step above raised
+
+    failures = [record for record in all_records if record["return_code"] != 0]
+    if failures:
+        first = failures[0]
+        raise RuntimeError(
+            f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. "
+            f"Logs: stdout={first['stdout']} stderr={first['stderr']}"
+        )
+
+    summary = summarize(all_records, args, qgpu_bin, fortran_bin, prep_fortran_bin)
+    csv_path = write_summary_csv(all_records, out_dir)  # written again to obtain the path reported below
+    json_path = write_summary_json(summary, out_dir)
+    png_path = plot_speedup(summary, out_dir)
+
+    print(f"Summary CSV: {csv_path}")
+    print(f"Summary JSON: {json_path}")
+    if png_path is not None:
+        print(f"Speedup plot: {png_path}")
+    for item in summary["tests"]:
+        # summarize() leaves speedup_x as None when the QGPU median is not positive; ':.2f' would raise on it.
+        speedup = "n/a" if item["speedup_x"] is None else f"{item['speedup_x']:.2f}x"
+        fortran_s, qgpu_s = item["fortran_median_seconds"], item["qgpu_median_seconds"]
+        print(f"{item['test']}: Fortran {fortran_s:.3f}s, QGPU {qgpu_s:.3f}s, speedup {speedup}")
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())  # process exit status is main()'s return value
+    except (FileNotFoundError, RuntimeError, ValueError) as exc:  # these are reported as a one-line error, no traceback
+        print(f"ERROR: {exc}", file=sys.stderr)
+        raise SystemExit(1)
diff --git a/test/runTEST.py b/test/runTEST.py
index ea8d5256..0b694155 100644
--- a/test/runTEST.py
+++ b/test/runTEST.py
@@ -17,6 +17,92 @@
 
 lambdas = ['eq5', '0744_0256', '0998_0002']
 
+
+def get_default_testinfo():  # Built-in test table; a fresh dict is returned on every call.
+    return {  # test name -> [topology file, shell radius, optional FEP file, optional restraints file]
+                'p-p'               : [
+                                        'benzene-vacuum.top',
+                                        '20'
+                                      ],
+                'q-p_benzene'       : [
+                                       'Na-benzene-vacuum.top',
+                                       '20',
+                                       'FEP_benzene.fep'
+                                      ],
+                'q-p_Na'            : [
+                                       'Na-benzene-vacuum.top',
+                                       '20',
+                                       'FEP_Na.fep'
+                                      ],
+                'q-p-w_benzene'     : [
+                                       'Na-benzene-water.top',
+                                       '20',
+                                       'FEP_benzene.fep'
+                                      ],
+                'q-p-w_Na'          : [
+                                       'Na-benzene-water.top',
+                                       '20',
+                                       'FEP_Na.fep'
+                                      ],
+                'q-q'               : [
+                                       'benzene-vacuum.top',
+                                       '20',
+                                       'FEP_benzene.fep'
+                                      ],
+                'w-p'               : [
+                                       'benzene-water.top',
+                                       '20'
+                                      ],
+                'w-q'               : [
+                                       'benzene-water.top',
+                                       '20',
+                                       'FEP_benzene.fep'
+                                      ],
+                'w-w'               : [
+                                       'water.top',
+                                       '20'
+                                      ],
+                'boundary'          : [
+                                       'ala_wat.top',
+                                       '14'
+                                      ],
+                'polypeptide'       : [
+                                       'ala_wat.top',
+                                       '15'
+                                      ],
+                'polypeptide25'     : [
+                                       'ala_wat25.top',
+                                       '25'
+                                      ],
+                'q-q-large_vac'     : [
+                                       'dualtop_vacuum.top',
+                                       '22',
+                                       'dualtop.fep'
+                                      ],
+                'cdk2'              : [
+                                       'cdk2.top',
+                                       '22',
+                                       'FEPm_cdk2.fep',
+                                       'restraints_cdk2.inp'
+                                      ],
+                'thrombin'          : [
+                                       'thrombin.top',
+                                       '25',
+                                       'FEPm_thrombin.fep',
+                                       'restraints_thrombin.inp'
+                                      ],
+            }
+
+
+def resolve_path(path, base_dir=None):  # Return an absolute form of path; None passes through unchanged.
+    if path is None:
+        return None
+    if os.path.isabs(path):
+        return path  # already absolute: returned as given, without normalization
+    if base_dir is not None:
+        return os.path.abspath(os.path.join(base_dir, path))  # relative paths resolve against base_dir
+    return os.path.abspath(path)  # no base_dir: resolve against the current working directory
+
 class Create_Environment(object):
     """
         Creates the workdirectory environment.
@@ -44,7 +130,7 @@ def __init__(self,data):
         _inv_lambda = None
         # Check if a lambda has been specified
 
-        if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None:
+        if data.get('fep_path') is not None and data['lambda'] is not None:
             if not data['lambda'].startswith('eq'):
                 str_lambda = data['lambda'].split("_")[0]
                 str_inv_lambda = data['lambda'].split("_")[1]
@@ -86,31 +172,32 @@ def __init__(self,data):
 non_bond                  1
 
 [files]
-topology                  {}{}
+topology                  {}
 final                     eq1.re
 """.format(data['timestep'],
            shake, shake, shake,
            data['testinfo'][data['test']][1],
-           data['topdir'],
-           data['topfile'])
-        if len(data['testinfo'][test]) >= 3:
-            filename = data['testinfo'][data['test']][2]
+           data['topology_path'])
+        if data.get('fep_path') is not None:
+            filename = data['fep_path']
+            fep_name = os.path.basename(filename)
 
             fep_part = """fep                       {}{}
 
 [lambdas]
-""".format(data['inputdir'], filename)
+""".format("" if os.path.isabs(filename) else "",
+           filename)
             if _lambda is not None:
                 fep_part += _lambda + " " + _inv_lambda + "\n"
             else:
-                if filename.startswith("FEPm"):
+                if fep_name.startswith("FEPm"):
                     fep_part += "0.500 0.500\n"
                 else:
                     fep_part += "1.000 0.000\n"
             md_content = md_content + fep_part
         # Check if there are boundary conditions
-        if len(data['testinfo'][test]) >= 4:
-            filename = data['inputdir'] + '/' + data['testinfo'][data['test']][3]
+        if data.get('restraints_path') is not None:
+            filename = data['restraints_path']
             with open(filename, 'r') as f:
                 restraint_part = f.read()
                 md_content = md_content + restraint_part
@@ -192,9 +279,7 @@ def __init__(self,data):
                     outfile.write('{}\n'.format(v))
 
         # Parse the topology
-        Qtopology = '{}{}'.format(data['topdir'],
-                                  data['topfile'])
-        read_top = TOPOLOGY.Read_Topology(Qtopology)
+        read_top = TOPOLOGY.Read_Topology(data['topology_path'])
         top_data = read_top.Q()
         with open('coords.csv','w') as outfile:
             outfile.write('{}\n'.format(len(top_data['coords'])))
@@ -210,17 +295,16 @@ def __init__(self,data):
         shutil.copy('coords.csv', 'tmp/coords.csv')
         args = [
                 ' {}src/bin/qdyn.py'.format(settings.ROOT),
-                '-t', '{}{}'.format(data['topdir'],
-                                   data['topfile']),
+                '-t', data['topology_path'],
                 '-m', 'eq1.inp',
                 '-d', 'TEST',
                 '-r', 'tmp'
                ]
 
         # FEP file?
-        if len(data['testinfo'][data['test']]) >= 3:
+        if data.get('fep_path') is not None:
             args.append('-f')
-            args.append('{}{}'.format(data['inputdir'],data['testinfo'][data['test']][2]))
+            args.append(data['fep_path'])
 
         if data['verbose']:
             args.append('--verbose')
@@ -330,8 +414,8 @@ def __init__(self, data):
         self.data = data
         self.data['curdir'] = os.getcwd()
         self.data['executable'] = sys.executable
-        self.data['topdir'] = '{}test/data/topology/'.format(settings.ROOT)
-        self.data['inputdir']   = '{}test/data/inputs/'.format(settings.ROOT)
+        self.data['topdir'] = os.path.join(settings.ROOT, 'test/data/topology')
+        self.data['inputdir'] = os.path.join(settings.ROOT, 'test/data/inputs')
         # Step = step + 1
         self.data['timestep'] = '{}'.format(int(self.data['timestep'])+1)
 
@@ -340,79 +424,20 @@ def __init__(self, data):
         if self.data['wd'][-1] != '/':
             self.data['wd'] = self.data['wd'] + '/'
 
-        self.data['testinfo'] = {
-                    'p-p'               : [
-                                            'benzene-vacuum.top',
-                                            '20'
-                                          ],
-                    'q-p_benzene'       : [
-                                           'Na-benzene-vacuum.top',
-                                           '20',
-                                           'FEP_benzene.fep'
-                                          ],
-                    'q-p_Na'            : [
-                                           'Na-benzene-vacuum.top',
-                                           '20',
-                                           'FEP_Na.fep'
-                                          ],
-                    'q-p-w_benzene'     : [
-                                           'Na-benzene-water.top',
-                                           '20',
-                                           'FEP_benzene.fep'
-                                          ],
-                    'q-p-w_Na'          : [
-                                           'Na-benzene-water.top',
-                                           '20',
-                                           'FEP_Na.fep'
-                                          ],
-                    'q-q'               : [
-                                           'benzene-vacuum.top',
-                                           '20',
-                                           'FEP_benzene.fep'
-                                          ],
-                    'w-p'               : [
-                                           'benzene-water.top',
-                                           '20'
-                                          ],
-                    'w-q'               : [
-                                           'benzene-water.top',
-                                           '20',
-                                           'FEP_benzene.fep'                                            
-                                          ],
-                    'w-w'               : [
-                                           'water.top',
-                                           '20'
-                                          ],
-                    'boundary'          : [
-                                           'ala_wat.top',
-                                           '14'
-                                          ],
-                    'polypeptide'       : [
-                                           'ala_wat.top',
-                                           '15'                       
-                                          ],
-                    'polypeptide25'     : [
-                                           'ala_wat25.top',
-                                           '25'
-                                          ],
-                    'q-q-large_vac'     : [
-                                           'dualtop_vacuum.top',
-                                           '22',
-                                           'dualtop.fep'
-                                          ],
-                    'cdk2'              : [
-                                           'cdk2.top',
-                                           '22',
-                                           'FEPm_cdk2.fep',
-                                           'restraints_cdk2.inp'
-                                          ],
-                    'thrombin'          : [
-                                           'thrombin.top',
-                                           '25',
-                                           'FEPm_thrombin.fep',
-                                           'restraints_thrombin.inp'
-                                          ],
-                }
+        self.data['testinfo'] = get_default_testinfo()
+
+        if self.data['custom_top'] is not None:
+            custom_info = [
+                os.path.basename(self.data['custom_top']),
+                self.data['custom_shell_radius']
+            ]
+            if self.data['custom_fep'] is not None:
+                custom_info.append(os.path.basename(self.data['custom_fep']))
+            if self.data['custom_restraints'] is not None:
+                while len(custom_info) < 3:
+                    custom_info.append(None)
+                custom_info.append(os.path.basename(self.data['custom_restraints']))
+            self.data['testinfo'][self.data['custom_name']] = custom_info
 
         tests = data['testinfo'].keys()
         if self.data['run'] is not None:
@@ -422,9 +447,21 @@ def __init__(self, data):
             self.data['test'] = test
             self.data['curtest'] = self.data['wd'] + test
             _topfile = data['testinfo'][data['test']][0]
-            if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None:
+            if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None and test != self.data['custom_name']:
                 _topfile = _topfile.split(".")[0] + "_" + data['lambda'] + "." + _topfile.split(".")[1]
             self.data['topfile'] = _topfile
+            if test == self.data['custom_name'] and self.data['custom_top'] is not None:
+                self.data['topology_path'] = self.data['custom_top']
+                self.data['fep_path'] = self.data['custom_fep']
+                self.data['restraints_path'] = self.data['custom_restraints']
+            else:
+                self.data['topology_path'] = os.path.join(self.data['topdir'], self.data['topfile'])
+                self.data['fep_path'] = None
+                self.data['restraints_path'] = None
+                if len(data['testinfo'][test]) >= 3:
+                    self.data['fep_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][2])
+                if len(data['testinfo'][test]) >= 4:
+                    self.data['restraints_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][3])
             # INIT
             Create_Environment(self.data)
             
@@ -515,6 +552,36 @@ def __init__(self, data):
                         required = False,
                         help = "Specify a particular phase of the perturbation")
 
+    parser.add_argument('--custom-top',
+                        dest = "custom_top",
+                        default = None,
+                        required = False,
+                        help = "Path to a custom topology file to add as a test")
+
+    parser.add_argument('--custom-shell-radius',
+                        dest = "custom_shell_radius",
+                        default = '25',
+                        required = False,
+                        help = "Shell radius to use with --custom-top")
+
+    parser.add_argument('--custom-fep',
+                        dest = "custom_fep",
+                        default = None,
+                        required = False,
+                        help = "Optional FEP file for --custom-top")
+
+    parser.add_argument('--custom-restraints',
+                        dest = "custom_restraints",
+                        default = None,
+                        required = False,
+                        help = "Optional restraints file for --custom-top")
+
+    parser.add_argument('--custom-name',
+                        dest = "custom_name",
+                        default = 'custom',
+                        required = False,
+                        help = "Test name to use with --custom-top")
+
     parser.add_argument('--tolerance',
                         dest = "tolerance",
                         type = float,
@@ -523,5 +590,9 @@ def __init__(self, data):
                         help = "Energy comparison tolerance (default: 0.0 = exact match)")
 
     args = parser.parse_args()
-    
-    START = Init(vars(args))
+    data = vars(args)
+    data['custom_top'] = resolve_path(data['custom_top'])
+    data['custom_fep'] = resolve_path(data['custom_fep'])
+    data['custom_restraints'] = resolve_path(data['custom_restraints'])
+
+    START = Init(data)

From c4d416c4c5dc66c72ef9d9c13c58e260ac7d887a Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 10:11:49 +0200
Subject: [PATCH 03/20] fix prepare data

---
 benchmark-qgpu/benchmark_nsday.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
index d62ee459..253b6b5e 100644
--- a/benchmark-qgpu/benchmark_nsday.py
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -32,6 +32,9 @@
 )
 
 
+RESTART_INIT_STEPS = 1
+
+
 def read_steps_from_md_csv(data_dir):
     md_path = Path(data_dir) / "md.csv"
     if not md_path.exists():
@@ -50,15 +53,25 @@ def default_collect_out(label):
 
 
 def prepare_from_test(args, out_dir):
-    data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
+    init_data = resolve_test_data(args.test, RESTART_INIT_STEPS, args.lambda_name, args.shake)
+    benchmark_data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
     fortran_dir = out_dir / "prepare" / args.test / "fortran"
     prep_dir = out_dir / "prepare" / args.test / "qgpu_prepare"
     fortran_dir.mkdir(parents=True, exist_ok=True)
 
-    print(f"Preparing QGPU input for {args.test} in {out_dir}")
-    write_md_input(data, fortran_dir)
-    prepare_restart_with_qdyn_test(data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir)
-    return prepare_qgpu_input(data, fortran_dir, prep_dir)
+    print(f"Preparing QGPU restart for {args.test} with {RESTART_INIT_STEPS} MD step(s) in {out_dir}")
+    write_md_input(init_data, fortran_dir)
+    prepare_restart_with_qdyn_test(init_data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir)
+
+    print(f"Writing QGPU benchmark input for {args.test} with {args.steps} MD step(s)")
+    write_md_input(benchmark_data, fortran_dir)
+    prepared_data_dir = prepare_qgpu_input(benchmark_data, fortran_dir, prep_dir)
+    prepared_steps = read_steps_from_md_csv(prepared_data_dir)
+    if prepared_steps != args.steps:
+        raise RuntimeError(
+            f"Prepared QGPU input has {prepared_steps} steps, expected {args.steps}: {prepared_data_dir}"
+        )
+    return prepared_data_dir
 
 
 def resolve_collect_data_dir(args, out_dir):

From 6eaf637ccd3dc51bbaf74b4d44a9e6a2ccf28871 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 10:32:57 +0200
Subject: [PATCH 04/20] fix plot png

---
 benchmark-qgpu/benchmark_nsday.py | 65 ++++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
index 253b6b5e..f86708cb 100644
--- a/benchmark-qgpu/benchmark_nsday.py
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -95,10 +95,8 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste
         shutil.rmtree(run_dir)
     run_dir.mkdir(parents=True)
 
-    processes = []
-    process_rows = []
     command_template = None
-    batch_start = time.perf_counter()
+    launch_specs = []
     for index in range(1, concurrency + 1):
         proc_dir = run_dir / f"proc_{index:03d}"
         data_dir = proc_dir / prepared_data_dir.name
@@ -109,20 +107,34 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste
         stderr_path = proc_dir / "qgpu.err"
         args = [str(qgpu_bin), "--gpu", str(data_dir)]
         command_template = command_text([str(qgpu_bin), "--gpu", "<data_dir>"])
-        stdout_f = open(stdout_path, "w", encoding="utf-8")
-        stderr_f = open(stderr_path, "w", encoding="utf-8")
+        launch_specs.append(
+            {
+                "index": index,
+                "args": args,
+                "stdout": stdout_path,
+                "stderr": stderr_path,
+                "command": command_text(args),
+            }
+        )
+
+    processes = []
+    process_rows = []
+    batch_start = time.perf_counter()
+    for spec in launch_specs:
+        stdout_f = open(spec["stdout"], "w", encoding="utf-8")
+        stderr_f = open(spec["stderr"], "w", encoding="utf-8")
         proc_start = time.perf_counter()
-        process = subprocess.Popen(args, cwd=ROOT, stdout=stdout_f, stderr=stderr_f)
+        process = subprocess.Popen(spec["args"], cwd=ROOT, stdout=stdout_f, stderr=stderr_f)
         processes.append(
             {
-                "index": index,
+                "index": spec["index"],
                 "process": process,
                 "stdout_file": stdout_f,
                 "stderr_file": stderr_f,
-                "stdout": stdout_path,
-                "stderr": stderr_path,
+                "stdout": spec["stdout"],
+                "stderr": spec["stderr"],
                 "start": proc_start,
-                "command": command_text(args),
+                "command": spec["command"],
             }
         )
 
@@ -324,22 +336,45 @@ def plot(args):
     )
     palette = ["#1f77b4", "#43a047", "#f57c00", "#7b1fa2", "#00838f"]
     all_points = []
+    all_xs = sorted({x for item in series for x in item["xs"]})
     for index, item in enumerate(series):
         color = palette[index % len(palette)]
         ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"])
         for x, y in zip(item["xs"], item["ys"]):
             all_points.append((y, item["label"], x))
-            ax.text(x, y, f"{y:.1f}", ha="center", va="bottom", fontsize=8, weight="bold", color="#253142")
+            ax.annotate(
+                f"{y:.1f}",
+                xy=(x, y),
+                xytext=(0, 6),
+                textcoords="offset points",
+                ha="center",
+                va="bottom",
+                fontsize=8,
+                weight="bold",
+                color="#253142",
+            )
 
-    ax.set_title(args.title, loc="left", fontsize=13, weight="bold", color="#0f5f18")
-    ax.text(0.0, 1.02, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142")
+    y_values = [point[0] for point in all_points]
+    y_min = min(y_values)
+    y_max = max(y_values)
+    y_span = y_max - y_min
+    y_pad = max(y_span * 0.22, y_max * 0.035, 0.5)
+    ax.set_ylim(max(0, y_min - y_pad * 0.35), y_max + y_pad)
+    ax.set_xticks(all_xs)
+    if len(all_xs) == 1:
+        ax.set_xlim(all_xs[0] - 0.5, all_xs[0] + 0.5)
+    else:
+        ax.set_xlim(all_xs[0] - 0.1, all_xs[-1] + 0.1)
+
+    ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18")
+    ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142")
     ax.set_xlabel("Concurrent Simulations")
     ax.set_ylabel("Throughput (ns/day)")
     ax.grid(axis="y", color="#e3e7ed", linewidth=0.8)
     ax.set_axisbelow(True)
     ax.spines["top"].set_visible(False)
     ax.spines["right"].set_visible(False)
-    ax.legend(frameon=False, loc="upper left", fontsize=8)
+    ax.legend(frameon=False, loc="upper right", fontsize=8)
 
     best_points = sorted(all_points, reverse=True)
     best = best_points[0]
@@ -363,7 +398,7 @@ def plot(args):
         panel.axhline(0.12, xmin=0.12, xmax=0.88, color="#7fbf79", linewidth=0.8)
         panel.text(0.5, 0.05, f"{second[0]:.1f} ns/day", ha="center", va="bottom", fontsize=10, weight="bold", color="#14751c")
 
-    fig.tight_layout()
+    fig.tight_layout(rect=(0, 0, 1, 0.9))
     fig.savefig(out_path, dpi=220)
     plt.close(fig)
     print(f"Plot written to: {out_path}")

From aac3e713ac98b74089150ce4799cef1abf633a56 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 10:41:40 +0200
Subject: [PATCH 05/20] fix plot y-axis label for selected metric

---
 benchmark-qgpu/benchmark_nsday.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
index f86708cb..6a7c37b1 100644
--- a/benchmark-qgpu/benchmark_nsday.py
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -324,6 +324,10 @@ def load_plot_series(csv_paths, metric):
 
 def plot(args):
     metric = args.metric
+    metric_labels = {
+        "total_ns_per_day": "Total Throughput (ns/day)",
+        "mean_process_ns_per_day": "Mean Per-Process Throughput (ns/day)",
+    }
     series = load_plot_series([Path(path).expanduser().resolve() for path in args.csv], metric)
     out_path = Path(args.out).expanduser().resolve()
     out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -369,7 +373,7 @@ def plot(args):
     ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18")
     ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142")
     ax.set_xlabel("Concurrent Simulations")
-    ax.set_ylabel("Throughput (ns/day)")
+    ax.set_ylabel(metric_labels[metric])
     ax.grid(axis="y", color="#e3e7ed", linewidth=0.8)
     ax.set_axisbelow(True)
     ax.spines["top"].set_visible(False)

From cc5700e9a9ebce76f9a45d90e758a5e3e92d8217 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 11:24:14 +0200
Subject: [PATCH 06/20] support GPU-only mode

---
 benchmark-qgpu/benchmark_system_scaling.py | 86 ++++++++++++++++------
 1 file changed, 62 insertions(+), 24 deletions(-)

diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py
index d481244b..7b0ad6bf 100644
--- a/benchmark-qgpu/benchmark_system_scaling.py
+++ b/benchmark-qgpu/benchmark_system_scaling.py
@@ -31,6 +31,9 @@
 )
 
 
+RESTART_INIT_STEPS = 1
+
+
 def default_collect_out():
     stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_system_scaling"
@@ -48,6 +51,12 @@ def successful_times(records):
     return [float(record["wall_seconds"]) for record in records if int(record["return_code"]) == 0]
 
 
+def parse_optional_float(value):
+    if value in (None, ""):
+        return float("nan")
+    return float(value)
+
+
 def write_raw_records(records, out_dir):
     path = out_dir / "system_scaling_raw.csv"
     fieldnames = [
@@ -104,38 +113,51 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg
     fortran_dir.mkdir(parents=True, exist_ok=True)
 
     data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake)
-    print(f"Preparing {test_name}")
-    write_md_input(data, fortran_dir)
+    fortran_records = []
+    fortran_times = []
+
+    if args.gpu_only:
+        init_data = resolve_test_data(test_name, RESTART_INIT_STEPS, args.lambda_name, args.shake)
+        print(f"Preparing QGPU restart for {test_name} with {RESTART_INIT_STEPS} MD step(s)")
+        write_md_input(init_data, fortran_dir)
+        prepare_restart_with_qdyn_test(init_data, prep_fortran_bin, fortran_dir)
+
+        print(f"Writing QGPU benchmark input for {test_name} with {args.steps} MD step(s)")
+        write_md_input(data, fortran_dir)
+    else:
+        print(f"Preparing {test_name}")
+        write_md_input(data, fortran_dir)
+
+        print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))")
+        fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps)
+        if not fortran_ok:
+            return None, fortran_records
+        fortran_times = successful_times(fortran_records)
 
-    print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))")
-    fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps)
-    if not fortran_ok:
-        return None, fortran_records
+        print(f"Preparing QGPU input for {test_name}")
+        prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
 
-    print(f"Preparing QGPU input for {test_name}")
-    prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
     prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
     atoms = count_atoms(prepared_data_dir)
 
     print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
     qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
 
-    fortran_times = successful_times(fortran_records)
     qgpu_times = successful_times(qgpu_records)
-    if not fortran_times or not qgpu_times:
+    if not qgpu_times or (not args.gpu_only and not fortran_times):
         return None, [*fortran_records, *qgpu_records]
 
-    fortran_wall = median(fortran_times)
+    fortran_wall = median(fortran_times) if fortran_times else None
     qgpu_wall = median(qgpu_times)
     row = {
         "test": test_name,
         "atoms": atoms,
         "steps": args.steps,
-        "fortran_wall_median_s": fortran_wall,
+        "fortran_wall_median_s": fortran_wall if fortran_wall is not None else "",
         "qgpu_wall_median_s": qgpu_wall,
-        "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall),
+        "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else "",
         "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall),
-        "speedup_x": fortran_wall / qgpu_wall if qgpu_wall > 0 else "",
+        "speedup_x": fortran_wall / qgpu_wall if fortran_wall is not None and qgpu_wall > 0 else "",
         "fortran_repeats": len(fortran_records),
         "qgpu_repeats": len(qgpu_records),
     }
@@ -145,7 +167,7 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg
 def collect(args):
     out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out()
     out_dir.mkdir(parents=True, exist_ok=True)
-    fortran_bin = resolve_fortran_bin(args.fortran_bin)
+    fortran_bin = None if args.gpu_only else resolve_fortran_bin(args.fortran_bin)
     prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
     qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
 
@@ -166,7 +188,8 @@ def collect(args):
                         "tests": args.test,
                         "steps": args.steps,
                         "repeat": args.repeat,
-                        "fortran_bin": str(fortran_bin),
+                        "gpu_only": args.gpu_only,
+                        "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
                         "prep_fortran_bin": str(prep_fortran_bin),
                         "qgpu_bin": str(qgpu_bin),
                     },
@@ -190,7 +213,8 @@ def collect(args):
             "tests": args.test,
             "steps": args.steps,
             "repeat": args.repeat,
-            "fortran_bin": str(fortran_bin),
+            "gpu_only": args.gpu_only,
+            "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
             "prep_fortran_bin": str(prep_fortran_bin),
             "qgpu_bin": str(qgpu_bin),
         },
@@ -216,7 +240,7 @@ def load_rows(csv_path):
                 "qgpu_ns_per_day",
                 "speedup_x",
             ]:
-                parsed[key] = float(parsed[key])
+                parsed[key] = parse_optional_float(parsed[key])
             rows.append(parsed)
     if not rows:
         raise RuntimeError(f"No rows found in {csv_path}")
@@ -245,6 +269,8 @@ def annotate_bars(ax, bars, formatter):
 
 
 def plot_speedup(rows, out_path, title):
+    if not any(math.isfinite(row["speedup_x"]) for row in rows):
+        raise RuntimeError("speedup plot requires Fortran data. Use --metric nsday for --gpu-only results.")
     labels = [row["test"] for row in rows]
     speedups = [row["speedup_x"] for row in rows]
     atoms = [row["atoms"] for row in rows]
@@ -294,9 +320,13 @@ def plot_nsday(rows, out_path, title):
     fig, ax = plt.subplots(figsize=(8.6, 3.5))
     fortran = [row["fortran_ns_per_day"] for row in rows]
     qgpu = [row["qgpu_ns_per_day"] for row in rows]
-    bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b")
-    bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8")
-    annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}")
+    has_fortran = any(math.isfinite(value) for value in fortran)
+    if has_fortran:
+        bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b")
+        bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8")
+        annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}")
+    else:
+        bars_gpu = ax.bar(x, qgpu, width * 1.55, label="QGPU", color="#0b71c8")
     annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}")
     ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
     ax.set_ylabel("ns/day")
@@ -318,13 +348,16 @@ def plot_nsday(rows, out_path, title):
 def plot_atoms(rows, out_path, title):
     fig, ax = plt.subplots(figsize=(6.5, 3.8))
     xs = [row["atoms"] for row in rows]
-    ys = [row["speedup_x"] for row in rows]
+    has_speedup = any(math.isfinite(row["speedup_x"]) for row in rows)
+    value_key = "speedup_x" if has_speedup else "qgpu_ns_per_day"
+    ys = [row[value_key] for row in rows]
     ax.plot(xs, ys, color="#0b71c8", marker="o", linewidth=1.8)
     for row in rows:
-        ax.text(row["atoms"], row["speedup_x"], f" {row['test']} ({row['speedup_x']:.1f}x)", va="center", fontsize=8)
+        suffix = f"{row['speedup_x']:.1f}x" if has_speedup else f"{row['qgpu_ns_per_day']:.1f} ns/day"
+        ax.text(row["atoms"], row[value_key], f" {row['test']} ({suffix})", va="center", fontsize=8)
     ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
     ax.set_xlabel("Atoms")
-    ax.set_ylabel("Speedup vs Fortran (x)")
+    ax.set_ylabel("Speedup vs Fortran (x)" if has_speedup else "QGPU ns/day")
     ax.grid(True, color="#e5e8ee", linewidth=0.8)
     ax.spines["top"].set_visible(False)
     ax.spines["right"].set_visible(False)
@@ -369,6 +402,11 @@ def parse_args():
     collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
     collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
     collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.")
+    collect_parser.add_argument(
+        "--gpu-only",
+        action="store_true",
+        help="Skip timed Fortran qdyn runs and collect only QGPU performance.",
+    )
     collect_parser.add_argument("--out", help="Output directory.")
     collect_parser.add_argument(
         "--fortran-bin",

From a09f086a82af94effc7b948a2fba13c9d3b79d1e Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 11:36:29 +0200
Subject: [PATCH 07/20] support multiple concurrent QGPU instances

---
 benchmark-qgpu/benchmark_system_scaling.py | 129 ++++++++++++++++++---
 1 file changed, 113 insertions(+), 16 deletions(-)

diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py
index 7b0ad6bf..b76aa130 100644
--- a/benchmark-qgpu/benchmark_system_scaling.py
+++ b/benchmark-qgpu/benchmark_system_scaling.py
@@ -29,6 +29,7 @@
     run_qgpu_repeats,
     write_md_input,
 )
+from benchmark_nsday import run_concurrency_batch
 
 
 RESTART_INIT_STEPS = 1
@@ -78,6 +79,27 @@ def write_raw_records(records, out_dir):
     return path
 
 
+def write_qgpu_concurrency_records(records, out_dir):
+    path = out_dir / "system_scaling_qgpu_concurrency.csv"
+    fieldnames = [
+        "test",
+        "label",
+        "concurrency",
+        "repeat",
+        "steps",
+        "batch_wall_seconds",
+        "total_ns_per_day",
+        "mean_process_ns_per_day",
+        "failed_processes",
+        "command",
+    ]
+    with open(path, "w", newline="", encoding="utf-8") as csv_f:
+        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(records)
+    return path
+
+
 def write_summary(rows, out_dir, metadata):
     summary_csv = out_dir / "system_scaling.csv"
     meta_json = out_dir / "system_scaling_meta.json"
@@ -90,6 +112,7 @@ def write_summary(rows, out_dir, metadata):
         "qgpu_wall_median_s",
         "fortran_ns_per_day",
         "qgpu_ns_per_day",
+        "qgpu_best_concurrency",
         "speedup_x",
         "fortran_repeats",
         "qgpu_repeats",
@@ -105,6 +128,33 @@ def write_summary(rows, out_dir, metadata):
     return summary_csv, meta_json
 
 
+def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir):
+    batch_rows = []
+    process_rows = []
+    for concurrency in args.concurrency:
+        for repeat in range(1, args.repeat + 1):
+            run_dir = qgpu_runs_dir / f"c{concurrency:03d}" / f"repeat_{repeat:03d}"
+            print(f"Running QGPU for {test_name}: concurrency={concurrency}, repeat={repeat}")
+            batch_row, rows = run_concurrency_batch(
+                qgpu_bin=qgpu_bin,
+                prepared_data_dir=prepared_data_dir,
+                run_dir=run_dir,
+                concurrency=concurrency,
+                steps=args.steps,
+                label=test_name,
+                repeat=repeat,
+            )
+            batch_row["test"] = test_name
+            batch_rows.append(batch_row)
+            process_rows.extend(rows)
+            if batch_row["failed_processes"]:
+                raise RuntimeError(
+                    f"{batch_row['failed_processes']} QGPU process(es) failed for {test_name} "
+                    f"at concurrency {concurrency}, repeat {repeat}. Logs are under {run_dir}"
+                )
+    return batch_rows, process_rows
+
+
 def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin):
     test_dir = out_dir / test_name
     fortran_dir = test_dir / "fortran"
@@ -131,7 +181,7 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg
         print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))")
         fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps)
         if not fortran_ok:
-            return None, fortran_records
+            return None, fortran_records, []
         fortran_times = successful_times(fortran_records)
 
         print(f"Preparing QGPU input for {test_name}")
@@ -140,28 +190,53 @@ def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qg
     prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
     atoms = count_atoms(prepared_data_dir)
 
-    print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
-    qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
+    qgpu_concurrency_rows = []
+    if args.concurrency:
+        qgpu_records = []
+        qgpu_concurrency_rows, _ = run_qgpu_concurrency_sweep(
+            args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir
+        )
+    else:
+        print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
+        qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
 
     qgpu_times = successful_times(qgpu_records)
-    if not qgpu_times or (not args.gpu_only and not fortran_times):
-        return None, [*fortran_records, *qgpu_records]
+    if args.concurrency:
+        successful_batches = [row for row in qgpu_concurrency_rows if int(row["failed_processes"]) == 0]
+        if not successful_batches:
+            return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
+        best_qgpu = max(successful_batches, key=lambda row: float(row["total_ns_per_day"]))
+        qgpu_wall = float(best_qgpu["batch_wall_seconds"])
+        qgpu_ns_day = float(best_qgpu["total_ns_per_day"])
+        qgpu_best_concurrency = int(best_qgpu["concurrency"])
+        qgpu_repeat_count = len(qgpu_concurrency_rows)
+    else:
+        if not qgpu_times:
+            return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
+        qgpu_wall = median(qgpu_times)
+        qgpu_ns_day = ns_per_day(args.steps, qgpu_wall)
+        qgpu_best_concurrency = 1
+        qgpu_repeat_count = len(qgpu_records)
+
+    if not args.gpu_only and not fortran_times:
+        return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
 
     fortran_wall = median(fortran_times) if fortran_times else None
-    qgpu_wall = median(qgpu_times)
+    fortran_ns_day = ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else None
     row = {
         "test": test_name,
         "atoms": atoms,
         "steps": args.steps,
         "fortran_wall_median_s": fortran_wall if fortran_wall is not None else "",
         "qgpu_wall_median_s": qgpu_wall,
-        "fortran_ns_per_day": ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else "",
-        "qgpu_ns_per_day": ns_per_day(args.steps, qgpu_wall),
-        "speedup_x": fortran_wall / qgpu_wall if fortran_wall is not None and qgpu_wall > 0 else "",
+        "fortran_ns_per_day": fortran_ns_day if fortran_ns_day is not None else "",
+        "qgpu_ns_per_day": qgpu_ns_day,
+        "qgpu_best_concurrency": qgpu_best_concurrency,
+        "speedup_x": qgpu_ns_day / fortran_ns_day if fortran_ns_day is not None and fortran_ns_day > 0 else "",
         "fortran_repeats": len(fortran_records),
-        "qgpu_repeats": len(qgpu_records),
+        "qgpu_repeats": qgpu_repeat_count,
     }
-    return row, [*fortran_records, *qgpu_records]
+    return row, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
 
 
 def collect(args):
@@ -173,11 +248,17 @@ def collect(args):
 
     rows = []
     raw_records = []
+    qgpu_concurrency_records = []
     try:
         for test_name in args.test:
-            row, records = collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin)
+            row, records, concurrency_records = collect_one_test(
+                args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin
+            )
             raw_records.extend(records)
+            qgpu_concurrency_records.extend(concurrency_records)
             write_raw_records(raw_records, out_dir)
+            if args.concurrency:
+                write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir)
             if row is not None:
                 rows.append(row)
                 write_summary(
@@ -189,6 +270,7 @@ def collect(args):
                         "steps": args.steps,
                         "repeat": args.repeat,
                         "gpu_only": args.gpu_only,
+                        "concurrency": args.concurrency,
                         "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
                         "prep_fortran_bin": str(prep_fortran_bin),
                         "qgpu_bin": str(qgpu_bin),
@@ -196,6 +278,9 @@ def collect(args):
                 )
     finally:
         raw_path = write_raw_records(raw_records, out_dir)
+        concurrency_path = (
+            write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir) if args.concurrency else None
+        )
 
     failures = [record for record in raw_records if int(record["return_code"]) != 0]
     if failures:
@@ -214,6 +299,7 @@ def collect(args):
             "steps": args.steps,
             "repeat": args.repeat,
             "gpu_only": args.gpu_only,
+            "concurrency": args.concurrency,
             "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
             "prep_fortran_bin": str(prep_fortran_bin),
             "qgpu_bin": str(qgpu_bin),
@@ -221,6 +307,8 @@ def collect(args):
     )
     print(f"Summary CSV: {summary_csv}")
     print(f"Raw CSV: {raw_path}")
+    if concurrency_path is not None:
+        print(f"QGPU concurrency CSV: {concurrency_path}")
     print(f"Metadata JSON: {meta_json}")
     return 0
 
@@ -238,9 +326,10 @@ def load_rows(csv_path):
                 "qgpu_wall_median_s",
                 "fortran_ns_per_day",
                 "qgpu_ns_per_day",
+                "qgpu_best_concurrency",
                 "speedup_x",
             ]:
-                parsed[key] = parse_optional_float(parsed[key])
+                parsed[key] = parse_optional_float(parsed.get(key))
             rows.append(parsed)
     if not rows:
         raise RuntimeError(f"No rows found in {csv_path}")
@@ -321,15 +410,17 @@ def plot_nsday(rows, out_path, title):
     fortran = [row["fortran_ns_per_day"] for row in rows]
     qgpu = [row["qgpu_ns_per_day"] for row in rows]
     has_fortran = any(math.isfinite(value) for value in fortran)
+    has_concurrency = any(math.isfinite(row["qgpu_best_concurrency"]) and row["qgpu_best_concurrency"] > 1 for row in rows)
+    qgpu_label = "QGPU best total" if has_concurrency else "QGPU"
     if has_fortran:
         bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b")
-        bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label="QGPU", color="#0b71c8")
+        bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label=qgpu_label, color="#0b71c8")
         annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}")
     else:
-        bars_gpu = ax.bar(x, qgpu, width * 1.55, label="QGPU", color="#0b71c8")
+        bars_gpu = ax.bar(x, qgpu, width * 1.55, label=qgpu_label, color="#0b71c8")
     annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}")
     ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
-    ax.set_ylabel("ns/day")
+    ax.set_ylabel("Best total ns/day" if has_concurrency else "ns/day")
     ax.set_xticks(x)
     ax.set_xticklabels(labels)
     ax.grid(axis="y", color="#e5e8ee", linewidth=0.8)
@@ -402,6 +493,12 @@ def parse_args():
     collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
     collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
     collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.")
+    collect_parser.add_argument(
+        "--concurrency",
+        type=positive_int,
+        nargs="+",
+        help="Concurrent QGPU instance counts to sweep; summary uses the maximum total ns/day.",
+    )
     collect_parser.add_argument(
         "--gpu-only",
         action="store_true",

From 575db10d762d73d6d1eb77795b996f10bb6c4a7c Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 12:52:34 +0200
Subject: [PATCH 08/20] remove intermediate run files by default

---
 benchmark-qgpu/benchmark_system_scaling.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py
index b76aa130..650c3d4c 100644
--- a/benchmark-qgpu/benchmark_system_scaling.py
+++ b/benchmark-qgpu/benchmark_system_scaling.py
@@ -5,6 +5,7 @@
 import json
 import math
 import os
+import shutil
 import sys
 from datetime import datetime
 from pathlib import Path
@@ -128,6 +129,13 @@ def write_summary(rows, out_dir, metadata):
     return summary_csv, meta_json
 
 
+def cleanup_test_artifacts(out_dir, test_name):
+    test_dir = Path(out_dir) / test_name
+    if test_dir.exists():
+        shutil.rmtree(test_dir)
+        print(f"Removed intermediate run data: {test_dir}")
+
+
 def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir):
     batch_rows = []
     process_rows = []
@@ -152,6 +160,8 @@ def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgp
                     f"{batch_row['failed_processes']} QGPU process(es) failed for {test_name} "
                     f"at concurrency {concurrency}, repeat {repeat}. Logs are under {run_dir}"
                 )
+            if not args.keep_run_data:
+                shutil.rmtree(run_dir)
     return batch_rows, process_rows
 
 
@@ -271,11 +281,14 @@ def collect(args):
                         "repeat": args.repeat,
                         "gpu_only": args.gpu_only,
                         "concurrency": args.concurrency,
+                        "keep_run_data": args.keep_run_data,
                         "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
                         "prep_fortran_bin": str(prep_fortran_bin),
                         "qgpu_bin": str(qgpu_bin),
                     },
                 )
+                if not args.keep_run_data:
+                    cleanup_test_artifacts(out_dir, test_name)
     finally:
         raw_path = write_raw_records(raw_records, out_dir)
         concurrency_path = (
@@ -300,6 +313,7 @@ def collect(args):
             "repeat": args.repeat,
             "gpu_only": args.gpu_only,
             "concurrency": args.concurrency,
+            "keep_run_data": args.keep_run_data,
             "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
             "prep_fortran_bin": str(prep_fortran_bin),
             "qgpu_bin": str(qgpu_bin),
@@ -504,6 +518,11 @@ def parse_args():
         action="store_true",
         help="Skip timed Fortran qdyn runs and collect only QGPU performance.",
     )
+    collect_parser.add_argument(
+        "--keep-run-data",
+        action="store_true",
+        help="Keep per-test run directories and logs. By default successful intermediate data is deleted.",
+    )
     collect_parser.add_argument("--out", help="Output directory.")
     collect_parser.add_argument(
         "--fortran-bin",

From 4cf9e0e47bc35f4d81ba7cdeea642bcbce3f5f2e Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 15:25:54 +0200
Subject: [PATCH 09/20] support specifying concurrency

---
 benchmark-qgpu/benchmark_nsday.py       | 22 ++++++++++++++++++----
 benchmark-qgpu/benchmark_report.html.j2 | 10 +++++-----
 benchmark-qgpu/benchmark_report.py      |  6 +++---
 benchmark-qgpu/benchmark_run.py         | 10 +++++++---
 benchmark-qgpu/main.py                  | 20 +++++++++++++++++---
 5 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
index 6a7c37b1..71ddbe9b 100644
--- a/benchmark-qgpu/benchmark_nsday.py
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -346,12 +346,24 @@ def plot(args):
         ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"])
         for x, y in zip(item["xs"], item["ys"]):
             all_points.append((y, item["label"], x))
+            if len(item["xs"]) == 1:
+                x_offset = 0
+                ha = "center"
+            elif x == item["xs"][0]:
+                x_offset = 6
+                ha = "left"
+            elif x == item["xs"][-1]:
+                x_offset = -6
+                ha = "right"
+            else:
+                x_offset = 0
+                ha = "center"
             ax.annotate(
                 f"{y:.1f}",
                 xy=(x, y),
-                xytext=(0, 6),
+                xytext=(x_offset, 6),
                 textcoords="offset points",
-                ha="center",
+                ha=ha,
                 va="bottom",
                 fontsize=8,
                 weight="bold",
@@ -368,7 +380,8 @@ def plot(args):
     if len(all_xs) == 1:
         ax.set_xlim(all_xs[0] - 0.5, all_xs[0] + 0.5)
     else:
-        ax.set_xlim(all_xs[0] - 0.1, all_xs[-1] + 0.1)
+        x_pad = max((all_xs[-1] - all_xs[0]) * 0.06, 0.25)
+        ax.set_xlim(all_xs[0] - x_pad, all_xs[-1] + x_pad)
 
     ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18")
     ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142")
@@ -378,7 +391,8 @@ def plot(args):
     ax.set_axisbelow(True)
     ax.spines["top"].set_visible(False)
     ax.spines["right"].set_visible(False)
-    ax.legend(frameon=False, loc="upper right", fontsize=8)
+    if len(series) > 1:
+        ax.legend(frameon=False, loc="upper left", bbox_to_anchor=(0.0, 1.01), ncols=2, fontsize=8)
 
     best_points = sorted(all_points, reverse=True)
     best = best_points[0]
diff --git a/benchmark-qgpu/benchmark_report.html.j2 b/benchmark-qgpu/benchmark_report.html.j2
index 61c3a634..6cae93c7 100644
--- a/benchmark-qgpu/benchmark_report.html.j2
+++ b/benchmark-qgpu/benchmark_report.html.j2
@@ -23,9 +23,9 @@
   <small>Logs root: {{ logs_root }}</small>
 
   <div class="chart">
-  <h2>Simulation performance (ns/day)</h2>
+  <h2>Simulation throughput (total ns/day)</h2>
   <div id="nsday-chart"></div>
-  <div class="note">Simulated nanoseconds per wall-clock day.</div>
+  <div class="note">Total simulated nanoseconds per wall-clock day across all concurrent processes.</div>
   </div>
 
   <div class="chart">
@@ -209,8 +209,8 @@ function lineChart(containerId, cfg) {
   });
 
   lineChart("nsday-chart", {
-    xLabel: "Processes", yLabel: "ns/day", xs: payload.procs,
-    series: [{ label: "Performance (ns/day)", ys: payload.ns_per_day }]
+    xLabel: "Processes", yLabel: "total ns/day", xs: payload.procs,
+    series: [{ label: "Total throughput (ns/day)", ys: payload.ns_per_day }]
   });
 
 
@@ -233,7 +233,7 @@ function lineChart(containerId, cfg) {
       <th>GPU util mean (%)</th><th>GPU util peak (%)</th>
       <th>VRAM util mean (%)</th><th>VRAM util peak (%)</th>
       <th>Speedup (×)</th>
-      <th>ns/day</th>
+      <th>Total ns/day</th>
     </tr>`;
   table.appendChild(thead);
 
diff --git a/benchmark-qgpu/benchmark_report.py b/benchmark-qgpu/benchmark_report.py
index 2d393147..c5a85fb4 100644
--- a/benchmark-qgpu/benchmark_report.py
+++ b/benchmark-qgpu/benchmark_report.py
@@ -85,6 +85,7 @@ def fget(k):
         mem_means = [v["mem_util_mean"] for v in vals if math.isfinite(v["mem_util_mean"])]
         mem_peaks = [v["mem_util_peak"] for v in vals if math.isfinite(v["mem_util_peak"])]
         tmp_ns_per_day = [v["ns_per_day"] for v in vals if math.isfinite(v["ns_per_day"])]
+        total_ns_per_day = statistics.mean(tmp_ns_per_day) * p if tmp_ns_per_day else float("nan")
 
         
         rc_bad = sum(1 for v in vals if v["rc"] != 0)
@@ -99,7 +100,7 @@ def fget(k):
         gpu_util_peak.append(statistics.mean(util_peak) if util_peak else float("nan"))
         util_mem_mean.append(statistics.mean(mem_means) if mem_means else float("nan"))
         util_mem_peak.append(statistics.mean(mem_peaks) if mem_peaks else float("nan"))
-        ns_per_day.append(statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan"))
+        ns_per_day.append(total_ns_per_day)
         
         
         Tn = max(walls) if walls else float("nan")
@@ -126,7 +127,7 @@ def fget(k):
             "vram_util_peak": statistics.mean(mem_peaks) if mem_peaks else float("nan"),
             "Tn": Tn,
             "speedup": speedup,
-            "ns_per_day": statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan"),
+            "ns_per_day": total_ns_per_day,
         })
 
 
@@ -156,4 +157,3 @@ def fget(k):
     with open(out_html, "w", encoding="utf-8") as f:
         f.write(html_out)
     print(f"Report written to: {out_html}")
-
diff --git a/benchmark-qgpu/benchmark_run.py b/benchmark-qgpu/benchmark_run.py
index b119423b..7e0c396d 100644
--- a/benchmark-qgpu/benchmark_run.py
+++ b/benchmark-qgpu/benchmark_run.py
@@ -350,7 +350,12 @@ def _get(d, dotted, default=None):
 def run(args):
     data_dir = os.path.expanduser(args.data_dir)   # e.g., TEST/water
     bin_path = os.path.expanduser(args.bin)        # e.g., /path/to/qdyn
-    max_procs = int(args.max_processes)
+    if getattr(args, "concurrency", None):
+        concurrency = sorted(dict.fromkeys(int(value) for value in args.concurrency))
+    elif args.max_processes is not None:
+        concurrency = list(range(1, int(args.max_processes) + 1))
+    else:
+        raise ValueError("Pass --concurrency or --max_processes.")
 
     if not os.path.isdir(data_dir):
         raise FileNotFoundError(f"data_dir not found: {data_dir}")
@@ -386,7 +391,7 @@ def run(args):
     os.makedirs(logs_dir, exist_ok=True)
     work(1, logs_dir, f'"{bin_path}" "{data_dir}"', steps)
     
-    for process_num in range(1, max_procs + 1):
+    for process_num in concurrency:
         print(f"Will run {process_num} processes in parallel:")
         logs_dir = os.path.join(current_dir, f"benchmark_logs/{process_num:02d}_procs")
         os.makedirs(logs_dir, exist_ok=True)
@@ -405,4 +410,3 @@ def run(args):
     # generate report
     out_html = os.path.join(current_dir, "benchmark_report.html")
     make_html_report(logs_root, out_html)
-
diff --git a/benchmark-qgpu/main.py b/benchmark-qgpu/main.py
index f379f021..4d4b6f96 100644
--- a/benchmark-qgpu/main.py
+++ b/benchmark-qgpu/main.py
@@ -1,5 +1,11 @@
 import argparse
-from benchmark_run import run 
+
+
+def positive_int(value):  # argparse ``type=`` callable: accepts integers >= 1 only
+    number = int(value)
+    if number >= 1:
+        return number
+    raise argparse.ArgumentTypeError("must be >= 1")
     
 
 if __name__ == "__main__":
@@ -7,8 +13,16 @@
 
     parser.add_argument('--data_dir', type=str, help='Directory containing a single test case.')
     parser.add_argument('--bin', type=str, help='Path to the Qdyn GPU executable.')
-    parser.add_argument('--max_processes', type=int, help='Max number of parallel processes to run.')
+    parser.add_argument('--max_processes', type=positive_int, help='Max number of parallel processes to run.')
+    parser.add_argument(
+        '--concurrency',
+        type=positive_int,
+        nargs='+',
+        help='Specific parallel process counts to run, e.g. --concurrency 1 2 3 4 5 10 15 20.',
+    )
 
     args = parser.parse_args()
 
-    run(args)
\ No newline at end of file
+    from benchmark_run import run
+
+    run(args)

From 1db80d76eab89e6e0229911ef4a302b66e47faaa Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 15:53:48 +0200
Subject: [PATCH 10/20] support reusing existing Fortran reference files

---
 benchmark-qgpu/benchmark_correctness.py | 69 ++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py
index 07f25046..d30f321d 100644
--- a/benchmark-qgpu/benchmark_correctness.py
+++ b/benchmark-qgpu/benchmark_correctness.py
@@ -83,6 +83,38 @@ def load_fortran_energy(fortran_dir):
         return json.load(json_f), q_data_path
 
 
+def find_prepared_qgpu_dir(reference_dir):  # -> the single dir under qgpu_prepare/TEST that holds md.csv
+    prepare_root = Path(reference_dir) / "qgpu_prepare" / "TEST"
+    if not prepare_root.is_dir():
+        raise FileNotFoundError(f"Prepared QGPU TEST directory not found: {prepare_root}")
+    candidates = sorted(path for path in prepare_root.iterdir() if path.is_dir() and (path / "md.csv").exists())
+    if len(candidates) != 1:
+        shown = ", ".join(str(path) for path in candidates) or "none"  # never print an empty "found:" list
+        raise RuntimeError(f"Expected exactly one prepared QGPU directory under {prepare_root}; found: {shown}")
+    return candidates[0]
+
+
+def copy_reference_inputs(reference_dir, out_dir):
+    """Copy fortran_reference/ and the prepared QGPU case from reference_dir into out_dir, replacing old copies."""
+    reference_dir = Path(reference_dir).expanduser().resolve()
+    source_fortran_dir = reference_dir / "fortran_reference"
+    if not (source_fortran_dir / "Q_data.json").exists():
+        raise FileNotFoundError(f"Fortran reference Q_data.json not found: {source_fortran_dir / 'Q_data.json'}")
+    # The destinations are wiped below, so copying a directory onto itself would delete the reference data.
+    if Path(out_dir).expanduser().resolve() == reference_dir:
+        raise ValueError(f"Output directory must differ from the reference directory: {reference_dir}")
+    source_prepared_dir = find_prepared_qgpu_dir(reference_dir)
+    fortran_dir = out_dir / "fortran_reference"
+    prep_dir = out_dir / "qgpu_prepare"
+    prepared_data_dir = prep_dir / "TEST" / source_prepared_dir.name
+    for stale_dir in (fortran_dir, prep_dir):
+        if stale_dir.exists():
+            shutil.rmtree(stale_dir)
+    shutil.copytree(source_fortran_dir, fortran_dir)
+    shutil.copytree(source_prepared_dir, prepared_data_dir)
+    return fortran_dir, prep_dir, prepared_data_dir, reference_dir
+
+
 def build_correctness_rows(fortran_data, qgpu_data, tolerance):
     compare.ENERGY_TOLERANCE = tolerance
     rows = []
@@ -182,20 +214,27 @@ def collect(args):
     out_dir.mkdir(parents=True, exist_ok=True)
 
     qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
-    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
-    data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
-
-    fortran_dir = out_dir / "fortran_reference"
-    prep_dir = out_dir / "qgpu_prepare"
     qgpu_run_dir = out_dir / "qgpu_run"
-    fortran_dir.mkdir(parents=True, exist_ok=True)
 
-    print(f"Preparing Fortran reference for {args.test}")
-    write_md_input(data, fortran_dir)
-    prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+    if args.reference_dir:
+        print(f"Reusing Fortran/QGPU prepared reference from {args.reference_dir}")
+        fortran_dir, prep_dir, prepared_data_dir, reference_dir = copy_reference_inputs(args.reference_dir, out_dir)
+        prep_fortran_bin = None
+    else:
+        prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
+        data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
+
+        fortran_dir = out_dir / "fortran_reference"
+        prep_dir = out_dir / "qgpu_prepare"
+        fortran_dir.mkdir(parents=True, exist_ok=True)
 
-    print("Preparing QGPU input")
-    prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
+        print(f"Preparing Fortran reference for {args.test}")
+        write_md_input(data, fortran_dir)
+        prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+
+        print("Preparing QGPU input")
+        prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
+        reference_dir = None
 
     print("Running QGPU correctness simulation")
     qgpu_data_dir, qgpu_run = run_qgpu_once(qgpu_bin, prepared_data_dir, qgpu_run_dir)
@@ -215,7 +254,9 @@ def collect(args):
             "lambda": args.lambda_name,
             "shake": args.shake,
             "qgpu_bin": str(qgpu_bin),
-            "prep_fortran_bin": str(prep_fortran_bin),
+            "prep_fortran_bin": str(prep_fortran_bin) if prep_fortran_bin is not None else None,
+            "reference_dir": str(reference_dir) if reference_dir is not None else None,
+            "prepared_qgpu_input": str(prepared_data_dir),
             "fortran_energy": str(fortran_energy_path),
             "qgpu_energy": str(qgpu_energy_path),
             "qgpu_run": qgpu_run,
@@ -364,6 +405,10 @@ def parse_args():
     collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
     collect_parser.add_argument("--out", help="Output directory.")
     collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")
+    collect_parser.add_argument(
+        "--reference-dir",
+        help="Existing correctness result directory containing fortran_reference/ and qgpu_prepare/ to reuse.",
+    )
     collect_parser.add_argument(
         "--prep-fortran-bin",
         default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),

From a73e56a112ff821e3dd29a9bad126caa4b0efd3e Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Tue, 28 Apr 2026 15:56:07 +0200
Subject: [PATCH 11/20] support spfp

---
 src/core/Makefile | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/core/Makefile b/src/core/Makefile
index 367be528..0b607d06 100644
--- a/src/core/Makefile
+++ b/src/core/Makefile
@@ -1,12 +1,22 @@
 CC = nvcc
-CFLAGS = -O3 -std=c++17 -arch=sm_89 -I./cuda/include -I./common/include -I./cpu/include -I.
+CFLAGS = -O3 -std=c++17 -arch=sm_86 -I./cuda/include -I./common/include -I./cpu/include -I.
 DEPFLAGS = -MMD -MF $(@:.o=.d)
+QDYN_SPFP ?= 0
+
+ifneq ($(filter 1 true TRUE yes YES on ON,$(QDYN_SPFP)),)
+CFLAGS += -DQDYN_SPFP
+PRECISION = spfp
+else
+PRECISION = dpfp
+endif
+
+BUILD_DIR = build/$(PRECISION)
 
 # collect all .cu files except main.cu
 SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu))
 CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp)
-MAIN_OBJ = main.o
-OBJS = $(SRCS:.cu=.o) $(CPPSRCS:.cpp=.o)
+MAIN_OBJ = $(BUILD_DIR)/main.cu.o
+OBJS = $(addprefix $(BUILD_DIR)/,$(SRCS:.cu=.cu.o)) $(addprefix $(BUILD_DIR)/,$(CPPSRCS:.cpp=.cpp.o))
 DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d)
 
 all: qdyn move
@@ -14,13 +24,16 @@ all: qdyn move
 qdyn: $(MAIN_OBJ) $(OBJS)
 	$(CC) $(CFLAGS) -o $@ $^
 
-%.o: %.cu
+$(BUILD_DIR)/%.cu.o: %.cu
+	mkdir -p $(dir $@)
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
-%.o: %.cpp
+$(BUILD_DIR)/%.cpp.o: %.cpp
+	mkdir -p $(dir $@)
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
 clean:
+	rm -rf build
 	rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o cpu/src/*.d ../../bin/qdyn
 
 move:

From bc0cb51b066cd38ec2a4ae0796634d68a502be38 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Wed, 29 Apr 2026 16:35:23 +0200
Subject: [PATCH 12/20] support mpi

---
 benchmark-qgpu/benchmark_test.py | 191 ++++++++++++++++++++++++++++---
 1 file changed, 172 insertions(+), 19 deletions(-)

diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py
index a30695f0..afefcfbc 100644
--- a/benchmark-qgpu/benchmark_test.py
+++ b/benchmark-qgpu/benchmark_test.py
@@ -52,6 +52,21 @@ def command_text(args):
     return " ".join(shlex.quote(str(arg)) for arg in args)
 
 
+def split_mpirun_args(args):
+    """Normalise extra launcher arguments (None, one shell-style string, or a sequence) to a list of str."""
+    if isinstance(args, str):
+        return shlex.split(args)
+    nothing_given = args is None
+    return [] if nothing_given else [str(arg) for arg in args]
+
+
+def build_fortran_command(fortran_bin, input_file, mpi_procs=None, mpirun_bin="mpirun", mpirun_args=None):
+    # Serial runs invoke the binary directly; MPI runs prepend "<mpirun_bin> -np <mpi_procs> <extra args>".
+    launcher = [] if mpi_procs is None else [str(mpirun_bin), "-np", str(mpi_procs)]
+    extra = [] if mpi_procs is None else split_mpirun_args(mpirun_args)
+    return launcher + extra + [str(fortran_bin), input_file]
+
+
 def resolve_qgpu_bin(path):
     if path:
         candidate = Path(path).expanduser()
@@ -149,7 +164,16 @@ def write_md_input(data, fortran_dir):
         runTEST.create_MD_input(data)
 
 
-def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps):
+def run_fortran_repeats(
+    data,
+    fortran_bin,
+    fortran_dir,
+    repeat,
+    steps,
+    mpi_procs=None,
+    mpirun_bin="mpirun",
+    mpirun_args=None,
+):
     records = []
     saw_success = False
 
@@ -158,7 +182,13 @@ def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps):
         stderr_name = "fortran.err" if repeat == 1 else f"fortran_{index}.err"
         stdout_path = fortran_dir / stdout_name
         stderr_path = fortran_dir / stderr_name
-        args = [str(fortran_bin), "eq1.inp"]
+        args = build_fortran_command(
+            fortran_bin,
+            "eq1.inp",
+            mpi_procs=mpi_procs,
+            mpirun_bin=mpirun_bin,
+            mpirun_args=mpirun_args,
+        )
         return_code, wall_seconds = run_timed(args, fortran_dir, stdout_path, stderr_path)
         if return_code == 0:
             saw_success = True
@@ -180,20 +210,33 @@ def run_fortran_repeats(data, fortran_bin, fortran_dir, repeat, steps):
     return records, saw_success
 
 
-def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir):
+def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_steps=None):
+    input_path = fortran_dir / "eq1.inp"
+    original_input = input_path.read_text(encoding="utf-8")
+    parse_data = data
+    if prep_steps is not None:
+        prep_data = dict(data)
+        prep_data["timestep"] = str(prep_steps)
+        write_md_input(prep_data, fortran_dir)
+        parse_data = prep_data
+
     stdout_path = fortran_dir / "restart_prep_qdyn_test.log"
     stderr_path = fortran_dir / "restart_prep_qdyn_test.err"
     args = [str(prep_fortran_bin), "eq1.inp"]
-    return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path)
-    if return_code != 0:
-        raise RuntimeError(
-            "QGPU restart preparation failed. "
-            f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}"
-        )
+    try:
+        return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path)
+        if return_code != 0:
+            raise RuntimeError(
+                "QGPU restart preparation failed. "
+                f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}"
+            )
 
-    shutil.copyfile(stdout_path, fortran_dir / "eq1.log")
-    with pushd(fortran_dir):
-        runTEST.Parse_Q6_data(data)
+        shutil.copyfile(stdout_path, fortran_dir / "eq1.log")
+        with pushd(fortran_dir):
+            runTEST.Parse_Q6_data(parse_data)
+    finally:
+        if prep_steps is not None:
+            input_path.write_text(original_input, encoding="utf-8")
 
 
 def prepare_qgpu_input(data, fortran_dir, prep_dir):
@@ -277,6 +320,23 @@ def write_summary_csv(records, out_dir):
     return csv_path
 
 
+def read_summary_csv(csv_path):
+    """Load summary.csv into a list of row dicts with the numeric columns converted."""
+    records = []
+    with open(csv_path, newline="", encoding="utf-8") as csv_f:
+        for raw_row in csv.DictReader(csv_f):
+            record = dict(raw_row)
+            record.update({column: int(record[column]) for column in ("repeat", "return_code")})
+            record["wall_seconds"] = float(record["wall_seconds"])
+            record["steps"] = int(record["steps"])
+            # An empty or missing ns_per_day cell is kept as None rather than parsed.
+            record["ns_per_day"] = float(record["ns_per_day"]) if record.get("ns_per_day") else None
+            records.append(record)
+    if records:
+        return records
+    raise RuntimeError(f"No records found in {csv_path}")
+
+
 def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):
     by_test = {}
     for record in records:
@@ -286,8 +346,8 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):
     for test_name in sorted(by_test):
         fortran_records = by_test[test_name].get("fortran", [])
         qgpu_records = by_test[test_name].get("qgpu", [])
-        fortran_ok = [r["wall_seconds"] for r in fortran_records if r["return_code"] == 0]
-        qgpu_ok = [r["wall_seconds"] for r in qgpu_records if r["return_code"] == 0]
+        fortran_ok = [float(r["wall_seconds"]) for r in fortran_records if int(r["return_code"]) == 0]
+        qgpu_ok = [float(r["wall_seconds"]) for r in qgpu_records if int(r["return_code"]) == 0]
         if not fortran_ok or not qgpu_ok:
             continue
         fortran_median = median(fortran_ok)
@@ -313,6 +373,10 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):
             "lambda": args.lambda_name,
             "shake": args.shake,
             "repeat": args.repeat,
+            "restart_prep_steps": getattr(args, "restart_prep_steps", None),
+            "fortran_mpi_procs": getattr(args, "fortran_mpi_procs", None),
+            "mpirun_bin": getattr(args, "mpirun_bin", None),
+            "mpirun_args": getattr(args, "mpirun_args", None),
         },
         "binaries": {
             "fortran": str(fortran_bin),
@@ -323,6 +387,17 @@ def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):
     }
 
 
+def summarize_for_plot(records):
+    """Summarise rows read back from a CSV; lambda, shake and repeat are passed to summarize() as None."""
+    test_names = sorted({record["test"] for record in records})
+    step_counts = sorted({int(record["steps"]) for record in records})
+    plot_args = argparse.Namespace(
+        test=test_names, steps=step_counts, lambda_name=None, shake=None, repeat=None
+    )
+    source_label = "<from summary.csv>"
+    return summarize(records, plot_args, qgpu_bin=source_label, fortran_bin=source_label, prep_fortran_bin="")
+
+
 def write_summary_json(summary, out_dir):
     json_path = out_dir / "summary.json"
     with open(json_path, "w", encoding="utf-8") as json_f:
@@ -402,6 +477,28 @@ def plot_speedup(summary, out_dir):
     return png_path
 
 
+def plot_summary_csv(args):
+    """Render the speedup plot for an existing summary.csv; ``args.out`` optionally names the PNG file."""
+    csv_path = Path(args.csv).expanduser().resolve()
+    records = read_summary_csv(csv_path)
+    summary = summarize_for_plot(records)
+    out_path = Path(args.out).expanduser().resolve() if args.out else None
+    out_dir = csv_path.parent if out_path is None else out_path.parent
+    if out_path is not None:
+        out_dir.mkdir(parents=True, exist_ok=True)
+    png_path = plot_speedup(summary, out_dir)
+    # Check for "nothing to plot" before touching the result, so the intended error is
+    # raised and an existing file at the requested output path is never deleted.
+    if png_path is None:
+        raise RuntimeError("No successful Fortran/QGPU pairs found to plot.")
+    if out_path is not None and png_path != out_path:
+        # Path.replace overwrites an existing destination in one step.
+        png_path.replace(out_path)
+        png_path = out_path
+    print(f"Speedup plot: {png_path}")
+    return 0
+
+
 def default_out_dir(test_names):
     stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     label = test_names[0] if len(test_names) == 1 else "multi"
@@ -409,6 +506,13 @@ def default_out_dir(test_names):
 
 
 def parse_args():
+    if len(sys.argv) > 1 and sys.argv[1] == "plot":
+        parser = argparse.ArgumentParser(description="Plot benchmark_test.py speedup from an existing summary.csv.")
+        parser.add_argument("command", choices=["plot"])
+        parser.add_argument("csv", help="summary.csv written by benchmark_test.py.")
+        parser.add_argument("--out", help="Output PNG path. Defaults to speedup.png next to the CSV.")
+        return parser.parse_args()
+
     parser = argparse.ArgumentParser(description="Benchmark Fortran vs QGPU for runTEST.py test cases.")
     parser.add_argument("--test", nargs="+", help="Test name(s) from test/runTEST.py.")
     parser.add_argument("--list-tests", action="store_true", help="List available tests and exit.")
@@ -417,10 +521,32 @@ def parse_args():
     parser.add_argument("--shake", action="store_true", help="Enable shake in generated MD input.")
     parser.add_argument("--repeat", type=int, default=1, help="Number of repeats for each runner.")
     parser.add_argument("--out", default=None, help="Output directory.")
+    parser.add_argument(
+        "--restart-prep-steps",
+        type=int,
+        default=1,
+        help="MD steps used only for qdyn_test restart preparation. Defaults to 1.",
+    )
     parser.add_argument(
         "--fortran-bin",
         default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"),
-        help="Path to production Fortran qdyn binary used for timed Fortran runs.",
+        help="Path to production Fortran qdyn/qdynp binary used for timed Fortran runs.",
+    )
+    parser.add_argument(
+        "--fortran-mpi-procs",
+        type=int,
+        default=None,
+        help="Run the timed Fortran binary through mpirun with this many MPI ranks.",
+    )
+    parser.add_argument(
+        "--mpirun-bin",
+        default="mpirun",
+        help="MPI launcher to use with --fortran-mpi-procs. Defaults to mpirun.",
+    )
+    parser.add_argument(
+        "--mpirun-args",
+        default=None,
+        help='Extra MPI launcher arguments, quoted as one string, e.g. "--bind-to core".',
     )
     parser.add_argument(
         "--prep-fortran-bin",
@@ -432,6 +558,8 @@ def parse_args():
 
 
 def validate_args(args):
+    if getattr(args, "command", None) == "plot":
+        return
     if args.list_tests:
         return
     if not args.test:
@@ -442,12 +570,19 @@ def validate_args(args):
         raise SystemExit("--steps must be >= 1.")
     if args.repeat < 1:
         raise SystemExit("--repeat must be >= 1.")
+    if args.restart_prep_steps < 1:
+        raise SystemExit("--restart-prep-steps must be >= 1.")
+    if args.fortran_mpi_procs is not None and args.fortran_mpi_procs < 1:
+        raise SystemExit("--fortran-mpi-procs must be >= 1.")
 
 
 def main():
     args = parse_args()
     validate_args(args)
 
+    if getattr(args, "command", None) == "plot":
+        return plot_summary_csv(args)
+
     testinfo = runTEST.get_default_testinfo()
     if args.list_tests:
         for test_name in sorted(testinfo):
@@ -473,16 +608,34 @@ def main():
             print(f"Preparing Fortran input for {test_name} in {fortran_dir}")
             write_md_input(data, fortran_dir)
 
-            print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))")
+            if args.fortran_mpi_procs is None:
+                print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))")
+            else:
+                print(
+                    f"Running Fortran for {test_name} with {args.fortran_mpi_procs} MPI rank(s) "
+                    f"({args.repeat} repeat(s))"
+                )
             fortran_records, fortran_ok = run_fortran_repeats(
-                data, fortran_bin, fortran_dir, args.repeat, args.steps
+                data,
+                fortran_bin,
+                fortran_dir,
+                args.repeat,
+                args.steps,
+                mpi_procs=args.fortran_mpi_procs,
+                mpirun_bin=args.mpirun_bin,
+                mpirun_args=args.mpirun_args,
             )
             all_records.extend(fortran_records)
             if not fortran_ok:
                 continue
 
-            print(f"Preparing QGPU restart with qdyn_test for {test_name}")
-            prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+            print(f"Preparing QGPU restart with qdyn_test for {test_name} ({args.restart_prep_steps} step(s))")
+            prepare_restart_with_qdyn_test(
+                data,
+                prep_fortran_bin,
+                fortran_dir,
+                prep_steps=args.restart_prep_steps,
+            )
 
             print(f"Preparing QGPU CSV input for {test_name}")
             prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)

From e95b427fbb87b9323aa5ee31efd6c73af23d64fb Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Wed, 29 Apr 2026 17:12:23 +0200
Subject: [PATCH 13/20] support mixed precision

---
 src/core/common/include/precision.h          |  6 ++
 src/core/common/src/handler.cpp              |  5 ++
 src/core/cuda/src/cuda_improper2_force.cu    |  4 +-
 src/core/cuda/src/cuda_leapfrog.cu           | 35 +++++----
 src/core/cuda/src/cuda_nonbonded_14_force.cu | 74 +++++++++++++-------
 src/core/cuda/src/cuda_nonbonded_force.cu    | 64 ++++++++++-------
 src/core/cuda/src/cuda_polx_water_force.cu   | 13 ++--
 src/core/cuda/src/cuda_pshell_force.cu       |  2 +-
 src/core/cuda/src/cuda_radix_water_force.cu  |  9 ++-
 src/core/cuda/src/cuda_restrang_force.cu     |  6 +-
 src/core/cuda/src/cuda_restrdis_force.cu     |  4 +-
 src/core/cuda/src/cuda_restrpos_force.cu     |  6 +-
 src/core/cuda/src/cuda_restrseq_force.cu     |  6 +-
 src/core/cuda/src/cuda_restrwall_force.cu    |  4 +-
 src/core/cuda/src/cuda_shake_constraints.cu  |  5 +-
 src/core/cuda/src/cuda_temperature.cu        |  5 +-
 src/core/cuda/src/cuda_torsion_force.cu      |  4 +-
 17 files changed, 155 insertions(+), 97 deletions(-)

diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
index f15fc6ca..fc633f45 100644
--- a/src/core/common/include/precision.h
+++ b/src/core/common/include/precision.h
@@ -2,6 +2,12 @@
 
 #ifdef QDYN_SPFP
 using real_t = float;
+using nonbond_work_t = float;
 #else
 using real_t = double;
+using nonbond_work_t = double;
 #endif
+
+using energy_accum_t = double;
+using force_accum_t = double;
+using constraint_work_t = double;
diff --git a/src/core/common/src/handler.cpp b/src/core/common/src/handler.cpp
index 3fdd1341..b462b2c7 100644
--- a/src/core/common/src/handler.cpp
+++ b/src/core/common/src/handler.cpp
@@ -88,6 +88,11 @@ void Handler::update_energy_totals() {
 }
 
 void Handler::print_outputs(int iteration) {
+    auto& host = Context::instance();
+    if (host.run_gpu && host.md.trajectory != 0 && iteration % host.md.trajectory == 0) {
+        host.coords->download();
+        host.velocities->download();
+    }
     print_energies();
     write_coords(iteration);
     write_velocities(iteration);
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index e44678e0..78707b12 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -51,8 +51,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
     rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z;
     rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x;
 
-    bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2));
-    bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2));
+    bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z);
+    bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z);
     bjinv = sqrt(bj2inv);
     bkinv = sqrt(bk2inv);
 
diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu
index 49312337..1e010f7e 100644
--- a/src/core/cuda/src/cuda_leapfrog.cu
+++ b/src/core/cuda/src/cuda_leapfrog.cu
@@ -45,6 +45,20 @@ __global__ void calc_leapfrog_kernel(
     coords[i].z += velocities[i].z * dt;
 }
 
+__global__ void update_velocities_from_positions_kernel(
+    vel_t* velocities,
+    const coord_t* coords,
+    const coord_t* xcoords,
+    int n_atoms,
+    double dt) {
+    const int atom = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per atom; tail threads do nothing
+    if (atom < n_atoms) {
+        velocities[atom].x = (coords[atom].x - xcoords[atom].x) / dt;
+        velocities[atom].y = (coords[atom].y - xcoords[atom].y) / dt;
+        velocities[atom].z = (coords[atom].z - xcoords[atom].z) / dt;
+    }
+}
+
 void calc_leapfrog_host() {
     auto& host = Context::instance();
     auto d_atypes = host.atypes->gpu_data_p;
@@ -70,24 +84,17 @@ void calc_leapfrog_host() {
         host.dt);
     check_cuda(cudaDeviceSynchronize());
 
-    host.velocities->download();
-    host.dvelocities->download();
-    host.coords->download();
-    host.xcoords->download();
-
     // shake
-    // todo: Here is some problem, it writes into cpu memory, but we use gpu..
     printf("n_shake_constraints: %d\n", host.n_shake_constraints);
     if (host.n_shake_constraints > 0) {
         calc_shake_constraints_host();
-        auto &velocities = host.velocities->cpu_data_p;
-        auto &coords = host.coords->cpu_data_p;
-        auto *xcoords = host.xcoords->cpu_data_p;
-        for (int i = 0; i < host.n_atoms; i++) {
-            velocities[i].x = (coords[i].x - xcoords[i].x) / host.dt;
-            velocities[i].y = (coords[i].y - xcoords[i].y) / host.dt;
-            velocities[i].z = (coords[i].z - xcoords[i].z) / host.dt;
-        }
+        update_velocities_from_positions_kernel<<<numBlocks, blockSize>>>(
+            d_velocities,
+            d_coords,
+            d_xcoords,
+            host.n_atoms,
+            host.dt);
+        check_cuda(cudaDeviceSynchronize());
     }
 }
 
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index a33bb695..78c4bc91 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -12,6 +12,14 @@ int* d_atom_to_qi = nullptr;
 double* d_evdw_totals = nullptr;
 double* d_ecoul_totals = nullptr;
 
+__device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) {
+#ifdef QDYN_SPFP  // nonbond_work_t is float in this build (see precision.h)
+    return rsqrtf(value);  // single-precision reciprocal square root
+#else  // nonbond_work_t is double
+    return rsqrt(value);  // double-precision reciprocal square root
+#endif
+}
+
 __device__ __forceinline__ int unified_parameter_index(
     int atom_idx,
     int state,
@@ -35,33 +43,47 @@ __device__ void calculate_nonbonded_14_pair(
     real_t y_aii,
     real_t x_bii,
     real_t y_bii,
-    double coulomb_constant,
-    double scaling,
+    nonbond_work_t coulomb_constant,
+    nonbond_work_t scaling,
     int vdw_rule,
-    double lambda,
-    double& evdw,
-    double& ecoul,
-    double& dv) {
-    const real_t dx = x.x - y.x;
-    const real_t dy = x.y - y.y;
-    const real_t dz = x.z - y.z;
-    const real_t r = rsqrt(dx * dx + dy * dy + dz * dz);
-    const real_t r2 = r * r;
-    const real_t r6 = r2 * r2 * r2;
+    nonbond_work_t lambda,
+    nonbond_work_t& evdw,
+    nonbond_work_t& ecoul,
+    nonbond_work_t& dv) {
+    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
+    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
+    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
+    const nonbond_work_t r = nonbond14_rsqrt(dx * dx + dy * dy + dz * dz);
+    const nonbond_work_t r2 = r * r;
+    const nonbond_work_t r6 = r2 * r2 * r2;
 
     ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda;
 
-    real_t v_a = 0.0;
-    real_t v_b = 0.0;
+    nonbond_work_t v_a = 0.0;
+    nonbond_work_t v_b = 0.0;
     if (vdw_rule == VDW_GEOMETRIC) {
-        calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b);
+        calc_vdw_geometric(
+            static_cast<nonbond_work_t>(x_aii),
+            static_cast<nonbond_work_t>(y_aii),
+            static_cast<nonbond_work_t>(x_bii),
+            static_cast<nonbond_work_t>(y_bii),
+            r6,
+            &v_a,
+            &v_b);
     } else {
-        calc_vdw_arithmetic(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b);
+        calc_vdw_arithmetic(
+            static_cast<nonbond_work_t>(x_aii),
+            static_cast<nonbond_work_t>(y_aii),
+            static_cast<nonbond_work_t>(x_bii),
+            static_cast<nonbond_work_t>(y_bii),
+            r6,
+            &v_a,
+            &v_b);
     }
     v_a *= lambda;
     v_b *= lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
 }
 
 __global__ void calc_nonbonded_14_force_kernel(
@@ -104,10 +126,10 @@ __global__ void calc_nonbonded_14_force_kernel(
     const coord_t ri = d_coords[ai];
     const coord_t rj = d_coords[aj];
 
-    double evdw = 0.0;
-    double ecoul = 0.0;
-    double dv = 0.0;
-    const double pair_lambda = (mode == NONBONDED_14_PP) ? 1.0 : lambda;
+    nonbond_work_t evdw = 0.0;
+    nonbond_work_t ecoul = 0.0;
+    nonbond_work_t dv = 0.0;
+    const nonbond_work_t pair_lambda = static_cast<nonbond_work_t>((mode == NONBONDED_14_PP) ? 1.0 : lambda);
 
     calculate_nonbonded_14_pair(
         ri,
@@ -118,17 +140,17 @@ __global__ void calc_nonbonded_14_force_kernel(
         aj_type.aii_1_4,
         ai_type.bii_1_4,
         aj_type.bii_1_4,
-        d_topo.coulomb_constant,
-        d_topo.el14_scale,
+        static_cast<nonbond_work_t>(d_topo.coulomb_constant),
+        static_cast<nonbond_work_t>(d_topo.el14_scale),
         d_topo.vdw_rule,
         pair_lambda,
         evdw,
         ecoul,
         dv);
 
-    const real_t dx = rj.x - ri.x;
-    const real_t dy = rj.y - ri.y;
-    const real_t dz = rj.z - ri.z;
+    const nonbond_work_t dx = static_cast<nonbond_work_t>(rj.x - ri.x);
+    const nonbond_work_t dy = static_cast<nonbond_work_t>(rj.y - ri.y);
+    const nonbond_work_t dz = static_cast<nonbond_work_t>(rj.z - ri.z);
     atomicAdd(&d_dvelocities[ai].x, -dv * dx);
     atomicAdd(&d_dvelocities[ai].y, -dv * dy);
     atomicAdd(&d_dvelocities[ai].z, -dv * dz);
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index 097a3550..ce3f73ae 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -9,6 +9,20 @@ namespace CudaNonbondedForce {
 bool is_initialized = false;
 double *d_evdw_total, *d_ecoul_total;
 
+struct nonbond_vec_t {
+    nonbond_work_t x;
+    nonbond_work_t y;
+    nonbond_work_t z;
+};
+
+__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) {
+#ifdef QDYN_SPFP
+    return rsqrtf(value);
+#else
+    return rsqrt(value);
+#endif
+}
+
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
     y = t - (x * n - (x * (x - 1) >> 1));
@@ -46,20 +60,20 @@ __device__ void calculate_unforce_bound(
     const real_t charge_product,
     const vdw_pair_param_t& pair_param,
 
-    const double coulomb_constant,
+    const nonbond_work_t coulomb_constant,
 
-    const double scaling,
-    const double lambda,
+    const nonbond_work_t scaling,
+    const nonbond_work_t lambda,
 
-    double& evdw,
-    double& ecoul,
-    double& dv) {
-    const real_t dx = x.x - y.x;
-    const real_t dy = x.y - y.y;
-    const real_t dz = x.z - y.z;
-    const real_t r = rsqrt(dx * dx + dy * dy + dz * dz);
-    const real_t r2 = r * r;
-    const real_t r6 = r2 * r2 * r2;
+    nonbond_work_t& evdw,
+    nonbond_work_t& ecoul,
+    nonbond_work_t& dv) {
+    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
+    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
+    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
+    const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
+    const nonbond_work_t r2 = r * r;
+    const nonbond_work_t r6 = r2 * r2 * r2;
     // double v_a = r6 * r6;
     // double v_b = r6;
     // ecoul = r;
@@ -68,10 +82,10 @@ __device__ void calculate_unforce_bound(
 
     ecoul = scaling * coulomb_constant * charge_product * r * lambda;
 
-    const real_t v_a = pair_param.a * r6 * r6 * static_cast<real_t>(lambda);
-    const real_t v_b = pair_param.b * r6 * static_cast<real_t>(lambda);
+    const nonbond_work_t v_a = static_cast<nonbond_work_t>(pair_param.a) * r6 * r6 * lambda;
+    const nonbond_work_t v_b = static_cast<nonbond_work_t>(pair_param.b) * r6 * lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
+    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
 }
 
 __global__ void calc_nonbonded_force_kernel(
@@ -160,8 +174,8 @@ __global__ void calc_nonbonded_force_kernel(
     int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1;
     int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1;
 
-    double3 x_force = {0.0, 0.0, 0.0};
-    double3 y_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t x_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t y_force = {0.0, 0.0, 0.0};
 
     double evdw_sum = 0.0;
     double ecoul_sum = 0.0;
@@ -216,12 +230,14 @@ __global__ void calc_nonbonded_force_kernel(
         }
     }
 
+    const nonbond_work_t kernel_lambda = static_cast<nonbond_work_t>(lambda);
+    const nonbond_work_t coulomb_constant = static_cast<nonbond_work_t>(d_topo.coulomb_constant);
     const int charge_pair_row = x_charge_type_idx * n_charge_types;
     const int pair_row = (x_catype_type_idx >= 0) ? x_catype_type_idx * n_catype_types : 0;
 
     for (int i = 0; i < 32; i++) {
         if (is_valid()) {
-            double scaling = 1.0;
+            nonbond_work_t scaling = static_cast<nonbond_work_t>(1.0);
             real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
             vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx];
 
@@ -233,16 +249,16 @@ __global__ void calc_nonbonded_force_kernel(
             //     }
             // }
 
-            double evdw = 0, ecoul = 0, dv = 0;
+            nonbond_work_t evdw = 0, ecoul = 0, dv = 0;
 
             calculate_unforce_bound(
                 x_coord,
                 y_coord,
                 charge_product,
                 pair_param,
-                d_topo.coulomb_constant,
+                coulomb_constant,
                 scaling,
-                lambda,
+                kernel_lambda,
                 evdw,
                 ecoul,
                 dv);
@@ -250,9 +266,9 @@ __global__ void calc_nonbonded_force_kernel(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
 
-            const real_t dx = x_coord.x - y_coord.x;
-            const real_t dy = x_coord.y - y_coord.y;
-            const real_t dz = x_coord.z - y_coord.z;
+            const nonbond_work_t dx = static_cast<nonbond_work_t>(x_coord.x - y_coord.x);
+            const nonbond_work_t dy = static_cast<nonbond_work_t>(x_coord.y - y_coord.y);
+            const nonbond_work_t dz = static_cast<nonbond_work_t>(x_coord.z - y_coord.z);
             y_force.x -= dv * dx;
             y_force.y -= dv * dy;
             y_force.z -= dv * dz;
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index 9b0eb667..13c37fbc 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -46,7 +46,7 @@ __global__ void calc_polx_theta_and_shells(
     rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
     rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
-    rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
+    rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z);
 
     rmu.x /= rm;
     rmu.y /= rm;
@@ -55,7 +55,7 @@ __global__ void calc_polx_theta_and_shells(
     rcu.x = coords[wi].x - topo.solvent_center.x;
     rcu.y = coords[wi].y - topo.solvent_center.y;
     rcu.z = coords[wi].z - topo.solvent_center.z;
-    rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
+    rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z);
     rcu.x /= rc;
     rcu.y /= rc;
     rcu.z /= rc;
@@ -106,18 +106,19 @@ __global__ void calc_polx_water_forces_kernel(
     if (theta_val > M_PI) theta_val = M_PI;
 
     avtdum += theta[ii];
-    ener = .5 * md.polarisation_force * pow(theta[ii] - theta_val + wshells[is].theta_corr, 2);
+    const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
+    ener = .5 * md.polarisation_force * dtheta * dtheta;
     // E_restraint.Upolx += ener;
     atomicAdd(energy, ener);
 
-    dv = md.polarisation_force * (theta[ii] - theta_val + wshells[is].theta_corr);
+    dv = md.polarisation_force * dtheta;
     wi = n_atoms_solute + 3 * ii;
 
     rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x;
     rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
     rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
-    rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
+    rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z);
 
     rmu.x /= rm;
     rmu.y /= rm;
@@ -126,7 +127,7 @@ __global__ void calc_polx_water_forces_kernel(
     rcu.x = coords[wi].x - topo.solvent_center.x;
     rcu.y = coords[wi].y - topo.solvent_center.y;
     rcu.z = coords[wi].z - topo.solvent_center.z;
-    rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
+    rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z);
     rcu.x /= rc;
     rcu.y /= rc;
     rcu.z /= rc;
diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu
index a01fb536..5221cb9e 100644
--- a/src/core/cuda/src/cuda_pshell_force.cu
+++ b/src/core/cuda/src/cuda_pshell_force.cu
@@ -34,7 +34,7 @@ __global__ void calc_pshell_force_kernel(
         dr.x = coords[i].x - coords_init[i].x;
         dr.y = coords[i].y - coords_init[i].y;
         dr.z = coords[i].z - coords_init[i].z;
-        r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
+        r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
         ener = 0.5 * k * r2;
         // printf("dr = %f %f %f\n", dr.x, dr.y, dr.z);
 
diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu
index 06f5f5a3..f037e9db 100644
--- a/src/core/cuda/src/cuda_radix_water_force.cu
+++ b/src/core/cuda/src/cuda_radix_water_force.cu
@@ -29,18 +29,18 @@ __global__ void calc_radix_water_forces_kernel(
     dr.x = coords[i].x - topo.solvent_center.x;
     dr.y = coords[i].y - topo.solvent_center.y;
     dr.z = coords[i].z - topo.solvent_center.z;
-    double b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
+    double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
     double db = b - (topo.solvent_radius - shift);
 
     double ener, dv;
     if (db > 0) {
-        ener = 0.5 * md.radial_force * pow(db, 2) - Dwmz;
+        ener = 0.5 * md.radial_force * db * db - Dwmz;
         dv = md.radial_force * db / b;
     } else {
         if (b > 0.0) {
             double fexp = exp(awmz * db);
-            ener = Dwmz * (pow(fexp, 2) - 2 * fexp);
-            dv = -2 * Dwmz * awmz * (fexp - pow(fexp, 2)) / b;
+            ener = Dwmz * (fexp * fexp - 2 * fexp);
+            dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b;
         } else {
             dv = 0;
             ener = 0;
@@ -91,7 +91,6 @@ void calc_radix_water_forces_host() {
                                                              d_dvelocities,
                                                              d_energy);
     check_cuda(cudaDeviceSynchronize());
-    host.dvelocities->download();
     check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost));
     host.E_restraint.Uradx += energy;
 }
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index eb0813f5..b214aee9 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -45,8 +45,8 @@ __global__ void calc_restrang_force_kernel(
         lambda = 1;
     }
 
-    r2ij = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
-    r2jk = pow(dr2.x, 2) + pow(dr2.y, 2) + pow(dr2.z, 2);
+    r2ij = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
+    r2jk = dr2.x * dr2.x + dr2.y * dr2.y + dr2.z * dr2.z;
 
     rij = sqrt(r2ij);
     rjk = sqrt(r2jk);
@@ -60,7 +60,7 @@ __global__ void calc_restrang_force_kernel(
     th = acos(cos_th);
     dth = th - to_radians_device(restrangs[ir].ang);
 
-    ener = .5 * restrangs[ir].k * pow(dth, 2);
+    ener = .5 * restrangs[ir].k * dth * dth;
     dv = lambda * restrangs[ir].k * dth;
 
     f1 = sin(th);
diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu
index 9aacf977..14f9b466 100644
--- a/src/core/cuda/src/cuda_restrdis_force.cu
+++ b/src/core/cuda/src/cuda_restrdis_force.cu
@@ -40,7 +40,7 @@ __global__ void calc_restrdis_forces_kernel(
         lambda = 1;
     }
 
-    b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
+    b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
     if (b < restrdists[ir].d1) {
         db = b - restrdists[ir].d1;
     } else if (b > restrdists[ir].d2) {
@@ -50,7 +50,7 @@ __global__ void calc_restrdis_forces_kernel(
         return;
     }
 
-    ener = .5 * restrdists[ir].k * pow(db, 2);
+    ener = .5 * restrdists[ir].k * db * db;
     dv = lambda * restrdists[ir].k * db / b;
 
     atomicAdd(&dvelocities[j].x, dr.x * dv);
diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu
index 5f479364..695e2b33 100644
--- a/src/core/cuda/src/cuda_restrpos_force.cu
+++ b/src/core/cuda/src/cuda_restrpos_force.cu
@@ -39,9 +39,9 @@ __global__ void calc_restrpos_forces_kernel(
         lambda = 1;
     }
 
-    x2 = pow(dr.x, 2);
-    y2 = pow(dr.y, 2);
-    z2 = pow(dr.z, 2);
+    x2 = dr.x * dr.x;
+    y2 = dr.y * dr.y;
+    z2 = dr.z * dr.z;
 
     ener = .5 * restrspos[ir].k.x * x2 + .5 * restrspos[ir].k.y * y2 + .5 * restrspos[ir].k.z * z2;
 
diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu
index b5db3552..71835e4e 100644
--- a/src/core/cuda/src/cuda_restrseq_force.cu
+++ b/src/core/cuda/src/cuda_restrseq_force.cu
@@ -46,7 +46,7 @@ __global__ void calc_restrseq_forces_kernel(
             dr.x /= n_ctr;
             dr.y /= n_ctr;
             dr.z /= n_ctr;
-            r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
+            r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
             ener = .5 * k * r2;
             atomicAdd(upres_energy, ener);
 
@@ -77,7 +77,7 @@ __global__ void calc_restrseq_forces_kernel(
             dr.x /= totmass;
             dr.y /= totmass;
             dr.z /= totmass;
-            r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
+            r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
             ener = .5 * k * r2;
             atomicAdd(upres_energy, ener);
 
@@ -100,7 +100,7 @@ __global__ void calc_restrseq_forces_kernel(
                 dr.y = coords[i].y - coords_init[i].y;
                 dr.z = coords[i].z - coords_init[i].z;
 
-                r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
+                r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
                 ener = .5 * k * r2;
                 atomicAdd(upres_energy, ener);
 
diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu
index 12d890ad..c928bb71 100644
--- a/src/core/cuda/src/cuda_restrwall_force.cu
+++ b/src/core/cuda/src/cuda_restrwall_force.cu
@@ -29,11 +29,11 @@ __global__ void calc_restrwall_forces_kernel(
             dr.y = coords[i].y - topo.solvent_center.y;
             dr.z = coords[i].z - topo.solvent_center.z;
 
-            b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
+            b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
             db = b - restrwalls[ir].d;
 
             if (db > 0) {
-                ener = .5 * k * pow(db, 2) - restrwalls[ir].dMorse;
+                ener = .5 * k * db * db - restrwalls[ir].dMorse;
                 dv = k * db / b;
             } else {
                 fexp = exp(restrwalls[ir].aMorse * db);
diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu
index e9dfd051..bda47e50 100644
--- a/src/core/cuda/src/cuda_shake_constraints.cu
+++ b/src/core/cuda/src/cuda_shake_constraints.cu
@@ -48,7 +48,7 @@ __global__ void calc_shake_constraints_kernel(
                     xij.x = coords[ai].x - coords[aj].x;
                     xij.y = coords[ai].y - coords[aj].y;
                     xij.z = coords[ai].z - coords[aj].z;
-                    xij2 = pow(xij.x, 2) + pow(xij.y, 2) + pow(xij.z, 2);
+                    xij2 = xij.x * xij.x + xij.y * xij.y + xij.z * xij.z;
                     diff = shake_bonds[shake + i].dist2 - xij2;
                     if (fabs(diff) < shake_tol * shake_bonds[shake + i].dist2) {
                         shake_bonds[shake + i].ready = true;
@@ -86,7 +86,7 @@ __global__ void calc_shake_constraints_kernel(
                 xxij.x = xcoords[ai].x - xcoords[aj].x;
                 xxij.y = xcoords[ai].y - xcoords[aj].y;
                 xxij.z = xcoords[ai].z - xcoords[aj].z;
-                xxij2 = pow(xxij.x, 2) + pow(xxij.y, 2) + pow(xxij.z, 2);
+                xxij2 = xxij.x * xxij.x + xxij.y * xxij.y + xxij.z * xxij.z;
                 printf(">>> Shake failed, i = %d,j = %d, d = %f, d0 = %f", ai, aj, sqrt(xxij2), shake_bonds[shake + i].dist2);
             }
             return;
@@ -154,6 +154,5 @@ int calc_shake_constraints_host() {
         d_mol_shake_offset);
     cudaDeviceSynchronize();
     cudaMemcpy(&total_iterations_host, d_total_iterations, sizeof(int), cudaMemcpyDeviceToHost);
-    host.coords->download();
     return host.n_molecules == 0 ? 0 : total_iterations_host / host.n_molecules;
 }
diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu
index a02c6cf7..baba687e 100644
--- a/src/core/cuda/src/cuda_temperature.cu
+++ b/src/core/cuda/src/cuda_temperature.cu
@@ -19,7 +19,10 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
     double mass_i = catypes[atypes[idx].code - 1].m;
-    double ener = .5 * mass_i * (pow(velocities[idx].x, 2) + pow(velocities[idx].y, 2) + pow(velocities[idx].z, 2));
+    const double vx = velocities[idx].x;
+    const double vy = velocities[idx].y;
+    const double vz = velocities[idx].z;
+    double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
     bool is_solute = (idx < n_atoms_solute);
     bool is_excluded = excluded[idx];
 
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 6ef7cd45..97b687a6 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -57,8 +57,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z;
     rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x;
 
-    bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2));
-    bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2));
+    bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z);
+    bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z);
     bjinv = sqrt(bj2inv);
     bkinv = sqrt(bk2inv);
 

From d32c490b3ea8576ef3036ce8d8fcfc1332375bee Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Wed, 29 Apr 2026 17:14:15 +0200
Subject: [PATCH 14/20] Makefile: build objects and qdyn binary under .build/<precision>/

---
 src/core/Makefile | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/core/Makefile b/src/core/Makefile
index 0b607d06..6acc5da1 100644
--- a/src/core/Makefile
+++ b/src/core/Makefile
@@ -1,43 +1,44 @@
 CC = nvcc
-CFLAGS = -O3 -std=c++17 -arch=sm_86 -I./cuda/include -I./common/include -I./cpu/include -I.
-DEPFLAGS = -MMD -MF $(@:.o=.d)
-QDYN_SPFP ?= 0
-
-ifneq ($(filter 1 true TRUE yes YES on ON,$(QDYN_SPFP)),)
-CFLAGS += -DQDYN_SPFP
-PRECISION = spfp
-else
-PRECISION = dpfp
+SPFPFLAGS =
+ifeq ($(QDYN_SPFP),1)
+SPFPFLAGS += -DQDYN_SPFP
 endif
-
-BUILD_DIR = build/$(PRECISION)
+CFLAGS = -O3 -std=c++17 -arch=sm_86 $(SPFPFLAGS) -I./cuda/include -I./common/include -I./cpu/include -I.
+DEPFLAGS = -MMD -MF $(@:.o=.d)
+BUILD_MODE = $(if $(filter 1,$(QDYN_SPFP)),spfp,dpfp)
+OBJDIR = .build/$(BUILD_MODE)
+TARGET = $(OBJDIR)/qdyn
 
 # collect all .cu files except main.cu
 SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu))
 CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp)
-MAIN_OBJ = $(BUILD_DIR)/main.cu.o
-OBJS = $(addprefix $(BUILD_DIR)/,$(SRCS:.cu=.cu.o)) $(addprefix $(BUILD_DIR)/,$(CPPSRCS:.cpp=.cpp.o))
+MAIN_OBJ = $(OBJDIR)/main.o
+OBJS = $(addprefix $(OBJDIR)/,$(SRCS:.cu=.o)) $(addprefix $(OBJDIR)/,$(CPPSRCS:.cpp=.o))
 DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d)
 
 all: qdyn move
 
-qdyn: $(MAIN_OBJ) $(OBJS)
+qdyn: $(TARGET)
+	cp $< $@
+
+$(TARGET): $(MAIN_OBJ) $(OBJS)
 	$(CC) $(CFLAGS) -o $@ $^
 
-$(BUILD_DIR)/%.cu.o: %.cu
-	mkdir -p $(dir $@)
+$(OBJDIR)/%.o: %.cu
+	mkdir -p $(@D)
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
-$(BUILD_DIR)/%.cpp.o: %.cpp
-	mkdir -p $(dir $@)
+$(OBJDIR)/%.o: %.cpp
+	mkdir -p $(@D)
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
 clean:
-	rm -rf build
-	rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o cpu/src/*.d ../../bin/qdyn
+	rm -rf .build qdyn ../../bin/qdyn
 
-move:
+move: $(TARGET)
 	mkdir -p ../../bin
-	mv qdyn ../../bin/
+	cp $< ../../bin/qdyn
+
+.PHONY: all qdyn clean move
 
 -include $(DEPS)

From 100d1c0987afede48eea874592523b9ef761d733 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Wed, 29 Apr 2026 20:04:15 +0200
Subject: [PATCH 15/20] use force_accum_t for dvel_t (float in QDYN_SPFP builds)

---
 src/core/common/include/md_types.h         |   6 +-
 src/core/common/include/precision.h        |   9 +-
 src/core/cpu/src/cpu_angle_force.cpp       |   4 +-
 src/core/cpu/src/cpu_improper2_force.cpp   |   4 +-
 src/core/cpu/src/cpu_polx_water_force.cpp  |   4 +-
 src/core/cpu/src/cpu_q_angle_force.cpp     |   4 +-
 src/core/cpu/src/cpu_q_torsion_force.cpp   |   4 +-
 src/core/cpu/src/cpu_restrang_force.cpp    |   4 +-
 src/core/cpu/src/cpu_torsion_force.cpp     |   4 +-
 src/core/cuda/src/cuda_angle_force.cu      |   4 +-
 src/core/cuda/src/cuda_improper2_force.cu  |   2 +-
 src/core/cuda/src/cuda_nonbonded_force.cu  | 126 +++++++++++----------
 src/core/cuda/src/cuda_polx_water_force.cu |   2 +-
 src/core/cuda/src/cuda_restrang_force.cu   |   4 +-
 src/core/cuda/src/cuda_torsion_force.cu    |   2 +-
 15 files changed, 99 insertions(+), 84 deletions(-)

diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index 6a4d2865..27c20cef 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -310,9 +310,9 @@ struct vel_t {
 };
 
 struct dvel_t {
-    double x;
-    double y;
-    double z;
+    force_accum_t x;
+    force_accum_t y;
+    force_accum_t z;
 };
 
 struct E_bonded_t {
diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
index fc633f45..80b790f7 100644
--- a/src/core/common/include/precision.h
+++ b/src/core/common/include/precision.h
@@ -3,11 +3,18 @@
 #ifdef QDYN_SPFP
 using real_t = float;
 using nonbond_work_t = float;
+using force_accum_t = float;
 #else
 using real_t = double;
 using nonbond_work_t = double;
+using force_accum_t = double;
 #endif
 
 using energy_accum_t = double;
-using force_accum_t = double;
 using constraint_work_t = double;
+
+#ifdef QDYN_SPFP
+constexpr double k_singular_sin_epsilon = 1.0e-6;
+#else
+constexpr double k_singular_sin_epsilon = 1.0e-12;
+#endif
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp
index a9c29c1e..ae600561 100644
--- a/src/core/cpu/src/cpu_angle_force.cpp
+++ b/src/core/cpu/src/cpu_angle_force.cpp
@@ -64,9 +64,9 @@ double calc_angle_forces(int start, int end) {
         dv = cangle.kth * dth;
 
         f1 = sin(th);
-        if (std::fabs(f1) < 1.0E-12) {
+        if (std::fabs(f1) < k_singular_sin_epsilon) {
             // Avoid division by zero
-            f1 = -1.0E12;
+            f1 = -1.0 / k_singular_sin_epsilon;
         } else {
             f1 = -1.0 / f1;
         }
diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp
index af73a9cc..6e4faa60 100644
--- a/src/core/cpu/src/cpu_improper2_force.cpp
+++ b/src/core/cpu/src/cpu_improper2_force.cpp
@@ -79,8 +79,8 @@ double calc_improper2_forces(int start, int end) {
 
         // Forces
         f1 = sin(phi);
-        if (std::fabs(f1) < 1E-12) {
-            f1 = 1E-12;
+        if (std::fabs(f1) < k_singular_sin_epsilon) {
+            f1 = std::copysign(k_singular_sin_epsilon, f1);
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp
index 9d0e4711..5116dbbb 100644
--- a/src/core/cpu/src/cpu_polx_water_force.cpp
+++ b/src/core/cpu/src/cpu_polx_water_force.cpp
@@ -158,8 +158,8 @@ void calc_polx_w_forces(int iteration) {
                 cos_th = -1;
             }
             f0 = sin(acos(cos_th));
-            if (fabs(f0) < 1.0E-12) {
-                f0 = 1.0E-12;
+            if (fabs(f0) < k_singular_sin_epsilon) {
+                f0 = k_singular_sin_epsilon;
             }
             f0 = -1.0 / f0;
             f0 *= dv;
diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp
index c9c2ea65..14aa802c 100644
--- a/src/core/cpu/src/cpu_q_angle_force.cpp
+++ b/src/core/cpu/src/cpu_q_angle_force.cpp
@@ -56,8 +56,8 @@ void calc_qangle_forces(int state) {
 
         dv = ctx.q_cangles[ic].kth * dth * lambdas[state];
         f1 = sin(th);
-        if (abs(f1) < 1E-12) {
-            f1 = 1E-12;
+        if (fabs(f1) < k_singular_sin_epsilon) {
+            f1 = k_singular_sin_epsilon;
         }
         f1 = -1.0 / f1;
 
diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp
index be309347..7b7fb271 100644
--- a/src/core/cpu/src/cpu_q_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_q_torsion_force.cpp
@@ -76,8 +76,8 @@ void calc_qtorsion_forces(int state) {
 
         // Forces
         f1 = sin(phi);
-        if (abs(f1) < 1E-12) {
-            f1 = 1E-12;
+        if (fabs(f1) < k_singular_sin_epsilon) {
+            f1 = copysign(k_singular_sin_epsilon, f1);
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp
index d809a9c1..84f593b0 100644
--- a/src/core/cpu/src/cpu_restrang_force.cpp
+++ b/src/core/cpu/src/cpu_restrang_force.cpp
@@ -61,8 +61,8 @@ void calc_restrang_forces() {
         dv = lambda * restrangs[ir].k * dth;
 
         f1 = sin(th);
-        if (fabs(f1) < 1E-12) {
-            f1 = -1E-12;
+        if (fabs(f1) < k_singular_sin_epsilon) {
+            f1 = -1.0 / k_singular_sin_epsilon;
         } else {
             f1 = -1 / f1;
         }
diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp
index e8aaa2a3..4ebb44b2 100644
--- a/src/core/cpu/src/cpu_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_torsion_force.cpp
@@ -88,8 +88,8 @@ double calc_torsion_forces(int start, int end) {
 
         // Forces
         f1 = sin(phi);
-        if (std::fabs(f1) < 1E-12) {
-            f1 = 1E-12;
+        if (std::fabs(f1) < k_singular_sin_epsilon) {
+            f1 = std::copysign(k_singular_sin_epsilon, f1);
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index dcd044ce..f20b039a 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -39,8 +39,8 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     double dv = cang.kth * dtheta;
 
     double f1 = sin(theta);
-    if (fabs(f1) < 1e-12) {
-        f1 = -1.0e12;
+    if (fabs(f1) < k_singular_sin_epsilon) {
+        f1 = -1.0 / k_singular_sin_epsilon;
     } else {
         f1 = -1.0 / f1;
     }
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index 78707b12..dd7d91aa 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -76,7 +76,7 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
 
     // Forces
     f1 = sin(phi);
-    if (fabs(f1) < 1E-12) f1 = 1E-12;
+    if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1);
     f1 = -1 / f1;
     // printf("f1 = %f phi = %f cos_phi = %f\n", f1, phi, cos_phi);
 
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index ce3f73ae..32b4077a 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -9,18 +9,19 @@ namespace CudaNonbondedForce {
 bool is_initialized = false;
 double *d_evdw_total, *d_ecoul_total;
 
+template <typename WorkT>
 struct nonbond_vec_t {
-    nonbond_work_t x;
-    nonbond_work_t y;
-    nonbond_work_t z;
+    WorkT x;
+    WorkT y;
+    WorkT z;
 };
 
-__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) {
-#ifdef QDYN_SPFP
+__device__ __forceinline__ float nonbond_rsqrt(float value) {
     return rsqrtf(value);
-#else
+}
+
+__device__ __forceinline__ double nonbond_rsqrt(double value) {
     return rsqrt(value);
-#endif
 }
 
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
@@ -53,6 +54,7 @@ __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned m
     return v;
 }
 
+template <typename WorkT>
 __device__ void calculate_unforce_bound(
     const coord_t& x,
     const coord_t& y,
@@ -60,20 +62,20 @@ __device__ void calculate_unforce_bound(
     const real_t charge_product,
     const vdw_pair_param_t& pair_param,
 
-    const nonbond_work_t coulomb_constant,
+    const WorkT coulomb_constant,
 
-    const nonbond_work_t scaling,
-    const nonbond_work_t lambda,
+    const WorkT scaling,
+    const WorkT lambda,
 
-    nonbond_work_t& evdw,
-    nonbond_work_t& ecoul,
-    nonbond_work_t& dv) {
-    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
-    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
-    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
-    const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
-    const nonbond_work_t r2 = r * r;
-    const nonbond_work_t r6 = r2 * r2 * r2;
+    WorkT& evdw,
+    WorkT& ecoul,
+    WorkT& dv) {
+    const WorkT dx = static_cast<WorkT>(x.x - y.x);
+    const WorkT dy = static_cast<WorkT>(x.y - y.y);
+    const WorkT dz = static_cast<WorkT>(x.z - y.z);
+    const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
+    const WorkT r2 = r * r;
+    const WorkT r6 = r2 * r2 * r2;
     // double v_a = r6 * r6;
     // double v_b = r6;
     // ecoul = r;
@@ -82,12 +84,13 @@ __device__ void calculate_unforce_bound(
 
     ecoul = scaling * coulomb_constant * charge_product * r * lambda;
 
-    const nonbond_work_t v_a = static_cast<nonbond_work_t>(pair_param.a) * r6 * r6 * lambda;
-    const nonbond_work_t v_b = static_cast<nonbond_work_t>(pair_param.b) * r6 * lambda;
+    const WorkT v_a = static_cast<WorkT>(pair_param.a) * r6 * r6 * lambda;
+    const WorkT v_b = static_cast<WorkT>(pair_param.b) * r6 * lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
+    dv = r2 * (-ecoul - static_cast<WorkT>(12.0) * v_a + static_cast<WorkT>(6.0) * v_b);
 }
 
+template <typename WorkT>
 __global__ void calc_nonbonded_force_kernel(
     const int nx,
     const int ny,
@@ -174,8 +177,8 @@ __global__ void calc_nonbonded_force_kernel(
     int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1;
     int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1;
 
-    nonbond_vec_t x_force = {0.0, 0.0, 0.0};
-    nonbond_vec_t y_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t<WorkT> x_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t<WorkT> y_force = {0.0, 0.0, 0.0};
 
     double evdw_sum = 0.0;
     double ecoul_sum = 0.0;
@@ -230,14 +233,14 @@ __global__ void calc_nonbonded_force_kernel(
         }
     }
 
-    const nonbond_work_t kernel_lambda = static_cast<nonbond_work_t>(lambda);
-    const nonbond_work_t coulomb_constant = static_cast<nonbond_work_t>(d_topo.coulomb_constant);
+    const WorkT kernel_lambda = static_cast<WorkT>(lambda);
+    const WorkT coulomb_constant = static_cast<WorkT>(d_topo.coulomb_constant);
     const int charge_pair_row = x_charge_type_idx * n_charge_types;
     const int pair_row = (x_catype_type_idx >= 0) ? x_catype_type_idx * n_catype_types : 0;
 
     for (int i = 0; i < 32; i++) {
         if (is_valid()) {
-            nonbond_work_t scaling = static_cast<nonbond_work_t>(1.0);
+            WorkT scaling = static_cast<WorkT>(1.0);
             real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
             vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx];
 
@@ -249,7 +252,7 @@ __global__ void calc_nonbonded_force_kernel(
             //     }
             // }
 
-            nonbond_work_t evdw = 0, ecoul = 0, dv = 0;
+            WorkT evdw = 0, ecoul = 0, dv = 0;
 
             calculate_unforce_bound(
                 x_coord,
@@ -266,9 +269,9 @@ __global__ void calc_nonbonded_force_kernel(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
 
-            const nonbond_work_t dx = static_cast<nonbond_work_t>(x_coord.x - y_coord.x);
-            const nonbond_work_t dy = static_cast<nonbond_work_t>(x_coord.y - y_coord.y);
-            const nonbond_work_t dz = static_cast<nonbond_work_t>(x_coord.z - y_coord.z);
+            const WorkT dx = static_cast<WorkT>(x_coord.x - y_coord.x);
+            const WorkT dy = static_cast<WorkT>(x_coord.y - y_coord.y);
+            const WorkT dz = static_cast<WorkT>(x_coord.z - y_coord.z);
             y_force.x -= dv * dx;
             y_force.y -= dv * dy;
             y_force.z -= dv * dz;
@@ -334,34 +337,39 @@ std::pair<double, double> calc_nonbonded_force_host(
     cudaMemset(d_ecoul_total, 0, sizeof(double));
     cudaMemset(d_evdw_total, 0, sizeof(double));
 
-    calc_nonbonded_force_kernel<<<grid, block_sz>>>(
-        nx,
-        ny,
-        x_charges_types,
-        y_charges_types,
-        host.charge_pair_products->gpu_data_p,
-        x_atypes_types,
-        y_atypes_types,
-        host.catype_pair_params->gpu_data_p,
-        host.topo,
-        host.excluded->gpu_data_p,
-        host.LJ_matrix->gpu_data_p,
-        x_idx_list,
-        y_idx_list,
-        host.coords->gpu_data_p,
-        host.dvelocities->gpu_data_p,
-        d_evdw_total,
-        d_ecoul_total,
-        symmetric,
-        disable_water_h_lj,
-        host.n_atoms_solute,
-        host.n_charge_types,
-        host.zero_charge_type,
-        host.n_catype_types,
-        host.zero_catype_type,
-        host.n_qelscales,
-        lambda,
-        host.q_elscales->gpu_data_p);
+    auto launch_kernel = [&](auto work_tag) {
+        using WorkT = decltype(work_tag);
+        calc_nonbonded_force_kernel<WorkT><<<grid, block_sz>>>(
+            nx,
+            ny,
+            x_charges_types,
+            y_charges_types,
+            host.charge_pair_products->gpu_data_p,
+            x_atypes_types,
+            y_atypes_types,
+            host.catype_pair_params->gpu_data_p,
+            host.topo,
+            host.excluded->gpu_data_p,
+            host.LJ_matrix->gpu_data_p,
+            x_idx_list,
+            y_idx_list,
+            host.coords->gpu_data_p,
+            host.dvelocities->gpu_data_p,
+            d_evdw_total,
+            d_ecoul_total,
+            symmetric,
+            disable_water_h_lj,
+            host.n_atoms_solute,
+            host.n_charge_types,
+            host.zero_charge_type,
+            host.n_catype_types,
+            host.zero_catype_type,
+            host.n_qelscales,
+            lambda,
+            host.q_elscales->gpu_data_p);
+    };
+
+    launch_kernel(nonbond_work_t{});
 
     cudaDeviceSynchronize();
 
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index 13c37fbc..7be0656f 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -136,7 +136,7 @@ __global__ void calc_polx_water_forces_kernel(
     if (cos_th > 1) cos_th = 1;
     if (cos_th < -1) cos_th = -1;
     f0 = sin(acos(cos_th));
-    if (abs(f0) < 1.0E-12) f0 = 1.0E-12;
+    if (abs(f0) < k_singular_sin_epsilon) f0 = k_singular_sin_epsilon;
     f0 = -1.0 / f0;
     f0 *= dv;
 
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index b214aee9..567a78df 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -64,8 +64,8 @@ __global__ void calc_restrang_force_kernel(
     dv = lambda * restrangs[ir].k * dth;
 
     f1 = sin(th);
-    if (fabs(f1) < 1E-12) {
-        f1 = -1E-12;
+    if (fabs(f1) < k_singular_sin_epsilon) {
+        f1 = -1.0 / k_singular_sin_epsilon;
     } else {
         f1 = -1 / f1;
     }
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 97b687a6..5baffbde 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -76,7 +76,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
 
     // Forces
     f1 = sin(phi);
-    if (fabs(f1) < 1E-12) f1 = 1E-12;
+    if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1);
     f1 = -1 / f1;
 
     di.x = f1 * (rnk.x * (bjinv * bkinv) - cos_phi * rnj.x * bj2inv);

From 840e1c9f16adee5b330a6a34dd3ef605f17e9a30 Mon Sep 17 00:00:00 2001
From: shen <goodstudyqaq@gmail.com>
Date: Wed, 29 Apr 2026 20:48:27 +0200
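Subject: [PATCH 16/20] switch double fields and signatures to real_t

Replace hard-coded double with real_t in Context, the md_types.h structs,
init.cpp/parse.cpp and the CPU force-routine signatures, so that these
follow the precision selected in precision.h. energy_accum_t and
constraint_work_t move inside the same conditional as real_t: they are
float whenever real_t is float and double otherwise.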
---
 src/core/common/include/context.h             |  32 ++---
 src/core/common/include/md_types.h            | 128 +++++++++---------
 src/core/common/include/precision.h           |   5 +-
 src/core/common/src/init.cpp                  |  56 ++++----
 src/core/common/src/parse.cpp                 |   2 +-
 src/core/cpu/include/cpu_angle_force.h        |   5 +-
 src/core/cpu/include/cpu_bond_force.h         |   4 +-
 src/core/cpu/include/cpu_improper2_force.h    |   4 +-
 src/core/cpu/include/cpu_torsion_force.h      |   4 +-
 src/core/cpu/include/cpu_utils.h              |   8 +-
 src/core/cpu/src/cpu_angle_force.cpp          |  10 +-
 src/core/cpu/src/cpu_bond_force.cpp           |   6 +-
 src/core/cpu/src/cpu_improper2_force.cpp      |   8 +-
 src/core/cpu/src/cpu_leapfrog.cpp             |   4 +-
 src/core/cpu/src/cpu_nonbonded_pp_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_pw_force.cpp   |   4 +-
 src/core/cpu/src/cpu_nonbonded_qp_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_qq_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_qw_force.cpp   |   4 +-
 src/core/cpu/src/cpu_nonbonded_ww_force.cpp   |   4 +-
 src/core/cpu/src/cpu_polx_water_force.cpp     |  18 +--
 src/core/cpu/src/cpu_pshell_force.cpp         |   2 +-
 src/core/cpu/src/cpu_q_angle_force.cpp        |   4 +-
 src/core/cpu/src/cpu_q_bond_force.cpp         |   2 +-
 src/core/cpu/src/cpu_q_torsion_force.cpp      |   8 +-
 src/core/cpu/src/cpu_radix_water_force.cpp    |   4 +-
 src/core/cpu/src/cpu_restrang_force.cpp       |   4 +-
 src/core/cpu/src/cpu_restrdis_force.cpp       |   2 +-
 src/core/cpu/src/cpu_restrpos_force.cpp       |   2 +-
 src/core/cpu/src/cpu_restrseq_force.cpp       |   4 +-
 src/core/cpu/src/cpu_restrwall_force.cpp      |   2 +-
 src/core/cpu/src/cpu_shake.cpp                |   8 +-
 src/core/cpu/src/cpu_temperature.cpp          |  10 +-
 src/core/cpu/src/cpu_torsion_force.cpp        |  12 +-
 src/core/cpu/src/utils.cpp                    |  15 +-
 src/core/cuda/include/cuda_angle_force.cuh    |   4 +-
 src/core/cuda/include/cuda_bond_force.cuh     |   4 +-
 .../cuda/include/cuda_improper2_force.cuh     |   4 +-
 .../cuda/include/cuda_nonbonded_force.cuh     |   8 +-
 src/core/cuda/include/cuda_torsion_force.cuh  |   4 +-
 src/core/cuda/include/cuda_utility.cuh        |   3 +-
 src/core/cuda/src/cuda_angle_force.cu         |  33 ++---
 src/core/cuda/src/cuda_bond_force.cu          |  28 ++--
 src/core/cuda/src/cuda_improper2_force.cu     |  18 +--
 src/core/cuda/src/cuda_leapfrog.cu            |  12 +-
 src/core/cuda/src/cuda_nonbonded_14_force.cu  |  30 ++--
 src/core/cuda/src/cuda_nonbonded_force.cu     |  38 +++---
 src/core/cuda/src/cuda_polx_water_force.cu    |  54 ++++----
 src/core/cuda/src/cuda_pshell_force.cu        |  26 ++--
 src/core/cuda/src/cuda_radix_water_force.cu   |  28 ++--
 src/core/cuda/src/cuda_restrang_force.cu      |  18 +--
 src/core/cuda/src/cuda_restrdis_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrpos_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrseq_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrwall_force.cu     |  14 +-
 src/core/cuda/src/cuda_shake_constraints.cu   |   4 +-
 src/core/cuda/src/cuda_temperature.cu         |  66 ++++-----
 src/core/cuda/src/cuda_torsion_force.cu       |  25 ++--
 58 files changed, 452 insertions(+), 420 deletions(-)

diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h
index 83817bb8..ee516d7a 100644
--- a/src/core/common/include/context.h
+++ b/src/core/common/include/context.h
@@ -32,8 +32,8 @@ class Context {
     int n_qatoms = 0;
     int n_waters = 0;
     int n_molecules = 0;
-    double dt = 0.0;
-    double tau_T = 0.0;
+    real_t dt = 0.0;
+    real_t tau_T = 0.0;
     md_t md;
     topo_t topo;
     int n_excluded = 0;
@@ -108,7 +108,7 @@ class Context {
 
     std::unique_ptr<HostDeviceBuffer<bool>> excluded;
 
-    std::unique_ptr<HostDeviceBuffer<double>> winv;
+    std::unique_ptr<HostDeviceBuffer<real_t>> winv;
 
     std::unique_ptr<HostDeviceBuffer<bool>> shell;
 
@@ -137,12 +137,12 @@ class Context {
     Water
     */
     std::unique_ptr<HostDeviceBuffer<shell_t>> wshells;
-    double crgQtot = 0.0;
-    double Dwmz = 0.0;
-    double awmz = 0.0;
-    std::vector<double> theta;
-    std::vector<double> theta0;
-    std::vector<double> tdum;
+    real_t crgQtot = 0.0;
+    real_t Dwmz = 0.0;
+    real_t awmz = 0.0;
+    std::vector<real_t> theta;
+    std::vector<real_t> theta0;
+    std::vector<real_t> tdum;
     int n_max_inshell = 0;
     int n_shells = 0;
     std::vector<std::vector<int>> list_sh;
@@ -152,7 +152,7 @@ class Context {
     /*
     FEP
     */
-    std::unique_ptr<HostDeviceBuffer<double>> lambdas; // Actually length is only 2..
+    std::unique_ptr<HostDeviceBuffer<real_t>> lambdas; // Actually length is only 2..
 
     /*
     Energy
@@ -206,13 +206,13 @@ class Context {
     Temperature
     */
 
-    double Temp = 0.0;
-    double Tfree = 0.0;
-    double Ndegf = 0.0;
-    double Ndegfree = 0.0;
+    real_t Temp = 0.0;
+    real_t Tfree = 0.0;
+    real_t Ndegf = 0.0;
+    real_t Ndegfree = 0.0;
 
-    double Tscale_solute = 0.0;
-    double Tscale_solvent = 0.0;
+    real_t Tscale_solute = 0.0;
+    real_t Tscale_solvent = 0.0;
     /*
     Info for FEP
     */
diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index 27c20cef..dd5ef21d 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -12,29 +12,29 @@
 struct md_t {
     // [MD]
     int steps;
-    double stepsize;
-    double temperature;
+    real_t stepsize;
+    real_t temperature;
     char thermostat[40];
-    double bath_coupling;
+    real_t bath_coupling;
     int random_seed;
-    double initial_temperature;
+    real_t initial_temperature;
     bool shake_solvent;
     bool shake_solute;
     bool shake_hydrogens;
     bool lrf;
     bool charge_groups;
     // [cut-offs]
-    double solute_solute;
-    double solvent_solvent;
-    double solute_solvent;
-    double q_atom;
+    real_t solute_solute;
+    real_t solvent_solvent;
+    real_t solute_solvent;
+    real_t q_atom;
     // [sphere]
-    double shell_radius;  // Note: this is for the pshell
-    double shell_force;   // Note: this is for the pshell
+    real_t shell_radius;  // Note: this is for the pshell
+    real_t shell_force;   // Note: this is for the pshell
     // [solvent]
-    double radial_force;
+    real_t radial_force;
     bool polarisation;
-    double polarisation_force;
+    real_t polarisation_force;
     // [intervals]
     int non_bond;
     int output;
@@ -62,8 +62,8 @@ struct bond_t {
 
 struct cbond_t {
     int code;
-    double kb;
-    double b0;
+    real_t kb;
+    real_t b0;
 };
 
 struct angle_t {
@@ -75,8 +75,8 @@ struct angle_t {
 
 struct cangle_t {
     int code;
-    double kth;
-    double th0;
+    real_t kth;
+    real_t th0;
 };
 
 struct torsion_t {
@@ -89,10 +89,10 @@ struct torsion_t {
 
 struct ctorsion_t {
     int code;
-    double k;
-    double n;
-    double d;
-    double paths;
+    real_t k;
+    real_t n;
+    real_t d;
+    real_t paths;
 };
 
 struct improper_t {
@@ -105,8 +105,8 @@ struct improper_t {
 
 struct cimproper_t {
     int code;
-    double k;
-    double phi0;
+    real_t k;
+    real_t phi0;
 };
 
 struct charge_t {
@@ -126,11 +126,11 @@ struct atype_t {
 
 struct catype_t {
     int code;
-    double m;
+    real_t m;
     real_t aii_normal;
     real_t bii_normal;
-    // double aii_polar;
-    // double bii_polar;
+    // real_t aii_polar;
+    // real_t bii_polar;
     real_t aii_1_4;
     real_t bii_1_4;
 };
@@ -142,12 +142,12 @@ struct vdw_pair_param_t {
 
 struct topo_t {
     int solvent_type;
-    double exclusion_radius;
-    double solvent_radius;
+    real_t exclusion_radius;
+    real_t solvent_radius;
     coord_t solute_center;
     coord_t solvent_center;
-    double el14_scale;
-    double coulomb_constant;
+    real_t el14_scale;
+    real_t coulomb_constant;
     int vdw_rule;  // 1=geometric, 2=arithmetic
 };
 
@@ -177,14 +177,14 @@ struct q_angcouple_t {
 }; // no use
 
 struct q_cimproper_t {
-    double k;
-    double phi0;
+    real_t k;
+    real_t phi0;
 }; // no use
 
 struct q_elscale_t {
     int qi;
     int qj;
-    double mu;
+    real_t mu;
 };
 
 struct q_exclpair_t {
@@ -211,18 +211,18 @@ struct q_offdiag_t {
     int j;
     int qk;
     int ql;
-    double Aij;
-    double muij;
+    real_t Aij;
+    real_t muij;
 }; // no use
 
 struct q_shake_t {
     int ai;
     int aj;
-    double dist;
+    real_t dist;
 }; // no use
 
 struct q_softcore_t {
-    double s;
+    real_t s;
 }; // no use
 
 struct q_softpair_t {
@@ -243,7 +243,7 @@ struct q_torcouple_t {
 struct restrseq_t {
     int ai;
     int aj;
-    double k;
+    real_t k;
     bool ih;
     int to_center;  // Flag for restraining to geom. or mass center
 };
@@ -258,32 +258,32 @@ struct restrpos_t {
 struct restrdis_t {
     int ai, aj;
     int ipsi;
-    double d1, d2;
-    double k;
+    real_t d1, d2;
+    real_t k;
     char itext[20], jtext[20];
 };
 
 struct restrang_t {
     int ai, aj, ak;
     int ipsi;
-    double ang;
-    double k;
+    real_t ang;
+    real_t k;
 };
 
 struct restrwall_t {
     int ai, aj;
-    double d, k, aMorse, dMorse;
+    real_t d, k, aMorse, dMorse;
     bool ih;
 };
 
 struct shell_t {
     int n_inshell;
-    double theta_corr;
-    double avtheta;
-    double avn_inshell;
-    double router;
-    double dr;
-    double cstb;
+    real_t theta_corr;
+    real_t avtheta;
+    real_t avn_inshell;
+    real_t router;
+    real_t dr;
+    real_t cstb;
 };
 
 /* =============================================
@@ -294,7 +294,7 @@ struct shell_t {
 struct shake_bond_t {
     int ai;
     int aj;
-    double dist2;
+    real_t dist2;
     bool ready;
 };
 
@@ -316,28 +316,28 @@ struct dvel_t {
 };
 
 struct E_bonded_t {
-    double Ubond;
-    double Uangle;
-    double Utor;
-    double Uimp;
+    real_t Ubond;
+    real_t Uangle;
+    real_t Utor;
+    real_t Uimp;
 };
 
 struct E_nonbonded_t {
-    double Ucoul;
-    double Uvdw;
+    real_t Ucoul;
+    real_t Uvdw;
 };
 
 struct E_restraint_t {
-    double Uradx;
-    double Upolx;
-    double Ufix;
-    double Ushell;
-    double Upres;
-    double Urestr;
+    real_t Uradx;
+    real_t Upolx;
+    real_t Ufix;
+    real_t Ushell;
+    real_t Upres;
+    real_t Urestr;
 };
 
 struct energy_t {
-    double Ukin;
-    double Upot;
-    double Utot;
+    real_t Ukin;
+    real_t Upot;
+    real_t Utot;
 };
diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
index 80b790f7..b0978010 100644
--- a/src/core/common/include/precision.h
+++ b/src/core/common/include/precision.h
@@ -4,14 +4,15 @@
 using real_t = float;
 using nonbond_work_t = float;
 using force_accum_t = float;
+using energy_accum_t = float;
+using constraint_work_t = float;
 #else
 using real_t = double;
 using nonbond_work_t = double;
 using force_accum_t = double;
-#endif
-
 using energy_accum_t = double;
 using constraint_work_t = double;
+#endif
 
 #ifdef QDYN_SPFP
 constexpr double k_singular_sin_epsilon = 1.0e-6;
diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp
index 499c01cb..e7c2b8c0 100644
--- a/src/core/common/src/init.cpp
+++ b/src/core/common/src/init.cpp
@@ -38,10 +38,10 @@ void initialize_catype_tables() {
     auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p;
 
     std::vector<catype_t> h_catype_table_all;
-    std::map<std::array<double, 4>, int> catype_to_type_host;
+    std::map<std::array<real_t, 4>, int> catype_to_type_host;
 
     auto add_catype = [&](catype_t catype) -> int {
-        const std::array<double, 4> key = {
+        const std::array<real_t, 4> key = {
             catype.aii_normal,
             catype.bii_normal,
             catype.aii_1_4,
@@ -91,7 +91,7 @@ void initialize_catype_tables() {
     for (int i = 0; i < static_cast<int>(ctx.p_atoms_list->length); i++) {
         const int id = p_atoms_cpu[i];
         const catype_t catype = catypes[atypes[id].code - 1];
-        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        const std::array<real_t, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
         p_catype_types_cpu[i] = catype_to_type_host[key];
     }
 
@@ -109,7 +109,7 @@ void initialize_catype_tables() {
             const int id = q_atoms_cpu[i];
             const atype_t& qat = ctx.q_atypes[q_idx[id] + ctx.n_qatoms * state];
             const catype_t& qcatype = ctx.q_catypes[qat.code - 1];
-            const std::array<double, 4> key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4};
+            const std::array<real_t, 4> key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4};
             q_catype_types_cpu[state * ctx.q_atoms_list->length + i] = catype_to_type_host[key];
         }
     }
@@ -118,7 +118,7 @@ void initialize_catype_tables() {
     for (int i = 0; i < static_cast<int>(ctx.w_atoms_list->length); i++) {
         const int id = w_atoms_cpu[i];
         const catype_t catype = catypes[atypes[id].code - 1];
-        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        const std::array<real_t, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
         w_catype_types_cpu[i] = catype_to_type_host[key];
     }
     printf("Total water atom number: %lu, w_catype_types size: %lu\n", ctx.w_atoms_list->length, w_catype_types_cpu.size());
@@ -141,10 +141,10 @@ void initialize_charge_tables() {
     auto *w_atoms_cpu = ctx.w_atoms_list->cpu_data_p;
     auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p;
 
-    std::map<double, int> charge_to_type_host;
+    std::map<real_t, int> charge_to_type_host;
     std::vector<ccharge_t> h_charge_table_all;
 
-    auto add_charge = [&](double charge) -> int {
+    auto add_charge = [&](real_t charge) -> int {
         if (charge_to_type_host.count(charge) == 0) {
             int sz = static_cast<int>(h_charge_table_all.size());
             ccharge_t new_ccharge = {};
@@ -161,7 +161,7 @@ void initialize_charge_tables() {
     }
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int i = 0; i < ctx.n_qatoms; i++) {
-            double charge = ctx.q_charges[i + ctx.n_qatoms * state].charge;
+            real_t charge = ctx.q_charges[i + ctx.n_qatoms * state].charge;
             add_charge(charge);
             add_charge(charge * lambda_values[state]);
         }
@@ -181,7 +181,7 @@ void initialize_charge_tables() {
     std::vector<int> p_charge_types_cpu(ctx.p_atoms_list->length);
     for (int i = 0; i < static_cast<int>(ctx.p_atoms_list->length); i++) {
         const int id = p_atoms_cpu[i];
-        const double charge = ccharges[charges[id].code - 1].charge;
+        const real_t charge = ccharges[charges[id].code - 1].charge;
         p_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
@@ -197,7 +197,7 @@ void initialize_charge_tables() {
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int i = 0; i < static_cast<int>(ctx.q_atoms_list->length); i++) {
             const int id = q_atoms_cpu[i];
-            const double charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge;
+            const real_t charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge;
             q_charge_types_cpu[state * ctx.q_atoms_list->length + i] = charge_to_type_host[charge];
         }
     }
@@ -205,7 +205,7 @@ void initialize_charge_tables() {
     std::vector<int> w_charge_types_cpu(ctx.w_atoms_list->length);
     for (int i = 0; i < static_cast<int>(ctx.w_atoms_list->length); i++) {
         const int id = w_atoms_cpu[i];
-        const double charge = ccharges[charges[id].code - 1].charge;
+        const real_t charge = ccharges[charges[id].code - 1].charge;
         w_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
@@ -493,8 +493,8 @@ void init_velocities() {
     auto& velocities = ctx.velocities->cpu_data_p;
 
     // If not previous value set, use a Maxwell distribution to fill velocities
-    double kT = Boltz * ctx.md.initial_temperature;
-    double sd, mass;
+    real_t kT = Boltz * ctx.md.initial_temperature;
+    real_t sd, mass;
     for (int i = 0; i < ctx.n_atoms; i++) {
         mass = catypes[atypes[i].code - 1].m;
         sd = sqrt(kT / mass);
@@ -514,7 +514,7 @@ void init_inv_mass() {
     auto& ctx = Context::instance();
     auto& atypes = ctx.atypes->cpu_data_p;
     auto& catypes = ctx.catypes->cpu_data_p;
-    ctx.winv = std::make_unique<HostDeviceBuffer<double>>(ctx.n_atoms, true, ctx.run_gpu);
+    ctx.winv = std::make_unique<HostDeviceBuffer<real_t>>(ctx.n_atoms, true, ctx.run_gpu);
     auto* winv = ctx.winv->cpu_data_p;
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
         winv[ai] = 1 / catypes[atypes[ai].code - 1].m;
@@ -539,7 +539,7 @@ void init_water_sphere() {
 void init_wshells() {
     auto& ctx = Context::instance();
     int n_inshell;
-    double drs, router, ri, dr, Vshell, rshell;
+    real_t drs, router, ri, dr, Vshell, rshell;
     auto& bonds = ctx.bonds->cpu_data_p;
     auto& cbonds = ctx.cbonds->cpu_data_p;
     auto& angles = ctx.angles->cpu_data_p;
@@ -547,8 +547,8 @@ void init_wshells() {
     // Get water properties from the first water molecule.
     cbond_t cbondw = cbonds[bonds[ctx.n_atoms_solute].code - 1];
     cangle_t canglew = cangles[angles[ctx.n_atoms_solute].code - 1];
-    const double crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge;
-    const double mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2);
+    const real_t crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge;
+    const real_t mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2);
 
     drs = wpolr_layer / drouter;
 
@@ -605,7 +605,7 @@ void init_pshells() {
     auto& catypes = ctx.catypes->cpu_data_p;
     auto& coords_init = ctx.coords_init->cpu_data_p;
     auto* excluded = ctx.excluded->cpu_data_p;
-    double mass, r2, rin2;
+    real_t mass, r2, rin2;
 
     ctx.heavy = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
     auto* heavy = ctx.heavy->cpu_data_p;
@@ -655,7 +655,7 @@ static int mark_heavy_atoms(Context& ctx) {
     auto* heavy = ctx.heavy->cpu_data_p;
     int n_heavy = 0;
     for (int i = 0; i < ctx.n_atoms; i++) {
-        double mass = catypes[atypes[i].code - 1].m;
+        real_t mass = catypes[atypes[i].code - 1].m;
         if (mass < 4.0) {
             heavy[i] = false;
         } else {
@@ -681,7 +681,7 @@ void init_pshells_from_charge_groups() {
     auto& ctx = Context::instance();
     auto& coords_init = ctx.coords_init->cpu_data_p;
     auto* excluded = ctx.excluded->cpu_data_p;
-    double r2, rin2;
+    real_t r2, rin2;
     auto& charge_groups = ctx.charge_group_config;
     const bool use_switch_atom = charge_groups.iuse_switch_atom == 1;
 
@@ -697,9 +697,9 @@ void init_pshells_from_charge_groups() {
         const auto& charge_group = charge_groups.charge_groups[grp];
         int i = charge_group.iswitch - 1;
         if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) {
-            double cx = coords_init[i].x;
-            double cy = coords_init[i].y;
-            double cz = coords_init[i].z;
+            real_t cx = coords_init[i].x;
+            real_t cy = coords_init[i].y;
+            real_t cz = coords_init[i].z;
             if (!use_switch_atom) {
                 cx = 0.0;
                 cy = 0.0;
@@ -710,7 +710,7 @@ void init_pshells_from_charge_groups() {
                     cy += coords_init[ai].y;
                     cz += coords_init[ai].z;
                 }
-                double inv_atoms = 1.0 / static_cast<double>(charge_group.atoms.size());
+                real_t inv_atoms = 1.0 / static_cast<real_t>(charge_group.atoms.size());
                 cx *= inv_atoms;
                 cy *= inv_atoms;
                 cz *= inv_atoms;
@@ -748,7 +748,7 @@ void init_shake() {
     int mol = 0;
     int shake;
     int n_solute_shake_constraints = 0;
-    double excl_shake = 0;
+    real_t excl_shake = 0;
     auto& bonds = ctx.bonds->cpu_data_p;
     auto& cbonds = ctx.cbonds->cpu_data_p;
 
@@ -808,10 +808,10 @@ void init_shake() {
     ctx.Ndegf = 3 * ctx.n_atoms - ctx.n_shake_constraints;
     ctx.Ndegfree = ctx.Ndegf - 3 * ctx.n_excluded + excl_shake;
 
-    const double Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints;
+    const real_t Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints;
 
-    const double Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints);
-    const double Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent;
+    const real_t Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints);
+    const real_t Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent;
 
     printf("n_shake_constrains = %d, n_solute_shake_constraints = %d, excl_shake = %f\n", ctx.n_shake_constraints, n_solute_shake_constraints, excl_shake);
 
diff --git a/src/core/common/src/parse.cpp b/src/core/common/src/parse.cpp
index 98e859ae..1b45a7a6 100644
--- a/src/core/common/src/parse.cpp
+++ b/src/core/common/src/parse.cpp
@@ -132,7 +132,7 @@ void parse_md(const char* filename) {
 #ifdef VERBOSE
     printf("reading in %d lambdas (%s in file)\n", ctx.n_lambdas, file.buffer[k][1]);
 #endif
-    ctx.lambdas = std::make_unique<HostDeviceBuffer<double>>(ctx.n_lambdas, true, ctx.run_gpu);
+    ctx.lambdas = std::make_unique<HostDeviceBuffer<real_t>>(ctx.n_lambdas, true, ctx.run_gpu);
     auto *lambdas = ctx.lambdas->cpu_data_p;
     k++;
     for (int i = 0; i < ctx.n_lambdas; i++) {
diff --git a/src/core/cpu/include/cpu_angle_force.h b/src/core/cpu/include/cpu_angle_force.h
index df2a3a64..ea4f5ef6 100644
--- a/src/core/cpu/include/cpu_angle_force.h
+++ b/src/core/cpu/include/cpu_angle_force.h
@@ -1,2 +1,5 @@
 #pragma once
-double calc_angle_forces(int start, int end);
\ No newline at end of file
+
+#include "common/include/precision.h"
+
+real_t calc_angle_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_bond_force.h b/src/core/cpu/include/cpu_bond_force.h
index 6a2f7f73..32775c6e 100644
--- a/src/core/cpu/include/cpu_bond_force.h
+++ b/src/core/cpu/include/cpu_bond_force.h
@@ -1,3 +1,5 @@
 #pragma once
 
-double calc_bond_forces(int start, int end);
+#include "common/include/precision.h"
+
+real_t calc_bond_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_improper2_force.h b/src/core/cpu/include/cpu_improper2_force.h
index 26d694aa..b6606e57 100644
--- a/src/core/cpu/include/cpu_improper2_force.h
+++ b/src/core/cpu/include/cpu_improper2_force.h
@@ -1,3 +1,5 @@
 #pragma once
 
-double calc_improper2_forces(int start, int end);
+#include "common/include/precision.h"
+
+real_t calc_improper2_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_torsion_force.h b/src/core/cpu/include/cpu_torsion_force.h
index 19089318..309bd505 100644
--- a/src/core/cpu/include/cpu_torsion_force.h
+++ b/src/core/cpu/include/cpu_torsion_force.h
@@ -1,3 +1,5 @@
 #pragma once
 
-double calc_torsion_forces(int start, int end);
+#include "common/include/precision.h"
+
+real_t calc_torsion_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_utils.h b/src/core/cpu/include/cpu_utils.h
index e7be4557..352d6b3c 100644
--- a/src/core/cpu/include/cpu_utils.h
+++ b/src/core/cpu/include/cpu_utils.h
@@ -1,5 +1,7 @@
 #pragma once
 
-double gauss(double mean, double sd);
-double to_degrees(double radians);
-double to_radians(double degrees);
+#include "common/include/precision.h"
+
+real_t gauss(real_t mean, real_t sd);
+real_t to_degrees(real_t radians);
+real_t to_radians(real_t degrees);
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp
index ae600561..1f19390f 100644
--- a/src/core/cpu/src/cpu_angle_force.cpp
+++ b/src/core/cpu/src/cpu_angle_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-double calc_angle_forces(int start, int end) {
+real_t calc_angle_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
@@ -15,11 +15,11 @@ double calc_angle_forces(int start, int end) {
     coord_t rji, rjk;
     coord_t di, dk;
 
-    double bji2inv, bjk2inv, bjiinv, bjkinv;
+    real_t bji2inv, bjk2inv, bjiinv, bjkinv;
     cangle_t cangle;
-    double cos_th, th, dth, dv, f1;
-    double ener;
-    double angle = 0;
+    real_t cos_th, th, dth, dv, f1;
+    real_t ener;
+    real_t angle = 0;
 
     auto &angles = ctx.angles->cpu_data_p;
     auto &cangles = ctx.cangles->cpu_data_p;
diff --git a/src/core/cpu/src/cpu_bond_force.cpp b/src/core/cpu/src/cpu_bond_force.cpp
index 2a539f90..0ab4baff 100644
--- a/src/core/cpu/src/cpu_bond_force.cpp
+++ b/src/core/cpu/src/cpu_bond_force.cpp
@@ -4,7 +4,7 @@
 
 #include "context.h"
 
-double calc_bond_forces(int start, int end) {
+real_t calc_bond_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &bonds = ctx.bonds->cpu_data_p;
     auto &cbonds = ctx.cbonds->cpu_data_p;
@@ -13,8 +13,8 @@ double calc_bond_forces(int start, int end) {
     int aii, aji;
     coord_t ai, aj, dx;
     cbond_t cbond;
-    double dx2, dx1, ddx, ener, ampl;
-    double bond = 0;
+    real_t dx2, dx1, ddx, ener, ampl;
+    real_t bond = 0;
 
     for (int i = start; i < end; i++) {
         aii = bonds[i].ai - 1;
diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp
index 6e4faa60..fea7c724 100644
--- a/src/core/cpu/src/cpu_improper2_force.cpp
+++ b/src/core/cpu/src/cpu_improper2_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-double calc_improper2_forces(int start, int end) {
+real_t calc_improper2_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &impropers = ctx.impropers->cpu_data_p;
     auto &cimpropers = ctx.cimpropers->cpu_data_p;
@@ -15,13 +15,13 @@ double calc_improper2_forces(int start, int end) {
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi, arg, ener, dv, f1;
+    real_t bj2inv, bk2inv, bjinv, bkinv;
+    real_t cos_phi, phi, arg, ener, dv, f1;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
     improper_t imp;
     cimproper_t cimp;
-    double improper = 0;
+    real_t improper = 0;
 
     for (int i = start; i < end; i++) {
         imp = impropers[i];
diff --git a/src/core/cpu/src/cpu_leapfrog.cpp b/src/core/cpu/src/cpu_leapfrog.cpp
index 9d1ff43a..0927e414 100644
--- a/src/core/cpu/src/cpu_leapfrog.cpp
+++ b/src/core/cpu/src/cpu_leapfrog.cpp
@@ -11,8 +11,8 @@ void calc_leapfrog() {
     auto &velocities = ctx.velocities->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
     auto *xcoords = ctx.xcoords->cpu_data_p;
-    double mass_i;
-    double winv_i;
+    real_t mass_i;
+    real_t winv_i;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
         mass_i = catypes[atypes[i].code - 1].m;
diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
index 390c67eb..cbeb11f5 100644
--- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
@@ -13,7 +13,7 @@ void calc_nonbonded_pp_forces() {
     auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     auto *excluded = ctx.excluded->cpu_data_p;
     bool bond14, bond23;
-    double scaling;
+    real_t scaling;
     coord_t da;
     real_t r2a, ra, r6a;
     real_t V_a, V_b;
@@ -67,8 +67,8 @@ void calc_nonbonded_pp_forces() {
             dvelocities[j].y += dva * da.y;
             dvelocities[j].z += dva * da.z;
 
-            ctx.E_nonbond_pp.Ucoul += static_cast<double>(Vela);
-            ctx.E_nonbond_pp.Uvdw += static_cast<double>(V_a - V_b);
+            ctx.E_nonbond_pp.Ucoul += static_cast<real_t>(Vela);
+            ctx.E_nonbond_pp.Uvdw += static_cast<real_t>(V_a - V_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
index 030c1290..52c9242b 100644
--- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
@@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() {
             dvelocities[atom_j].y += scale * dy;
             dvelocities[atom_j].z += scale * dz;
 
-            ctx.E_nonbond_pw.Ucoul += static_cast<double>(ecoul);
-            ctx.E_nonbond_pw.Uvdw += static_cast<double>(v_a - v_b);
+            ctx.E_nonbond_pw.Ucoul += static_cast<real_t>(ecoul);
+            ctx.E_nonbond_pw.Uvdw += static_cast<real_t>(v_a - v_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
index 7a81a516..b0df677d 100644
--- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
@@ -18,7 +18,7 @@ void calc_nonbonded_qp_forces() {
     real_t r2, r;
     real_t ai_aii, aj_aii, ai_bii, aj_bii;
     bool bond23, bond14;
-    double scaling;
+    real_t scaling;
     real_t Vel, V_a, V_b, dv;
 
     for (int qi = 0; qi < ctx.n_qatoms; qi++) {
@@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() {
                 dvelocities[j].z += dv * da.z;
 
                 // Update Q totals
-                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<double>(Vel);
-                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<real_t>(Vel);
+                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<real_t>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
index 006a3c0e..96462795 100644
--- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
@@ -16,7 +16,7 @@ void calc_nonbonded_qq_forces() {
     auto *q_elscales = ctx.q_elscales->cpu_data_p;
     int ai, aj;
     real_t crg_i, crg_j;
-    double elscale, scaling;
+    real_t elscale, scaling;
     bool bond23, bond14;
     coord_t da;
     real_t r2a, ra, r6a;
@@ -81,8 +81,8 @@ void calc_nonbonded_qq_forces() {
                 dvelocities[aj].y += dva * da.y;
                 dvelocities[aj].z += dva * da.z;
 
-                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<double>(Vela);
-                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<real_t>(Vela);
+                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<real_t>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
index 8d18bc55..1ab0b469 100644
--- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
@@ -75,8 +75,8 @@ void calc_nonbonded_qw_forces() {
                 dvH1 -= r2H1 * VelH1 * lambda;
                 dvH2 -= r2H2 * VelH2 * lambda;
 
-                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<double>(VelO + VelH1 + VelH2);
-                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<real_t>(VelO + VelH1 + VelH2);
+                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<real_t>(V_a - V_b);
             }
 
             // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
index 3be5e6f0..f6d2ac98 100644
--- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
@@ -59,8 +59,8 @@ void accumulate_pair_force(Context& ctx,
     dvelocities[atom_j].y += scale * dy;
     dvelocities[atom_j].z += scale * dz;
 
-    energy.Ucoul += static_cast<double>(ecoul);
-    energy.Uvdw += static_cast<double>(evdw);
+    energy.Ucoul += static_cast<real_t>(ecoul);
+    energy.Uvdw += static_cast<real_t>(evdw);
 }
 
 void calc_nonbonded_ww_forces() {
diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp
index 5116dbbb..440d03d2 100644
--- a/src/core/cpu/src/cpu_polx_water_force.cpp
+++ b/src/core/cpu/src/cpu_polx_water_force.cpp
@@ -13,12 +13,12 @@ void calc_polx_w_forces(int iteration) {
     auto *wshells = ctx.wshells->cpu_data_p;
 
     int wi, imin, jw, ii, iis, jmin;
-    double tmin;
+    real_t tmin;
     coord_t rmu, rcu, f1O, f1H1, f1H2, f2;
-    double rm, rc;
-    double cos_th;
-    double avtdum, arg, f0, dv;
-    double ener;
+    real_t rm, rc;
+    real_t cos_th;
+    real_t avtdum, arg, f0, dv;
+    real_t ener;
 
     for (int is = 0; is < ctx.n_shells; is++) {
         wshells[is].n_inshell = 0;
@@ -93,8 +93,8 @@ void calc_polx_w_forces(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            wshells[is].avtheta /= (double)itdis_update;
-            wshells[is].avn_inshell /= (double)itdis_update;
+            wshells[is].avtheta /= (real_t)itdis_update;
+            wshells[is].avn_inshell /= (real_t)itdis_update;
             wshells[is].theta_corr =
                 wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
@@ -113,7 +113,7 @@ void calc_polx_w_forces(int iteration) {
         avtdum = 0;
         for (int il = 0; il < wshells[is].n_inshell; il++) {
             ii = ctx.nsort[il][is];
-            arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell);
+            arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell);
             ctx.theta0[il] = acos(arg);
             ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * wshells[is].cstb / 2;
             if (ctx.theta0[il] < 0) {
@@ -189,7 +189,7 @@ void calc_polx_w_forces(int iteration) {
             dvelocities[wi + 2].z += f0 * f1H2.z;
         }
 
-        wshells[is].avtheta += avtdum / (double)wshells[is].n_inshell;
+        wshells[is].avtheta += avtdum / (real_t)wshells[is].n_inshell;
         wshells[is].avn_inshell += wshells[is].n_inshell;
     }
 }
diff --git a/src/core/cpu/src/cpu_pshell_force.cpp b/src/core/cpu/src/cpu_pshell_force.cpp
index 9ff083cc..a547f16d 100644
--- a/src/core/cpu/src/cpu_pshell_force.cpp
+++ b/src/core/cpu/src/cpu_pshell_force.cpp
@@ -13,7 +13,7 @@ void calc_pshell_forces() {
     auto *shell = ctx.shell->cpu_data_p;
 
     coord_t dr;
-    double k, r2, ener;
+    real_t k, r2, ener;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
         if (shell[i] || excluded[i]) {
diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp
index 14aa802c..c92c904d 100644
--- a/src/core/cpu/src/cpu_q_angle_force.cpp
+++ b/src/core/cpu/src/cpu_q_angle_force.cpp
@@ -14,8 +14,8 @@ void calc_qangle_forces(int state) {
     int ic;
     int ai, aj, ak;
     coord_t rji, rjk;
-    double bji, bjk;
-    double cos_th, th, dth, ener, dv, f1;
+    real_t bji, bjk;
+    real_t cos_th, th, dth, ener, dv, f1;
     coord_t di, dk;
 
     for (int i = 0; i < ctx.n_qangles; i++) {
diff --git a/src/core/cpu/src/cpu_q_bond_force.cpp b/src/core/cpu/src/cpu_q_bond_force.cpp
index 5f2f7203..6b924c69 100644
--- a/src/core/cpu/src/cpu_q_bond_force.cpp
+++ b/src/core/cpu/src/cpu_q_bond_force.cpp
@@ -11,7 +11,7 @@ void calc_qbond_forces(int state) {
     auto *lambdas = ctx.lambdas->cpu_data_p;
     int ic;
     int ai, aj;
-    double b, db, ener, dv;
+    real_t b, db, ener, dv;
     coord_t rij;
 
     for (int i = 0; i < ctx.n_qbonds; i++) {
diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp
index 7b7fb271..2be495b0 100644
--- a/src/core/cpu/src/cpu_q_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_q_torsion_force.cpp
@@ -15,10 +15,10 @@ void calc_qtorsion_forces(int state) {
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double bj, bk, cos_phi, phi;
-    double arg, dv, f1;
-    double ener;
+    real_t bj2inv, bk2inv, bjinv, bkinv;
+    real_t bj, bk, cos_phi, phi;
+    real_t arg, dv, f1;
+    real_t ener;
 
     for (int i = 0; i < ctx.n_qtorsions; i++) {
         ic = ctx.q_torsions[i + ctx.n_qtorsions * state].code;
diff --git a/src/core/cpu/src/cpu_radix_water_force.cpp b/src/core/cpu/src/cpu_radix_water_force.cpp
index a887ad31..a85af35c 100644
--- a/src/core/cpu/src/cpu_radix_water_force.cpp
+++ b/src/core/cpu/src/cpu_radix_water_force.cpp
@@ -10,9 +10,9 @@ void calc_radix_w_forces() {
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
 
-    double b, db, ener, dv, fexp;
+    real_t b, db, ener, dv, fexp;
     coord_t dr;
-    double shift;
+    real_t shift;
 
     if (ctx.md.radial_force != 0) {
         shift = sqrt(Boltz * ctx.Tfree / ctx.md.radial_force);
diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp
index 84f593b0..c2b9ed50 100644
--- a/src/core/cpu/src/cpu_restrang_force.cpp
+++ b/src/core/cpu/src/cpu_restrang_force.cpp
@@ -15,8 +15,8 @@ void calc_restrang_forces() {
 
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
-    double lambda, r2ij, r2jk, rij, rjk, cos_th, th;
-    double dth, dv, ener, f1;
+    real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th;
+    real_t dth, dv, ener, f1;
 
     for (int ir = 0; ir < ctx.n_restrangs; ir++) {
         state = restrangs[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrdis_force.cpp b/src/core/cpu/src/cpu_restrdis_force.cpp
index c15cbef7..859481f3 100644
--- a/src/core/cpu/src/cpu_restrdis_force.cpp
+++ b/src/core/cpu/src/cpu_restrdis_force.cpp
@@ -14,7 +14,7 @@ void calc_restrdis_forces() {
 
     int state, i, j;
     coord_t dr;
-    double lambda, b, db, dv, ener;
+    real_t lambda, b, db, dv, ener;
 
     for (int ir = 0; ir < ctx.n_restrdists; ir++) {
         state = restrdists[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrpos_force.cpp b/src/core/cpu/src/cpu_restrpos_force.cpp
index 6db044b4..a3e8710d 100644
--- a/src/core/cpu/src/cpu_restrpos_force.cpp
+++ b/src/core/cpu/src/cpu_restrpos_force.cpp
@@ -14,7 +14,7 @@ void calc_restrpos_forces() {
 
     int state, i;
     coord_t dr;
-    double lambda, ener, x2, y2, z2;
+    real_t lambda, ener, x2, y2, z2;
 
     for (int ir = 0; ir < ctx.n_restrspos; ir++) {
         state = restrspos[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrseq_force.cpp b/src/core/cpu/src/cpu_restrseq_force.cpp
index 296762e8..f9ff9fd0 100644
--- a/src/core/cpu/src/cpu_restrseq_force.cpp
+++ b/src/core/cpu/src/cpu_restrseq_force.cpp
@@ -13,9 +13,9 @@ void calc_restrseq_forces() {
     auto &restrseqs = ctx.restrseqs->cpu_data_p;
     auto *heavy = ctx.heavy->cpu_data_p;
 
-    double k, mass, totmass;
+    real_t k, mass, totmass;
     coord_t dr;
-    double r2, ener;
+    real_t r2, ener;
 
     for (int s = 0; s < ctx.n_restrseqs; s++) {
         k = restrseqs[s].k;
diff --git a/src/core/cpu/src/cpu_restrwall_force.cpp b/src/core/cpu/src/cpu_restrwall_force.cpp
index fd49749a..7da6faa6 100644
--- a/src/core/cpu/src/cpu_restrwall_force.cpp
+++ b/src/core/cpu/src/cpu_restrwall_force.cpp
@@ -11,7 +11,7 @@ void calc_restrwall_forces() {
     auto &restrwalls = ctx.restrwalls->cpu_data_p;
     auto *heavy = ctx.heavy->cpu_data_p;
 
-    double k, b, db, ener, dv, fexp;
+    real_t k, b, db, ener, dv, fexp;
     coord_t dr;
 
     for (int ir = 0; ir < ctx.n_restrwalls; ir++) {
diff --git a/src/core/cpu/src/cpu_shake.cpp b/src/core/cpu/src/cpu_shake.cpp
index cb29a0f0..91162c98 100644
--- a/src/core/cpu/src/cpu_shake.cpp
+++ b/src/core/cpu/src/cpu_shake.cpp
@@ -34,7 +34,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                     const int aj = shake_bond.aj - 1;
                     coord_t xij;
                     coord_t xxij;
-                    double xij2, diff, corr, scp;
+                    real_t xij2, diff, corr, scp;
 
                     xij.x = coords[ai].x - coords[aj].x;
                     xij.y = coords[ai].y - coords[aj].y;
@@ -75,7 +75,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                 const int ai = shake_bonds[shake + i].ai - 1;
                 const int aj = shake_bonds[shake + i].aj - 1;
                 coord_t xxij;
-                double xxij2;
+                real_t xxij2;
 
                 xxij.x = xcoords[ai].x - xcoords[aj].x;
                 xxij.y = xcoords[ai].y - xcoords[aj].y;
@@ -125,11 +125,11 @@ void stop_cm_translation() {
     auto &atypes = ctx.atypes->cpu_data_p;
     auto &catypes = ctx.catypes->cpu_data_p;
     auto &velocities = ctx.velocities->cpu_data_p;
-    double total_mass = 0;
+    real_t total_mass = 0;
     coord_t vcm = {};
 
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
-        const double rmass = catypes[atypes[ai].code - 1].m;
+        const real_t rmass = catypes[atypes[ai].code - 1].m;
         total_mass += rmass;
         vcm.x += velocities[ai].x * rmass;
-        vcm.y += velocities[ai].y;
+        vcm.y += velocities[ai].y * rmass;
diff --git a/src/core/cpu/src/cpu_temperature.cpp b/src/core/cpu/src/cpu_temperature.cpp
index 6b76139f..537dec77 100644
--- a/src/core/cpu/src/cpu_temperature.cpp
+++ b/src/core/cpu/src/cpu_temperature.cpp
@@ -17,11 +17,11 @@ void calc_temperature() {
     auto *excluded = ctx.excluded->cpu_data_p;
     ctx.Temp = 0;
     ctx.Tfree = 0;
-    double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
-    double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
-    double Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms;
-    double ener;
-    double mass_i;
+    real_t Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
+    real_t Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
+    real_t Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms;
+    real_t ener;
+    real_t mass_i;
 
     ctx.Temp = 0;
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp
index 4ebb44b2..37a68298 100644
--- a/src/core/cpu/src/cpu_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_torsion_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-double calc_torsion_forces(int start, int end) {
+real_t calc_torsion_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &torsions = ctx.torsions->cpu_data_p;
     auto &ctorsions = ctx.ctorsions->cpu_data_p;
@@ -17,11 +17,11 @@ double calc_torsion_forces(int start, int end) {
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi;
-    double arg, dv, f1;
-    double ener;
-    double torsion = 0;
+    real_t bj2inv, bk2inv, bjinv, bkinv;
+    real_t cos_phi, phi;
+    real_t arg, dv, f1;
+    real_t ener;
+    real_t torsion = 0;
 
     torsion_t t;
     ctorsion_t ctors;
diff --git a/src/core/cpu/src/utils.cpp b/src/core/cpu/src/utils.cpp
index ed680aa3..00c37e41 100644
--- a/src/core/cpu/src/utils.cpp
+++ b/src/core/cpu/src/utils.cpp
@@ -1,24 +1,25 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "common/include/precision.h"
+
 // Get a value from a gaussian distributed random variable with
 // mean mean and standard deviation sd
-double gauss(double mean, double sd) {
-    double v1, v2, nd10;
+real_t gauss(real_t mean, real_t sd) {
+    real_t v1, v2, nd10;
 
-    v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
-    v2 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
+    v1 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. );
+    v2 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. );
     nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1));
 
     return sd * nd10 + mean;
 }
 
 
-double to_degrees(double radians) {
+real_t to_degrees(real_t radians) {
     return radians * (180.0 / M_PI);
 }
 
-double to_radians(double degrees) {
+real_t to_radians(real_t degrees) {
     return degrees * (M_PI / 180.0);
 }
-
diff --git a/src/core/cuda/include/cuda_angle_force.cuh b/src/core/cuda/include/cuda_angle_force.cuh
index c2e00e15..63ebb011 100644
--- a/src/core/cuda/include/cuda_angle_force.cuh
+++ b/src/core/cuda/include/cuda_angle_force.cuh
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "common/include/precision.h"
+
 void init_angle_force_kernel_data();
-double calc_angle_forces_host(int start, int end);
+real_t calc_angle_forces_host(int start, int end);
 void cleanup_angle_force();
diff --git a/src/core/cuda/include/cuda_bond_force.cuh b/src/core/cuda/include/cuda_bond_force.cuh
index 83961ed5..bddc873c 100644
--- a/src/core/cuda/include/cuda_bond_force.cuh
+++ b/src/core/cuda/include/cuda_bond_force.cuh
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "common/include/precision.h"
+
 void init_bond_force_kernel_data();
-double calc_bond_forces_host(int start, int end);
+real_t calc_bond_forces_host(int start, int end);
 void cleanup_bond_force();
diff --git a/src/core/cuda/include/cuda_improper2_force.cuh b/src/core/cuda/include/cuda_improper2_force.cuh
index cb0a9635..9e0a2cfd 100644
--- a/src/core/cuda/include/cuda_improper2_force.cuh
+++ b/src/core/cuda/include/cuda_improper2_force.cuh
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "common/include/precision.h"
+
 void init_improper2_force_kernel_data();
-double calc_improper2_forces_host(int start, int end);
+real_t calc_improper2_forces_host(int start, int end);
 void cleanup_improper2_force();
diff --git a/src/core/cuda/include/cuda_nonbonded_force.cuh b/src/core/cuda/include/cuda_nonbonded_force.cuh
index f1a9b252..ee227088 100644
--- a/src/core/cuda/include/cuda_nonbonded_force.cuh
+++ b/src/core/cuda/include/cuda_nonbonded_force.cuh
@@ -1,8 +1,12 @@
 #pragma once
 
+#include <utility>
+
+#include "common/include/precision.h"
+
 void init_nonbonded_force_kernel_data();
 
-std::pair<double, double> calc_nonbonded_force_host(
+std::pair<real_t, real_t> calc_nonbonded_force_host(
     int nx, 
     int ny, 
     int* x_idx_list, 
@@ -14,7 +18,7 @@ std::pair<double, double> calc_nonbonded_force_host(
     const int* x_atypes_types, 
     const int* y_atypes_types,
     const bool disable_water_h_lj = false,
-    const double lambda = 1.0
+    const real_t lambda = 1.0
 );
 
 void cleanup_nonbonded_force();
diff --git a/src/core/cuda/include/cuda_torsion_force.cuh b/src/core/cuda/include/cuda_torsion_force.cuh
index 50315181..cac7e191 100644
--- a/src/core/cuda/include/cuda_torsion_force.cuh
+++ b/src/core/cuda/include/cuda_torsion_force.cuh
@@ -1,6 +1,8 @@
 #pragma once
 
+#include "common/include/precision.h"
+
 void init_torsion_force_kernel_data();
-double calc_torsion_forces_host(int start, int end);
+real_t calc_torsion_forces_host(int start, int end);
 
 void cleanup_torsion_force();
diff --git a/src/core/cuda/include/cuda_utility.cuh b/src/core/cuda/include/cuda_utility.cuh
index 36767be0..9cbcefd5 100644
--- a/src/core/cuda/include/cuda_utility.cuh
+++ b/src/core/cuda/include/cuda_utility.cuh
@@ -3,7 +3,8 @@
 #include <math.h>
 
 #include "common/include/cuda_runtime_utility.h"
+#include "common/include/precision.h"
 
-__device__ inline double to_radians_device(double degrees) {
-    return degrees * (M_PI / 180.0);
+__device__ inline real_t to_radians_device(real_t degrees) {
+    return degrees * static_cast<real_t>(M_PI / 180.0);
 }
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index f20b039a..445bed51 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaAngleForce {
 bool is_initialized = false;
-double* d_energy_sum;
+real_t* d_energy_sum;
 }  // namespace CudaAngleForce
 
-__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* energy_sum) {
+__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, real_t* energy_sum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (idx >= end) return;
 
@@ -24,21 +24,22 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     coord_t rji = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
     coord_t rjk = {rk.x - rj.x, rk.y - rj.y, rk.z - rj.z};
 
-    double rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z);
-    double rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z);
+    real_t rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z);
+    real_t rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z);
 
-    double cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length);
+    real_t cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length);
 
-    cos_theta = fmax(fmin(cos_theta, 1.0), -1.0);  // Clamp value to avoid NaNs
-    double theta = acos(cos_theta);
+    cos_theta = cos_theta > static_cast<real_t>(1.0) ? static_cast<real_t>(1.0) : cos_theta;
+    cos_theta = cos_theta < static_cast<real_t>(-1.0) ? static_cast<real_t>(-1.0) : cos_theta;
+    real_t theta = acos(cos_theta);
 
-    double dtheta = theta - to_radians_device(cang.th0);
-    double energy = 0.5 * cang.kth * dtheta * dtheta;
+    real_t dtheta = theta - to_radians_device(cang.th0);
+    real_t energy = static_cast<real_t>(0.5) * cang.kth * dtheta * dtheta;
 
     // calculate force magnitude
-    double dv = cang.kth * dtheta;
+    real_t dv = cang.kth * dtheta;
 
-    double f1 = sin(theta);
+    real_t f1 = sin(theta);
     if (fabs(f1) < k_singular_sin_epsilon) {
         f1 = -1.0 / k_singular_sin_epsilon;
     } else {
@@ -70,7 +71,7 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     atomicAdd(&dvelocities[j].z, -dv * (di.z + dk.z));
 }
 
-double calc_angle_forces_host(int start, int end) {
+real_t calc_angle_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaAngleForce;
@@ -85,8 +86,8 @@ double calc_angle_forces_host(int start, int end) {
     // todo: now have to do that, after moving all to CudaContext, can remove it
     // ctx.sync_all_to_device();
 
-    double h_energy_sum = 0.0;
-    cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice);
+    real_t h_energy_sum = 0.0;
+    cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(real_t), cudaMemcpyHostToDevice);
 
     // launch kernel
     calc_angle_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_angles, d_coords, d_cangles, d_dvelocities, d_energy_sum);
@@ -94,14 +95,14 @@ double calc_angle_forces_host(int start, int end) {
 
     // todo: Now have to do that, after moving all to CudaContext, can remove it
     // copy results back to host
-    cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
     return h_energy_sum;
 }
 
 void init_angle_force_kernel_data() {
     using namespace CudaAngleForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_bond_force.cu b/src/core/cuda/src/cuda_bond_force.cu
index 9b31a660..476d7209 100644
--- a/src/core/cuda/src/cuda_bond_force.cu
+++ b/src/core/cuda/src/cuda_bond_force.cu
@@ -3,9 +3,9 @@
 #include "cuda_utility.cuh"
 namespace CudaBondForce {
 bool is_initialized = false;
-double* d_energy_sum;
+real_t* d_energy_sum;
 }  // namespace CudaBondForce
-__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) {
+__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, real_t* energy_sum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (idx >= end) return;
     bond_t bond = bonds[idx];
@@ -13,18 +13,18 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord
     coord_t rj = coords[bond.aj - 1];
     cbond_t cbond = cbonds[bond.code - 1];
 
-    double dx = rj.x - ri.x;
-    double dy = rj.y - ri.y;
-    double dz = rj.z - ri.z;
-    double r = sqrt(dx * dx + dy * dy + dz * dz);
+    real_t dx = rj.x - ri.x;
+    real_t dy = rj.y - ri.y;
+    real_t dz = rj.z - ri.z;
+    real_t r = sqrt(dx * dx + dy * dy + dz * dz);
 
-    double dr = r - cbond.b0;
-    double energy = 0.5 * cbond.kb * dr * dr;
+    real_t dr = r - cbond.b0;
+    real_t energy = static_cast<real_t>(0.5) * cbond.kb * dr * dr;
 
     atomicAdd(energy_sum, energy);
 
     // update forces
-    double f = cbond.kb * dr / r;
+    real_t f = cbond.kb * dr / r;
     atomicAdd(&dvelocities[bond.aj - 1].x, f * dx);
     atomicAdd(&dvelocities[bond.aj - 1].y, f * dy);
     atomicAdd(&dvelocities[bond.aj - 1].z, f * dz);
@@ -33,15 +33,15 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord
     atomicAdd(&dvelocities[bond.ai - 1].z, -f * dz);
 }
 
-double calc_bond_forces_host(int start, int end) {
+real_t calc_bond_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaBondForce;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    double energy = 0.0;
-    cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
+    real_t energy = 0.0;
+    cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     bond_t* d_bonds = host_ctx.bonds->gpu_data_p;
@@ -51,7 +51,7 @@ double calc_bond_forces_host(int start, int end) {
 
     calc_bond_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
 
     return energy;
 }
@@ -59,7 +59,7 @@ double calc_bond_forces_host(int start, int end) {
 void init_bond_force_kernel_data() {
     using namespace CudaBondForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index dd7d91aa..f0f790db 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaImproper2Force {
 bool is_initialized = false;
-double* d_energy_sum;
+real_t* d_energy_sum;
 }  // namespace CudaImproper2Force
 
-__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) {
+__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) {
     int i = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (i >= end) return;
 
@@ -15,8 +15,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi, arg, ener, dv, f1;
+    real_t bj2inv, bk2inv, bjinv, bkinv;
+    real_t cos_phi, phi, arg, ener, dv, f1;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
     improper_t imp;
@@ -124,15 +124,15 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
     atomicAdd(&dvelocities[ali].z, dv * dpl.z);
 }
 
-double calc_improper2_forces_host(int start, int end) {
+real_t calc_improper2_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaImproper2Force;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    double energy = 0.0;
-    cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
+    real_t energy = 0.0;
+    cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     coord_t* d_coords = host_ctx.coords->gpu_data_p;
@@ -142,14 +142,14 @@ double calc_improper2_forces_host(int start, int end) {
 
     calc_improper2_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
     return energy;
 }
 
 void init_improper2_force_kernel_data() {
     using namespace CudaImproper2Force;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu
index 1e010f7e..2ac8245a 100644
--- a/src/core/cuda/src/cuda_leapfrog.cu
+++ b/src/core/cuda/src/cuda_leapfrog.cu
@@ -18,20 +18,20 @@ __global__ void calc_leapfrog_kernel(
     coord_t* xcoords,
     int n_atoms,
     int n_atoms_solute,
-    double Tscale_solute,
-    double Tscale_solvent,
-    double dt) {
+    real_t Tscale_solute,
+    real_t Tscale_solvent,
+    real_t dt) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
     int i = idx;
 
     // Kernel implementation goes here
-    double mass_i, winv_i;
+    real_t mass_i, winv_i;
 
     mass_i = catypes[atypes[i].code - 1].m;
 
     winv_i = 1 / mass_i;
-    double scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent;
+    real_t scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent;
     velocities[i].x = (velocities[i].x - dvelocities[i].x * dt * winv_i) * scale;
     velocities[i].y = (velocities[i].y - dvelocities[i].y * dt * winv_i) * scale;
     velocities[i].z = (velocities[i].z - dvelocities[i].z * dt * winv_i) * scale;
@@ -50,7 +50,7 @@ __global__ void update_velocities_from_positions_kernel(
     const coord_t* coords,
     const coord_t* xcoords,
     int n_atoms,
-    double dt) {
+    real_t dt) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
 
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index 78c4bc91..f925fe6f 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -9,8 +9,8 @@ bool is_initialized = false;
 constexpr int kNonbonded14ModeCount = 3;
 
 int* d_atom_to_qi = nullptr;
-double* d_evdw_totals = nullptr;
-double* d_ecoul_totals = nullptr;
+real_t* d_evdw_totals = nullptr;
+real_t* d_ecoul_totals = nullptr;
 
 __device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) {
 #ifdef QDYN_SPFP
@@ -96,13 +96,13 @@ __global__ void calc_nonbonded_14_force_kernel(
     const catype_t* unified_catypes,
     const coord_t* d_coords,
     dvel_t* d_dvelocities,
-    double* evdw_totals,
-    double* ecoul_totals,
+    real_t* evdw_totals,
+    real_t* ecoul_totals,
     bool include_pp,
     int state,
     int n_atoms,
     int n_qatoms,
-    double lambda) {
+    real_t lambda) {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_pairs) return;
 
@@ -166,14 +166,14 @@ __global__ void calc_nonbonded_14_force_kernel(
 
 namespace {
 struct Nonbonded14EnergyBuckets {
-    double evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
-    double ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
+    real_t evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {};   // host copy of d_evdw_totals; filled by a raw cudaMemcpy, so the element type must match the device buffer
+    real_t ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {};  // host copy of d_ecoul_totals; same layout requirement as evdw
 };
 }
 
 static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
     int state,
-    double lambda,
+    real_t lambda,
     bool include_pp) {
     using namespace CudaNonbonded14Force;
 
@@ -182,8 +182,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
     Nonbonded14EnergyBuckets energies = {};
     if (n_ngbrs_14 == 0) return energies;
 
-    cudaMemset(d_ecoul_totals, 0, sizeof(double) * kNonbonded14ModeCount);
-    cudaMemset(d_evdw_totals, 0, sizeof(double) * kNonbonded14ModeCount);
+    cudaMemset(d_ecoul_totals, 0, sizeof(real_t) * kNonbonded14ModeCount);
+    cudaMemset(d_evdw_totals, 0, sizeof(real_t) * kNonbonded14ModeCount);
 
     const int block_size = 256;
     const int num_blocks = (n_ngbrs_14 + block_size - 1) / block_size;
@@ -208,8 +208,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
 
     cudaDeviceSynchronize();
 
-    cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
-    cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
+    cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
+    cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
 
     return energies;
 }
@@ -221,7 +221,7 @@ void calc_nonbonded_14_forces_host() {
     if (host.n_ngbrs14 == 0) return;
 
     for (int state = 0; state < host.n_lambdas; state++) {
-        const double lambda = lambdas[state];
+        const real_t lambda = lambdas[state];
         const bool include_pp = (state == 0);
         Nonbonded14EnergyBuckets energies = calc_nonbonded_14_force_state_host(state, lambda, include_pp);
 
@@ -248,8 +248,8 @@ void init_nonbonded_14_force_kernel_data() {
     check_cudaMalloc((void**)&d_atom_to_qi, sizeof(int) * host.atom_to_qi.size());
     check_cuda(cudaMemcpy(d_atom_to_qi, host.atom_to_qi.data(), sizeof(int) * host.atom_to_qi.size(), cudaMemcpyHostToDevice));
 
-    check_cudaMalloc((void**)&d_evdw_totals, sizeof(double) * kNonbonded14ModeCount);
-    check_cudaMalloc((void**)&d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount);
+    check_cudaMalloc((void**)&d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount);
+    check_cudaMalloc((void**)&d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount);
 
     is_initialized = true;
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index 32b4077a..d7f0719c 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -7,7 +7,7 @@
 
 namespace CudaNonbondedForce {
 bool is_initialized = false;
-double *d_evdw_total, *d_ecoul_total;
+real_t *d_evdw_total, *d_ecoul_total;
 
 template <typename WorkT>
 struct nonbond_vec_t {
@@ -20,9 +20,11 @@ __device__ __forceinline__ float nonbond_rsqrt(float value) {
     return rsqrtf(value);
 }
 
+#ifndef QDYN_SPFP  // the double overload is compiled only when QDYN_SPFP is not defined
 __device__ __forceinline__ double nonbond_rsqrt(double value) {
     return rsqrt(value);
 }
+#endif  // !QDYN_SPFP - NOTE(review): with QDYN_SPFP defined, a double argument would convert to the float overload; confirm no double WorkT is instantiated in that build
 
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
@@ -39,6 +41,7 @@ __device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffff
     return __shfl_sync(mask, v, srcLane);
 }
 
+#ifndef QDYN_SPFP
 template <>
 __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) {
     int2 a = *reinterpret_cast<int2*>(&v);
@@ -46,6 +49,7 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas
     a.y = __shfl_sync(mask, a.y, srcLane);
     return *reinterpret_cast<double*>(&a);
 }
+#endif
 
 __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
     v.x = shfl_value(v.x, srcLane, mask);
@@ -76,8 +80,8 @@ __device__ void calculate_unforce_bound(
     const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
     const WorkT r2 = r * r;
     const WorkT r6 = r2 * r2 * r2;
-    // double v_a = r6 * r6;
-    // double v_b = r6;
+    // real_t v_a = r6 * r6;
+    // real_t v_b = r6;
     // ecoul = r;
     // evdw = v_a - v_b;
     // dv = r2 * (-ecoul - v_a + v_b);
@@ -116,8 +120,8 @@ __global__ void calc_nonbonded_force_kernel(
 
     dvel_t* d_dvelocities,
 
-    double* evdw_tot,
-    double* ecoul_tot,
+    real_t* evdw_tot,
+    real_t* ecoul_tot,
 
     bool symmetric,
 
@@ -130,7 +134,7 @@ __global__ void calc_nonbonded_force_kernel(
     const int n_catype_types,
     const int zero_catype_type,
     const int n_qelscales,
-    const double lambda,
+    const real_t lambda,
     const q_elscale_t* d_qelscales  // todo: Now doesn't use it. Should optimize it later
 
 ) {
@@ -180,8 +184,8 @@ __global__ void calc_nonbonded_force_kernel(
     nonbond_vec_t<WorkT> x_force = {0.0, 0.0, 0.0};
     nonbond_vec_t<WorkT> y_force = {0.0, 0.0, 0.0};
 
-    double evdw_sum = 0.0;
-    double ecoul_sum = 0.0;
+    real_t evdw_sum = 0.0;
+    real_t ecoul_sum = 0.0;
 
     const unsigned mask = 0xffffffffu;
 
@@ -307,7 +311,7 @@ __global__ void calc_nonbonded_force_kernel(
 
 }  // namespace CudaNonbondedForce
 
-std::pair<double, double> calc_nonbonded_force_host(
+std::pair<real_t, real_t> calc_nonbonded_force_host(
     int nx,
     int ny,
     int* x_idx_list,
@@ -318,7 +322,7 @@ std::pair<double, double> calc_nonbonded_force_host(
     const int* y_charges_types,
     const int* x_atypes_types,
     const int* y_atypes_types,
-    const bool disable_water_h_lj, const double lambda) {
+    const bool disable_water_h_lj, const real_t lambda) {
     using namespace CudaNonbondedForce;
     Context& host = Context::instance();
     const int thread_num = 256;
@@ -334,8 +338,8 @@ std::pair<double, double> calc_nonbonded_force_host(
 
     dim3 grid = dim3(grid_sz);
 
-    cudaMemset(d_ecoul_total, 0, sizeof(double));
-    cudaMemset(d_evdw_total, 0, sizeof(double));
+    cudaMemset(d_ecoul_total, 0, sizeof(real_t));
+    cudaMemset(d_evdw_total, 0, sizeof(real_t));
 
     auto launch_kernel = [&](auto work_tag) {
         using WorkT = decltype(work_tag);
@@ -373,9 +377,9 @@ std::pair<double, double> calc_nonbonded_force_host(
 
     cudaDeviceSynchronize();
 
-    double evdw_tot = 0, ecoul_tot = 0;
-    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t evdw_tot = 0, ecoul_tot = 0;
+    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(real_t), cudaMemcpyDeviceToHost);
 
     return {evdw_tot, ecoul_tot};
 }
@@ -383,8 +387,8 @@ std::pair<double, double> calc_nonbonded_force_host(
 void init_nonbonded_force_kernel_data() {
     using namespace CudaNonbondedForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_evdw_total, sizeof(double));
-        check_cudaMalloc((void**)&d_ecoul_total, sizeof(double));
+        check_cudaMalloc((void**)&d_evdw_total, sizeof(real_t));   // single real_t; read back as evdw_tot by calc_nonbonded_force_host()
+        check_cudaMalloc((void**)&d_ecoul_total, sizeof(real_t));  // single real_t; read back as ecoul_tot by calc_nonbonded_force_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index 7be0656f..bdb35608 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -14,11 +14,11 @@ int* water_shell = nullptr;
 int* water_rank = nullptr;
 int* polx_list_sh = nullptr;  // use 1d array to simulate 2d array
 
-double* d_energy;
+real_t* d_energy;
 int* d_list_sh = nullptr;
-double* d_theta = nullptr;
-double* d_theta0 = nullptr;
-double* d_tdum = nullptr;
+real_t* d_theta = nullptr;
+real_t* d_theta0 = nullptr;
+real_t* d_tdum = nullptr;
 int* d_water_shell = nullptr;
 int* d_water_rank = nullptr;
 
@@ -27,15 +27,15 @@ int* d_water_rank = nullptr;
 __global__ void calc_polx_theta_and_shells(
     int n_waters, int n_shells, int n_atoms_solute,
     coord_t* coords, topo_t topo, shell_t* wshells, int* list_sh,
-    double* theta, double* theta0, double* tdum) {
+    real_t* theta, real_t* theta0, real_t* tdum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_waters) return;
     int i = idx;
 
     int wi, iis;
     coord_t rmu, rcu;
-    double rm, rc;
-    double cos_th;
+    real_t rm, rc;
+    real_t cos_th;
 
     theta[i] = 0;
     theta0[i] = 0;
@@ -81,7 +81,7 @@ __global__ void calc_polx_theta_and_shells(
 __global__ void calc_polx_water_forces_kernel(
     int n_waters, int n_atoms_solute, shell_t* wshells,
     coord_t* coords, dvel_t* dvelocities, topo_t topo,
-    double* theta, md_t md, double* energy,
+    real_t* theta, md_t md, real_t* energy,
     int* water_rank, int* water_shell) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_waters) return;
@@ -92,21 +92,21 @@ __global__ void calc_polx_water_forces_kernel(
 
     int wi, ii;
     coord_t rmu, rcu, f1O, f1H1, f1H2, f2;
-    double rm, rc;
-    double cos_th;
-    double avtdum, arg, f0, dv;
-    double ener;
+    real_t rm, rc;
+    real_t cos_th;
+    real_t avtdum, arg, f0, dv;
+    real_t ener;
 
     avtdum = 0;
     ii = idx;
-    arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell);
-    double theta_val = acos(arg);
+    arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell);
+    real_t theta_val = acos(arg);
     theta_val = theta_val - 3 * sin(theta_val) * wshells[is].cstb / 2;
     if (theta_val < 0) theta_val = 0;
     if (theta_val > M_PI) theta_val = M_PI;
 
     avtdum += theta[ii];
-    const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
+    const real_t dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
     ener = .5 * md.polarisation_force * dtheta * dtheta;
     // E_restraint.Upolx += ener;
     atomicAdd(energy, ener);
@@ -164,7 +164,7 @@ __global__ void calc_polx_water_forces_kernel(
     atomicAdd(&dvelocities[wi + 2].y, f0 * (f1H2.y));
     atomicAdd(&dvelocities[wi + 2].z, f0 * (f1H2.z));
 
-    atomicAdd(&wshells[is].avtheta, avtdum / (double)wshells[is].n_inshell);
+    atomicAdd(&wshells[is].avtheta, avtdum / (real_t)wshells[is].n_inshell);
     atomicAdd(&wshells[is].avn_inshell, wshells[is].n_inshell);
 }
 
@@ -174,7 +174,7 @@ void sort_waters() {
     auto *wshells = ctx.wshells->cpu_data_p;
 
     int imin, jmin, jw;
-    double tmin;
+    real_t tmin;
     // Sort the waters according to theta
     for (int is = 0; is < ctx.n_shells; is++) {
         imin = 0;
@@ -224,7 +224,7 @@ void calc_polx_water_forces_host(int iteration) {
 
     // todo: sort in cpu now..
     ctx.wshells->download();
-    cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(real_t), cudaMemcpyDeviceToHost);
     cudaMemcpy(polx_list_sh, d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int), cudaMemcpyDeviceToHost);
 
     // Reset per-water metadata; only waters placed in shells will be overwritten in sort_waters().
@@ -244,8 +244,8 @@ void calc_polx_water_forces_host(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            wshells[is].avtheta /= (double)itdis_update;
-            wshells[is].avn_inshell /= (double)itdis_update;
+            wshells[is].avtheta /= (real_t)itdis_update;
+            wshells[is].avn_inshell /= (real_t)itdis_update;
             wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
                    wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI);
@@ -256,12 +256,12 @@ void calc_polx_water_forces_host(int iteration) {
     }
 
     // Calculate energy and force
-    cudaMemset(d_energy, 0, sizeof(double));
+    cudaMemset(d_energy, 0, sizeof(real_t));
     calc_polx_water_forces_kernel<<<numBlocks, blockSize>>>(
         ctx.n_waters, ctx.n_atoms_solute, d_wshells, d_coords, d_dvelocities, ctx.topo,
         d_theta, ctx.md, d_energy, d_water_rank, d_water_shell);
-    double energy;
-    cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t energy;
+    cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
     ctx.E_restraint.Upolx += energy;
     ctx.wshells->download();
     // Copy back forces for all atoms (solute + solvent); water forces were being dropped.
@@ -275,11 +275,11 @@ void init_polx_water_force_kernel_data() {
         water_shell = new int[ctx.n_waters];
         polx_list_sh = new int[ctx.n_max_inshell * ctx.n_shells];
 
-        check_cudaMalloc((void**)&d_energy, sizeof(double));
+        check_cudaMalloc((void**)&d_energy, sizeof(real_t));
         check_cudaMalloc((void**)&d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int));
-        check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(double));
-        check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(double));
-        check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(double));
+        check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(real_t));
+        check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(real_t));
+        check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(real_t));
         check_cudaMalloc((void**)&d_water_rank, ctx.n_waters * sizeof(int));
         check_cudaMalloc((void**)&d_water_shell, ctx.n_waters * sizeof(int));
 
diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu
index 5221cb9e..b6ef257e 100644
--- a/src/core/cuda/src/cuda_pshell_force.cu
+++ b/src/core/cuda/src/cuda_pshell_force.cu
@@ -5,8 +5,8 @@
 #include <iostream>
 namespace CudaPshellForce {
 bool is_initialized = false;
-double* d_ufix_energy;
-double* d_ushell_energy;
+real_t* d_ufix_energy;    // device scalar; calc_pshell_forces_host() zeroes it per call and adds the result to E_restraint.Ufix
+real_t* d_ushell_energy;  // device scalar; calc_pshell_forces_host() zeroes it per call and adds the result to E_restraint.Ushell
 
 }  // namespace CudaPshellForce
 __global__ void calc_pshell_force_kernel(
@@ -15,14 +15,14 @@ __global__ void calc_pshell_force_kernel(
     bool* excluded,
     coord_t* coords,
     coord_t* coords_init,
-    double* ufix_energy,
-    double* ushell_energy,
+    real_t* ufix_energy,
+    real_t* ushell_energy,
     dvel_t* dvelocities) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= n_atoms_solute) return;
 
     coord_t dr;
-    double k, r2, ener;
+    real_t k, r2, ener;
 
     if (shell[i] || excluded[i]) {
         // printf("i = %d excluded = %s shell = %s\n", i, excluded[i] ? "True" : "False", shell[i] ? "True" : "False");
@@ -57,8 +57,8 @@ void calc_pshell_forces_host() {
     auto d_coords_init = host.coords_init->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
 
-    cudaMemset(d_ufix_energy, 0, sizeof(double));
-    cudaMemset(d_ushell_energy, 0, sizeof(double));
+    cudaMemset(d_ufix_energy, 0, sizeof(real_t));
+    cudaMemset(d_ushell_energy, 0, sizeof(real_t));
 
     int blockSize = 256;
     int numBlocks = (host.n_atoms_solute + blockSize - 1) / blockSize;
@@ -72,10 +72,10 @@ void calc_pshell_forces_host() {
         d_ushell_energy,
         d_dvelocities);
     cudaDeviceSynchronize();
-    double ufix_energy;
-    double ushell_energy;
-    cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t ufix_energy;
+    real_t ushell_energy;
+    cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
 
     host.E_restraint.Ufix += ufix_energy;
     host.E_restraint.Ushell += ushell_energy;
@@ -85,8 +85,8 @@ void calc_pshell_forces_host() {
 void init_pshell_force_kernel_data() {
     using namespace CudaPshellForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_ufix_energy, sizeof(double));
-        check_cudaMalloc((void**)&d_ushell_energy, sizeof(double));
+        check_cudaMalloc((void**)&d_ufix_energy, sizeof(real_t));    // one real_t; size matches the memset/memcpy in calc_pshell_forces_host()
+        check_cudaMalloc((void**)&d_ushell_energy, sizeof(real_t));  // one real_t; size matches the memset/memcpy in calc_pshell_forces_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu
index f037e9db..26c8c94f 100644
--- a/src/core/cuda/src/cuda_radix_water_force.cu
+++ b/src/core/cuda/src/cuda_radix_water_force.cu
@@ -6,20 +6,20 @@
 #include "cuda/include/cuda_utility.cuh"
 namespace CudaRadixWaterForce {
 bool is_initialized = false;
-double* d_energy;
+real_t* d_energy;  // device scalar; calc_radix_water_forces_host() zeroes it per call and adds the result to E_restraint.Uradx
 }  // namespace CudaRadixWaterForce
 
 __global__ void calc_radix_water_forces_kernel(
     coord_t* coords,
-    double shift,
+    real_t shift,
     int n_atoms_solute,
     int n_atoms,
     topo_t topo,
     md_t md,
-    double Dwmz,
-    double awmz,
+    real_t Dwmz,
+    real_t awmz,
     dvel_t* dvelocities,
-    double* energy) {
+    real_t* energy) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     i = n_atoms_solute + i * 3;  // Process only oxygen atoms of water molecules
     if (i >= n_atoms) return;
@@ -29,16 +29,16 @@ __global__ void calc_radix_water_forces_kernel(
     dr.x = coords[i].x - topo.solvent_center.x;
     dr.y = coords[i].y - topo.solvent_center.y;
     dr.z = coords[i].z - topo.solvent_center.z;
-    double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
-    double db = b - (topo.solvent_radius - shift);
+    real_t b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
+    real_t db = b - (topo.solvent_radius - shift);
 
-    double ener, dv;
+    real_t ener, dv;
     if (db > 0) {
         ener = 0.5 * md.radial_force * db * db - Dwmz;
         dv = md.radial_force * db / b;
     } else {
         if (b > 0.0) {
-            double fexp = exp(awmz * db);
+            real_t fexp = exp(awmz * db);
             ener = Dwmz * (fexp * fexp - 2 * fexp);
             dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b;
         } else {
@@ -70,16 +70,16 @@ void calc_radix_water_forces_host() {
 
     auto d_coords = host.coords->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
-    check_cuda(cudaMemset(d_energy, 0, sizeof(double)));
+    check_cuda(cudaMemset(d_energy, 0, sizeof(real_t)));
 
-    double shift;
+    real_t shift;
     if (host.md.radial_force != 0) {
         shift = sqrt(Boltz * host.Tfree / host.md.radial_force);
     } else {
         shift = 0;
     }
 
-    double energy = 0.0;
+    real_t energy = 0.0;
     calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords,
                                                              shift,
                                                              host.n_atoms_solute,
@@ -91,14 +91,14 @@ void calc_radix_water_forces_host() {
                                                              d_dvelocities,
                                                              d_energy);
     check_cuda(cudaDeviceSynchronize());
-    check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost));
+    check_cuda(cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost));
     host.E_restraint.Uradx += energy;
 }
 
 void init_radix_water_force_kernel_data() {
     using namespace CudaRadixWaterForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy, sizeof(double));
+        check_cudaMalloc((void**)&d_energy, sizeof(real_t));  // one real_t; size matches the memset/memcpy in calc_radix_water_forces_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index 567a78df..e32872b7 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -3,26 +3,26 @@
 #include "common/include/context.h"
 namespace CudaRestrangForce {
 bool is_initialized = false;
-double* d_E_restraint;
+real_t* d_E_restraint;  // device scalar; calc_restrang_force_host() resets it to 0 per call and adds the result to E_restraint.Upres
 }  // namespace CudaRestrangForce
 
 __global__ void calc_restrang_force_kernel(
     restrang_t* restrangs,
     int n_restrangs,
     coord_t* coords,
-    double* lambdas,
+    real_t* lambdas,
     int n_lambdas,
     dvel_t* dvelocities,
     E_restraint_t* EQ_restraint,
-    double* E_restraint) {
+    real_t* E_restraint) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrangs) return;
     int ir = idx;
 
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
-    double lambda, r2ij, r2jk, rij, rjk, cos_th, th;
-    double dth, dv, ener, f1;
+    real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th;
+    real_t dth, dv, ener, f1;
 
     state = restrangs[ir].ipsi - 1;
     i = restrangs[ir].ai - 1;
@@ -110,8 +110,8 @@ void calc_restrang_force_host() {
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
-    double val = 0;
-    cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
+    real_t val = 0;
+    cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice);
 
     int blockSize = 256;
     int numBlocks = (host.n_restrangs + blockSize - 1) / blockSize;
@@ -126,14 +126,14 @@ void calc_restrang_force_host() {
         d_E_restraint);
     cudaDeviceSynchronize();
     host.EQ_restraint->download();
-    cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
 }
 
 void init_restrang_force_kernel_data() {
     using namespace CudaRestrangForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));  // one real_t; size matches the cudaMemcpy calls in calc_restrang_force_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu
index 14f9b466..cdb035b4 100644
--- a/src/core/cuda/src/cuda_restrdis_force.cu
+++ b/src/core/cuda/src/cuda_restrdis_force.cu
@@ -5,24 +5,24 @@
 #include "common/include/context.h"
 namespace CudaRestrdisForce {
 bool is_initialized = false;
-double* d_E_restraint;
+real_t* d_E_restraint;  // device scalar; calc_restrdis_forces_host() zeroes it per call and adds the result to E_restraint.Upres
 }  // namespace CudaRestrdisForce
 
 __global__ void calc_restrdis_forces_kernel(
     restrdis_t* restrdists,
     int n_restrdists,
     coord_t* coords,
-    double* lambdas,
+    real_t* lambdas,
     int n_lambdas,
     dvel_t* dvelocities,
     E_restraint_t* EQ_restraint,
-    double* E_restraint) {
+    real_t* E_restraint) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrdists) return;
 
     int state, i, j;
     coord_t dr;
-    double lambda, b, db, dv, ener;
+    real_t lambda, b, db, dv, ener;
 
     int ir = idx;
 
@@ -82,7 +82,7 @@ void calc_restrdis_forces_host() {
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
-    cudaMemset(d_E_restraint, 0, sizeof(double));
+    cudaMemset(d_E_restraint, 0, sizeof(real_t));
 
     int blockSize = 256;
     int numBlocks = (host.n_restrdists + blockSize - 1) / blockSize;
@@ -97,8 +97,8 @@ void calc_restrdis_forces_host() {
         d_E_restraint);
     cudaDeviceSynchronize();
     host.EQ_restraint->download();
-    double ener;
-    cudaMemcpy(&ener, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t ener;
+    cudaMemcpy(&ener, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
     printf("Energy restraint: %f\n", ener);
     host.E_restraint.Upres += ener;
 }
@@ -106,7 +106,7 @@ void calc_restrdis_forces_host() {
 void init_restrdis_force_kernel_data() {
     using namespace CudaRestrdisForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));  // one real_t; size matches the memset/memcpy in calc_restrdis_forces_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu
index 695e2b33..5307bff5 100644
--- a/src/core/cuda/src/cuda_restrpos_force.cu
+++ b/src/core/cuda/src/cuda_restrpos_force.cu
@@ -6,17 +6,17 @@
 
 namespace CudaRestrposForce {
 bool is_initialized = false;
-double* d_E_restraint;
+real_t* d_E_restraint;  // device scalar; calc_restrpos_forces_host() resets it to 0 per call and adds the result to E_restraint.Upres
 }  // namespace CudaRestrposForce
 
 __global__ void calc_restrpos_forces_kernel(
     restrpos_t* restrspos,
     int n_restrspos,
     coord_t* coords,
-    double* lambdas,
+    real_t* lambdas,
     int n_lambdas,
     E_restraint_t* EQ_restraint,
-    double* E_restraint,
+    real_t* E_restraint,
     dvel_t* dvelocities) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrspos) return;
@@ -24,7 +24,7 @@ __global__ void calc_restrpos_forces_kernel(
 
     int state, i;
     coord_t dr;
-    double lambda, ener, x2, y2, z2;
+    real_t lambda, ener, x2, y2, z2;
 
     state = restrspos[ir].ipsi - 1;
     i = restrspos[ir].a - 1;
@@ -64,8 +64,8 @@ void calc_restrpos_forces_host() {
     auto& host = Context::instance();
     if (host.n_restrspos == 0) return;
     using namespace CudaRestrposForce;
-    double val = 0.0;
-    cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
+    real_t val = 0.0;
+    cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice);
 
     auto d_restrspos = host.restrspos->gpu_data_p;
     auto d_coords = host.coords->gpu_data_p;
@@ -85,7 +85,7 @@ void calc_restrpos_forces_host() {
         d_E_restraint,
         d_dvelocities);
     cudaDeviceSynchronize();
-    cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
     host.EQ_restraint->download();
 }
@@ -93,7 +93,7 @@ void calc_restrpos_forces_host() {
 void init_restrpos_force_kernel_data() {
     using namespace CudaRestrposForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));  // one real_t; size matches the cudaMemcpy calls in calc_restrpos_forces_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu
index 71835e4e..e5951303 100644
--- a/src/core/cuda/src/cuda_restrseq_force.cu
+++ b/src/core/cuda/src/cuda_restrseq_force.cu
@@ -4,7 +4,7 @@
 
 namespace CudaRestrseqForce {
 bool is_initialized = false;
-double* d_upres_energy;
+real_t* d_upres_energy;  // device scalar; calc_restrseq_forces_host() zeroes it per call and assigns (not adds) the result to E_restraint.Upres - NOTE(review): the other Upres writers in this patch use +=, confirm the assignment is intended
 }  // namespace CudaRestrseqForce
 __global__ void calc_restrseq_forces_kernel(
     int n_restrseqs,
@@ -15,13 +15,13 @@ __global__ void calc_restrseq_forces_kernel(
     catype_t* catypes,
     bool* heavy,
     dvel_t* dvelocities,
-    double* upres_energy) {
+    real_t* upres_energy) {
     int s = blockIdx.x * blockDim.x + threadIdx.x;
     if (s >= n_restrseqs) return;
 
-    double k, mass, totmass;
+    real_t k, mass, totmass;
     coord_t dr;
-    double r2, ener;
+    real_t r2, ener;
 
     k = restrseqs[s].k;
 
@@ -123,7 +123,7 @@ void calc_restrseq_forces_host() {
     auto d_catypes = host.catypes->gpu_data_p;
     auto d_heavy = host.heavy->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
-    cudaMemset(d_upres_energy, 0, sizeof(double));
+    cudaMemset(d_upres_energy, 0, sizeof(real_t));
     // ctx.sync_all_to_device();
 
     int blockSize = 256;
@@ -139,8 +139,8 @@ void calc_restrseq_forces_host() {
         d_dvelocities,
         d_upres_energy);
     cudaDeviceSynchronize();
-    double upres_energy;
-    cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t upres_energy;
+    cudaMemcpy(&upres_energy, d_upres_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres = upres_energy;
     printf("Restrseq U_upres: %f\n", upres_energy);
 }
@@ -148,7 +148,7 @@ void calc_restrseq_forces_host() {
 void init_restrseq_force_kernel_data() {
     using namespace CudaRestrseqForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
+        check_cudaMalloc((void**)&d_upres_energy, sizeof(real_t));  // one real_t; size matches the memset/memcpy in calc_restrseq_forces_host()
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu
index c928bb71..2ca01839 100644
--- a/src/core/cuda/src/cuda_restrwall_force.cu
+++ b/src/core/cuda/src/cuda_restrwall_force.cu
@@ -5,20 +5,20 @@
 
 namespace CudaRestrwallForce {
 bool is_initialized = false;
-double* d_energies;
+real_t* d_energies;  // device buffer of one real_t (despite the plural name); calc_restrwall_forces_host() zeroes it per call and adds the result to E_restraint.Upres
 }  // namespace CudaRestrwallForce
 
 __global__ void calc_restrwall_forces_kernel(
     restrwall_t* restrwalls,
     int n_restrwalls,
     coord_t* coords,
-    double* energies,
+    real_t* energies,
     dvel_t* dvelocities,
     bool* heavy, topo_t topo) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrwalls) return;
 
-    double k, b, db, ener, dv, fexp;
+    real_t k, b, db, ener, dv, fexp;
     coord_t dr;
 
     int ir = idx;
@@ -58,7 +58,7 @@ void calc_restrwall_forces_host() {
     auto d_coords = host.coords->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_heavy = host.heavy->gpu_data_p;
-    cudaMemset(d_energies, 0, sizeof(double));
+    cudaMemset(d_energies, 0, sizeof(real_t));
 
     int blockSize = 256;
     int numBlocks = (host.n_restrwalls + blockSize - 1) / blockSize;
@@ -69,8 +69,8 @@ void calc_restrwall_forces_host() {
         d_energies,
         d_dvelocities, d_heavy, host.topo);
     cudaDeviceSynchronize();
-    double h_energy;
-    cudaMemcpy(&h_energy, d_energies, sizeof(double), cudaMemcpyDeviceToHost);
+    real_t h_energy;
+    cudaMemcpy(&h_energy, d_energies, sizeof(real_t), cudaMemcpyDeviceToHost);
     printf("Restrwall energy: %f\n", h_energy);
     host.E_restraint.Upres += h_energy;
 }
@@ -78,7 +78,7 @@ void calc_restrwall_forces_host() {
 void init_restrwall_force_kernel_data() {
     using namespace CudaRestrwallForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energies, sizeof(double));
+        check_cudaMalloc((void**)&d_energies, sizeof(real_t));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu
index bda47e50..03fa76cd 100644
--- a/src/core/cuda/src/cuda_shake_constraints.cu
+++ b/src/core/cuda/src/cuda_shake_constraints.cu
@@ -17,7 +17,7 @@ __global__ void calc_shake_constraints_kernel(
     shake_bond_t* shake_bonds,
     coord_t* coords,
     coord_t* xcoords,
-    double* winv,
+    real_t* winv,
     int* total_iterations,
     int* mol_shake_offset) {
     int idx = blockIdx.x;
@@ -26,7 +26,7 @@ __global__ void calc_shake_constraints_kernel(
     int mol = idx;
 
     int ai, aj, n_iterations, shake;
-    double xij2, diff, corr, scp, xxij2;
+    real_t xij2, diff, corr, scp, xxij2;
     coord_t xij, xxij;
 
     if (mol_n_shakes[mol] == 0) return;
diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu
index baba687e..46c4c373 100644
--- a/src/core/cuda/src/cuda_temperature.cu
+++ b/src/core/cuda/src/cuda_temperature.cu
@@ -6,23 +6,23 @@
 
 namespace CudaTemperature {
 bool is_initialized = false;
-double* d_Temp_solute;
-double* d_Tfree_solute;
-double* d_Texcl_solute;
-double* d_Temp_solvent;
-double* d_Tfree_solvent;
-double* d_Texcl_solvent;
+real_t* d_Temp_solute;
+real_t* d_Tfree_solute;
+real_t* d_Texcl_solute;
+real_t* d_Temp_solvent;
+real_t* d_Tfree_solvent;
+real_t* d_Texcl_solvent;
 }  // namespace CudaTemperature
 
-__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, double boltz, double ekinmax,
-                                        double* Temp_solute, double* Tfree_solute, double* Texcl_solute, double* Temp_solvent, double* Tfree_solvent, double* Texcl_solvent) {
+__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, real_t boltz, real_t ekinmax,
+                                        real_t* Temp_solute, real_t* Tfree_solute, real_t* Texcl_solute, real_t* Temp_solvent, real_t* Tfree_solvent, real_t* Texcl_solvent) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
-    double mass_i = catypes[atypes[idx].code - 1].m;
-    const double vx = velocities[idx].x;
-    const double vy = velocities[idx].y;
-    const double vz = velocities[idx].z;
-    double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
+    real_t mass_i = catypes[atypes[idx].code - 1].m;
+    const real_t vx = velocities[idx].x;
+    const real_t vy = velocities[idx].y;
+    const real_t vz = velocities[idx].z;
+    real_t ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
     bool is_solute = (idx < n_atoms_solute);
     bool is_excluded = excluded[idx];
 
@@ -49,14 +49,14 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t
 void calc_temperature_host() {
     auto& host = Context::instance();
     using namespace CudaTemperature;
-    double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
+    real_t h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
 
-    cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
 
     atype_t* d_atypes = host.atypes->gpu_data_p;
     catype_t* d_catypes = host.catypes->gpu_data_p;
@@ -66,17 +66,17 @@ void calc_temperature_host() {
     int blockSize = 256;
     int numBlocks = (host.n_atoms + blockSize - 1) / blockSize;
 
-    double Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms;
+    real_t Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms;
     calc_temperature_kernel<<<numBlocks, blockSize>>>(host.n_atoms, host.n_atoms_solute, d_atypes, d_catypes, d_velocities, d_excluded, Boltz, Ekinmax,
                                                       d_Temp_solute, d_Tfree_solute, d_Texcl_solute, d_Temp_solvent, d_Tfree_solvent, d_Texcl_solvent);
 
     cudaDeviceSynchronize();
-    cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(double), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
     host.Tfree = h_Tfree_solute + h_Tfree_solvent;
     host.Temp = h_Temp_solute + h_Temp_solvent;
 
@@ -98,12 +98,12 @@ void calc_temperature_host() {
 void init_temperature_kernel_data() {
     using namespace CudaTemperature;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
-        check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
-        check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
-        check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
-        check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
-        check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
+        check_cudaMalloc((void**)&d_Temp_solute, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Tfree_solute, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Texcl_solute, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Temp_solvent, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(real_t));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 5baffbde..1c0692ae 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaTorsionForce {
 bool is_initialized = false;
-double* d_energy_sum = nullptr;
+real_t* d_energy_sum = nullptr;
 }  // namespace CudaTorsionForce
 
-__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum) {
+__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) {
     int i = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (i >= end) return;
     int aii, aji, aki, ali;
@@ -16,10 +16,10 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    double bj2inv, bk2inv, bjinv, bkinv;
-    double cos_phi, phi;
-    double arg, dv, f1;
-    double ener;
+    real_t bj2inv, bk2inv, bjinv, bkinv;
+    real_t cos_phi, phi;
+    real_t arg, dv, f1;
+    real_t ener;
 
     torsion_t t;
     ctorsion_t ctors;
@@ -63,7 +63,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     bkinv = sqrt(bk2inv);
 
     cos_phi = (rnj.x * rnk.x + rnj.y * rnk.y + rnj.z * rnk.z) * (bjinv * bkinv);
-    cos_phi = fmin(fmax(cos_phi, -1.0), 1.0);
+    cos_phi = cos_phi > static_cast<real_t>(1.0) ? static_cast<real_t>(1.0) : cos_phi;
+    cos_phi = cos_phi < static_cast<real_t>(-1.0) ? static_cast<real_t>(-1.0) : cos_phi;
     phi = acos(cos_phi);
     if (rjk.x * (rnj.y * rnk.z - rnj.z * rnk.y) + rjk.y * (rnj.z * rnk.x - rnj.x * rnk.z) + rjk.z * (rnj.x * rnk.y - rnj.y * rnk.x) < 0) {
         phi = -phi;
@@ -123,15 +124,15 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     atomicAdd(&dvelocities[ali].z, dv * dpl.z);
 }
 
-double calc_torsion_forces_host(int start, int end) {
+real_t calc_torsion_forces_host(int start, int end) {
     using namespace CudaTorsionForce;
     int N = end - start;
     if (N <= 0) return 0.0;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    double zero = 0.0;
-    cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice);
+    real_t zero = 0.0;
+    cudaMemcpy(d_energy_sum, &zero, sizeof(real_t), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     coord_t* d_coords = host_ctx.coords->gpu_data_p;
@@ -141,7 +142,7 @@ double calc_torsion_forces_host(int start, int end) {
 
     calc_torsion_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&zero, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
     return zero;
 }
 
@@ -150,7 +151,7 @@ double calc_torsion_forces_host(int start, int end) {
 void init_torsion_force_kernel_data() {
     using namespace CudaTorsionForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
         is_initialized = true;
     }
 }

From 6f5940f20d53f1029326e2e1ea1de828ffa9392a Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 30 Apr 2026 10:24:37 +0200
Subject: [PATCH 17/20] fix benchmark fortran run

---
 benchmark-qgpu/benchmark_correctness.py | 47 ++++++++++++++++++++++---
 benchmark-qgpu/benchmark_nsday.py       | 42 +++++++++++++++++++++-
 benchmark-qgpu/benchmark_test.py        | 29 ++++++++++++---
 3 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py
index d30f321d..f8f4fd5c 100644
--- a/benchmark-qgpu/benchmark_correctness.py
+++ b/benchmark-qgpu/benchmark_correctness.py
@@ -221,16 +221,34 @@ def collect(args):
         fortran_dir, prep_dir, prepared_data_dir, reference_dir = copy_reference_inputs(args.reference_dir, out_dir)
         prep_fortran_bin = None
     else:
-        prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
+        default_prep_fortran_bin = (
+            ROOT / "src" / "q6" / "bin" / "q6" / "qdynp"
+            if args.prep_fortran_mpi_procs is not None
+            else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"
+        )
+        prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin or default_prep_fortran_bin)
         data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake)
 
         fortran_dir = out_dir / "fortran_reference"
         prep_dir = out_dir / "qgpu_prepare"
         fortran_dir.mkdir(parents=True, exist_ok=True)
 
-        print(f"Preparing Fortran reference for {args.test}")
+        if args.prep_fortran_mpi_procs is None:
+            print(f"Preparing Fortran reference for {args.test}")
+        else:
+            print(
+                f"Preparing Fortran reference for {args.test} "
+                f"with {args.prep_fortran_mpi_procs} MPI rank(s)"
+            )
         write_md_input(data, fortran_dir)
-        prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)
+        prepare_restart_with_qdyn_test(
+            data,
+            prep_fortran_bin,
+            fortran_dir,
+            mpi_procs=args.prep_fortran_mpi_procs,
+            mpirun_bin=args.mpirun_bin,
+            mpirun_args=args.mpirun_args,
+        )
 
         print("Preparing QGPU input")
         prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
@@ -255,6 +273,9 @@ def collect(args):
             "shake": args.shake,
             "qgpu_bin": str(qgpu_bin),
             "prep_fortran_bin": str(prep_fortran_bin) if prep_fortran_bin is not None else None,
+            "prep_fortran_mpi_procs": args.prep_fortran_mpi_procs,
+            "mpirun_bin": args.mpirun_bin,
+            "mpirun_args": args.mpirun_args,
             "reference_dir": str(reference_dir) if reference_dir is not None else None,
             "prepared_qgpu_input": str(prepared_data_dir),
             "fortran_energy": str(fortran_energy_path),
@@ -411,8 +432,24 @@ def parse_args():
     )
     collect_parser.add_argument(
         "--prep-fortran-bin",
-        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
-        help="Path to qdyn_test used to generate Fortran reference data.",
+        default=None,
+        help="Path to Fortran binary used to generate reference data. Defaults to qdynp with MPI, otherwise qdyn_test.",
+    )
+    collect_parser.add_argument(
+        "--prep-fortran-mpi-procs",
+        type=positive_int,
+        default=None,
+        help="Run the Fortran reference preparation through mpirun with this many MPI ranks.",
+    )
+    collect_parser.add_argument(
+        "--mpirun-bin",
+        default="mpirun",
+        help="MPI launcher to use with --prep-fortran-mpi-procs. Defaults to mpirun.",
+    )
+    collect_parser.add_argument(
+        "--mpirun-args",
+        default=None,
+        help='Extra MPI launcher arguments, quoted as one string, e.g. "--bind-to core".',
     )
     collect_parser.add_argument(
         "--tolerance",
diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py
index 71ddbe9b..30ffbebc 100644
--- a/benchmark-qgpu/benchmark_nsday.py
+++ b/benchmark-qgpu/benchmark_nsday.py
@@ -90,7 +90,28 @@ def resolve_collect_data_dir(args, out_dir):
     return data_dir, args.steps
 
 
-def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, steps, label, repeat):
+def cleanup_successful_task_data(processes):
+    removed = 0
+    for item in processes:
+        if item["return_code"] != 0:
+            continue
+        data_dir = item["data_dir"]
+        if data_dir.exists():
+            shutil.rmtree(data_dir)
+            removed += 1
+    return removed
+
+
+def run_concurrency_batch(
+    qgpu_bin,
+    prepared_data_dir,
+    run_dir,
+    concurrency,
+    steps,
+    label,
+    repeat,
+    cleanup_run_data=False,
+):
     if run_dir.exists():
         shutil.rmtree(run_dir)
     run_dir.mkdir(parents=True)
@@ -113,6 +134,7 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste
                 "args": args,
                 "stdout": stdout_path,
                 "stderr": stderr_path,
+                "data_dir": data_dir,
                 "command": command_text(args),
             }
         )
@@ -133,6 +155,7 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste
                 "stderr_file": stderr_f,
                 "stdout": spec["stdout"],
                 "stderr": spec["stderr"],
+                "data_dir": spec["data_dir"],
                 "start": proc_start,
                 "command": spec["command"],
             }
@@ -172,6 +195,12 @@ def run_concurrency_batch(qgpu_bin, prepared_data_dir, run_dir, concurrency, ste
 
     batch_wall_seconds = time.perf_counter() - batch_start
     failed = sum(1 for row in process_rows if row["return_code"] != 0)
+
+    if cleanup_run_data:
+        removed_task_data = cleanup_successful_task_data(processes)
+        if removed_task_data:
+            print(f"Removed copied QGPU data for {removed_task_data} successful task(s) under {run_dir}")
+
     total_ns_per_day = concurrency * steps * TIME_STEP_NS * 86400 / batch_wall_seconds
     mean_process_ns_per_day = (
         sum(float(row["process_ns_per_day"]) for row in process_rows if row["process_ns_per_day"] != "")
@@ -255,6 +284,7 @@ def collect(args):
                 steps=steps,
                 label=label,
                 repeat=repeat,
+                cleanup_run_data=not args.keep_run_data,
             )
             batch_rows.append(batch_row)
             process_rows.extend(rows)
@@ -269,6 +299,7 @@ def collect(args):
                         "qgpu_bin": str(qgpu_bin),
                         "prepared_data_dir": str(prepared_data_dir),
                         "steps": steps,
+                        "keep_run_data": args.keep_run_data,
                     },
                 )
                 raise RuntimeError(
@@ -291,6 +322,7 @@ def collect(args):
             "steps": steps,
             "concurrency": args.concurrency,
             "repeat": args.repeat,
+            "keep_run_data": args.keep_run_data,
         },
     )
     print(f"Summary CSV: {summary_csv}")
@@ -457,6 +489,14 @@ def parse_args():
         help="Path to qdyn_test used only when preparing from --test.",
     )
     collect_parser.add_argument("--pause-seconds", type=float, default=0.0, help="Pause between batches.")
+    collect_parser.add_argument(
+        "--keep-run-data",
+        action="store_true",
+        help=(
+            "Keep per-task copied QGPU input/output directories. By default successful task data is "
+            "deleted after logs and timing are recorded."
+        ),
+    )
 
     plot_parser = subparsers.add_parser("plot", help="Plot ns/day vs concurrency from one or more CSV files.")
     plot_parser.add_argument("csv", nargs="+", help="One or more nsday_summary.csv files from collect.")
diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py
index afefcfbc..fcad0788 100644
--- a/benchmark-qgpu/benchmark_test.py
+++ b/benchmark-qgpu/benchmark_test.py
@@ -210,7 +210,15 @@ def run_fortran_repeats(
     return records, saw_success
 
 
-def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_steps=None):
+def prepare_restart_with_qdyn_test(
+    data,
+    prep_fortran_bin,
+    fortran_dir,
+    prep_steps=None,
+    mpi_procs=None,
+    mpirun_bin="mpirun",
+    mpirun_args=None,
+):
     input_path = fortran_dir / "eq1.inp"
     original_input = input_path.read_text(encoding="utf-8")
     parse_data = data
@@ -222,7 +230,13 @@ def prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir, prep_ste
 
     stdout_path = fortran_dir / "restart_prep_qdyn_test.log"
     stderr_path = fortran_dir / "restart_prep_qdyn_test.err"
-    args = [str(prep_fortran_bin), "eq1.inp"]
+    args = build_fortran_command(
+        prep_fortran_bin,
+        "eq1.inp",
+        mpi_procs=mpi_procs,
+        mpirun_bin=mpirun_bin,
+        mpirun_args=mpirun_args,
+    )
     try:
         return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path)
         if return_code != 0:
@@ -529,8 +543,8 @@ def parse_args():
     )
     parser.add_argument(
         "--fortran-bin",
-        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"),
-        help="Path to production Fortran qdyn/qdynp binary used for timed Fortran runs.",
+        default=None,
+        help="Path to production Fortran binary used for timed Fortran runs. Defaults to qdynp with MPI, otherwise qdyn.",
     )
     parser.add_argument(
         "--fortran-mpi-procs",
@@ -590,7 +604,12 @@ def main():
         return 0
 
     qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
-    fortran_bin = resolve_fortran_bin(args.fortran_bin)
+    default_fortran_bin = (
+        ROOT / "src" / "q6" / "bin" / "q6" / "qdynp"
+        if args.fortran_mpi_procs is not None
+        else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"
+    )
+    fortran_bin = resolve_fortran_bin(args.fortran_bin or default_fortran_bin)
     prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
     out_dir = Path(args.out).expanduser().resolve() if args.out else default_out_dir(args.test)
     out_dir.mkdir(parents=True, exist_ok=True)

From 6759c86e0d1cc0c1dce82b5f25b53bc014b6151f Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 30 Apr 2026 16:07:51 +0200
Subject: [PATCH 18/20] Revert "Merge branch 'feature/qgpu_mixed_precision'
 into feature/qgpu_benchmark_script"

This reverts commit 97775273097769e3c687e16891784843b20795c7, reversing
changes made to e6eee26979200b22c1cbc7ee6851dd1653aebfb0.
---
 src/core/common/include/context.h             |  32 ++---
 src/core/common/include/md_types.h            | 128 +++++++++---------
 src/core/common/include/precision.h           |   5 +-
 src/core/common/src/init.cpp                  |  56 ++++----
 src/core/common/src/parse.cpp                 |   2 +-
 src/core/cpu/include/cpu_angle_force.h        |   5 +-
 src/core/cpu/include/cpu_bond_force.h         |   4 +-
 src/core/cpu/include/cpu_improper2_force.h    |   4 +-
 src/core/cpu/include/cpu_torsion_force.h      |   4 +-
 src/core/cpu/include/cpu_utils.h              |   8 +-
 src/core/cpu/src/cpu_angle_force.cpp          |  10 +-
 src/core/cpu/src/cpu_bond_force.cpp           |   6 +-
 src/core/cpu/src/cpu_improper2_force.cpp      |   8 +-
 src/core/cpu/src/cpu_leapfrog.cpp             |   4 +-
 src/core/cpu/src/cpu_nonbonded_pp_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_pw_force.cpp   |   4 +-
 src/core/cpu/src/cpu_nonbonded_qp_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_qq_force.cpp   |   6 +-
 src/core/cpu/src/cpu_nonbonded_qw_force.cpp   |   4 +-
 src/core/cpu/src/cpu_nonbonded_ww_force.cpp   |   4 +-
 src/core/cpu/src/cpu_polx_water_force.cpp     |  18 +--
 src/core/cpu/src/cpu_pshell_force.cpp         |   2 +-
 src/core/cpu/src/cpu_q_angle_force.cpp        |   4 +-
 src/core/cpu/src/cpu_q_bond_force.cpp         |   2 +-
 src/core/cpu/src/cpu_q_torsion_force.cpp      |   8 +-
 src/core/cpu/src/cpu_radix_water_force.cpp    |   4 +-
 src/core/cpu/src/cpu_restrang_force.cpp       |   4 +-
 src/core/cpu/src/cpu_restrdis_force.cpp       |   2 +-
 src/core/cpu/src/cpu_restrpos_force.cpp       |   2 +-
 src/core/cpu/src/cpu_restrseq_force.cpp       |   4 +-
 src/core/cpu/src/cpu_restrwall_force.cpp      |   2 +-
 src/core/cpu/src/cpu_shake.cpp                |   8 +-
 src/core/cpu/src/cpu_temperature.cpp          |  10 +-
 src/core/cpu/src/cpu_torsion_force.cpp        |  12 +-
 src/core/cpu/src/utils.cpp                    |  15 +-
 src/core/cuda/include/cuda_angle_force.cuh    |   4 +-
 src/core/cuda/include/cuda_bond_force.cuh     |   4 +-
 .../cuda/include/cuda_improper2_force.cuh     |   4 +-
 .../cuda/include/cuda_nonbonded_force.cuh     |   8 +-
 src/core/cuda/include/cuda_torsion_force.cuh  |   4 +-
 src/core/cuda/include/cuda_utility.cuh        |   3 +-
 src/core/cuda/src/cuda_angle_force.cu         |  33 +++--
 src/core/cuda/src/cuda_bond_force.cu          |  28 ++--
 src/core/cuda/src/cuda_improper2_force.cu     |  18 +--
 src/core/cuda/src/cuda_leapfrog.cu            |  12 +-
 src/core/cuda/src/cuda_nonbonded_14_force.cu  |  30 ++--
 src/core/cuda/src/cuda_nonbonded_force.cu     |  38 +++---
 src/core/cuda/src/cuda_polx_water_force.cu    |  54 ++++----
 src/core/cuda/src/cuda_pshell_force.cu        |  26 ++--
 src/core/cuda/src/cuda_radix_water_force.cu   |  28 ++--
 src/core/cuda/src/cuda_restrang_force.cu      |  18 +--
 src/core/cuda/src/cuda_restrdis_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrpos_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrseq_force.cu      |  16 +--
 src/core/cuda/src/cuda_restrwall_force.cu     |  14 +-
 src/core/cuda/src/cuda_shake_constraints.cu   |   4 +-
 src/core/cuda/src/cuda_temperature.cu         |  66 ++++-----
 src/core/cuda/src/cuda_torsion_force.cu       |  25 ++--
 58 files changed, 420 insertions(+), 452 deletions(-)

diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h
index ee516d7a..83817bb8 100644
--- a/src/core/common/include/context.h
+++ b/src/core/common/include/context.h
@@ -32,8 +32,8 @@ class Context {
     int n_qatoms = 0;
     int n_waters = 0;
     int n_molecules = 0;
-    real_t dt = 0.0;
-    real_t tau_T = 0.0;
+    double dt = 0.0;
+    double tau_T = 0.0;
     md_t md;
     topo_t topo;
     int n_excluded = 0;
@@ -108,7 +108,7 @@ class Context {
 
     std::unique_ptr<HostDeviceBuffer<bool>> excluded;
 
-    std::unique_ptr<HostDeviceBuffer<real_t>> winv;
+    std::unique_ptr<HostDeviceBuffer<double>> winv;
 
     std::unique_ptr<HostDeviceBuffer<bool>> shell;
 
@@ -137,12 +137,12 @@ class Context {
     Water
     */
     std::unique_ptr<HostDeviceBuffer<shell_t>> wshells;
-    real_t crgQtot = 0.0;
-    real_t Dwmz = 0.0;
-    real_t awmz = 0.0;
-    std::vector<real_t> theta;
-    std::vector<real_t> theta0;
-    std::vector<real_t> tdum;
+    double crgQtot = 0.0;
+    double Dwmz = 0.0;
+    double awmz = 0.0;
+    std::vector<double> theta;
+    std::vector<double> theta0;
+    std::vector<double> tdum;
     int n_max_inshell = 0;
     int n_shells = 0;
     std::vector<std::vector<int>> list_sh;
@@ -152,7 +152,7 @@ class Context {
     /*
     FEP
     */
-    std::unique_ptr<HostDeviceBuffer<real_t>> lambdas; // Actually length is only 2..
+    std::unique_ptr<HostDeviceBuffer<double>> lambdas; // Actually length is only 2..
 
     /*
     Energy
@@ -206,13 +206,13 @@ class Context {
     Temperature
     */
 
-    real_t Temp = 0.0;
-    real_t Tfree = 0.0;
-    real_t Ndegf = 0.0;
-    real_t Ndegfree = 0.0;
+    double Temp = 0.0;
+    double Tfree = 0.0;
+    double Ndegf = 0.0;
+    double Ndegfree = 0.0;
 
-    real_t Tscale_solute = 0.0;
-    real_t Tscale_solvent = 0.0;
+    double Tscale_solute = 0.0;
+    double Tscale_solvent = 0.0;
     /*
     Info for FEP
     */
diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index dd5ef21d..27c20cef 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -12,29 +12,29 @@
 struct md_t {
     // [MD]
     int steps;
-    real_t stepsize;
-    real_t temperature;
+    double stepsize;
+    double temperature;
     char thermostat[40];
-    real_t bath_coupling;
+    double bath_coupling;
     int random_seed;
-    real_t initial_temperature;
+    double initial_temperature;
     bool shake_solvent;
     bool shake_solute;
     bool shake_hydrogens;
     bool lrf;
     bool charge_groups;
     // [cut-offs]
-    real_t solute_solute;
-    real_t solvent_solvent;
-    real_t solute_solvent;
-    real_t q_atom;
+    double solute_solute;
+    double solvent_solvent;
+    double solute_solvent;
+    double q_atom;
     // [sphere]
-    real_t shell_radius;  // Note: this is for the pshell
-    real_t shell_force;   // Note: this is for the pshell
+    double shell_radius;  // Note: this is for the pshell
+    double shell_force;   // Note: this is for the pshell
     // [solvent]
-    real_t radial_force;
+    double radial_force;
     bool polarisation;
-    real_t polarisation_force;
+    double polarisation_force;
     // [intervals]
     int non_bond;
     int output;
@@ -62,8 +62,8 @@ struct bond_t {
 
 struct cbond_t {
     int code;
-    real_t kb;
-    real_t b0;
+    double kb;
+    double b0;
 };
 
 struct angle_t {
@@ -75,8 +75,8 @@ struct angle_t {
 
 struct cangle_t {
     int code;
-    real_t kth;
-    real_t th0;
+    double kth;
+    double th0;
 };
 
 struct torsion_t {
@@ -89,10 +89,10 @@ struct torsion_t {
 
 struct ctorsion_t {
     int code;
-    real_t k;
-    real_t n;
-    real_t d;
-    real_t paths;
+    double k;
+    double n;
+    double d;
+    double paths;
 };
 
 struct improper_t {
@@ -105,8 +105,8 @@ struct improper_t {
 
 struct cimproper_t {
     int code;
-    real_t k;
-    real_t phi0;
+    double k;
+    double phi0;
 };
 
 struct charge_t {
@@ -126,11 +126,11 @@ struct atype_t {
 
 struct catype_t {
     int code;
-    real_t m;
+    double m;
     real_t aii_normal;
     real_t bii_normal;
-    // real_t aii_polar;
-    // real_t bii_polar;
+    // double aii_polar;
+    // double bii_polar;
     real_t aii_1_4;
     real_t bii_1_4;
 };
@@ -142,12 +142,12 @@ struct vdw_pair_param_t {
 
 struct topo_t {
     int solvent_type;
-    real_t exclusion_radius;
-    real_t solvent_radius;
+    double exclusion_radius;
+    double solvent_radius;
     coord_t solute_center;
     coord_t solvent_center;
-    real_t el14_scale;
-    real_t coulomb_constant;
+    double el14_scale;
+    double coulomb_constant;
     int vdw_rule;  // 1=geometric, 2=arithmetic
 };
 
@@ -177,14 +177,14 @@ struct q_angcouple_t {
 }; // no use
 
 struct q_cimproper_t {
-    real_t k;
-    real_t phi0;
+    double k;
+    double phi0;
 }; // no use
 
 struct q_elscale_t {
     int qi;
     int qj;
-    real_t mu;
+    double mu;
 };
 
 struct q_exclpair_t {
@@ -211,18 +211,18 @@ struct q_offdiag_t {
     int j;
     int qk;
     int ql;
-    real_t Aij;
-    real_t muij;
+    double Aij;
+    double muij;
 }; // no use
 
 struct q_shake_t {
     int ai;
     int aj;
-    real_t dist;
+    double dist;
 }; // no use
 
 struct q_softcore_t {
-    real_t s;
+    double s;
 }; // no use
 
 struct q_softpair_t {
@@ -243,7 +243,7 @@ struct q_torcouple_t {
 struct restrseq_t {
     int ai;
     int aj;
-    real_t k;
+    double k;
     bool ih;
     int to_center;  // Flag for restraining to geom. or mass center
 };
@@ -258,32 +258,32 @@ struct restrpos_t {
 struct restrdis_t {
     int ai, aj;
     int ipsi;
-    real_t d1, d2;
-    real_t k;
+    double d1, d2;
+    double k;
     char itext[20], jtext[20];
 };
 
 struct restrang_t {
     int ai, aj, ak;
     int ipsi;
-    real_t ang;
-    real_t k;
+    double ang;
+    double k;
 };
 
 struct restrwall_t {
     int ai, aj;
-    real_t d, k, aMorse, dMorse;
+    double d, k, aMorse, dMorse;
     bool ih;
 };
 
 struct shell_t {
     int n_inshell;
-    real_t theta_corr;
-    real_t avtheta;
-    real_t avn_inshell;
-    real_t router;
-    real_t dr;
-    real_t cstb;
+    double theta_corr;
+    double avtheta;
+    double avn_inshell;
+    double router;
+    double dr;
+    double cstb;
 };
 
 /* =============================================
@@ -294,7 +294,7 @@ struct shell_t {
 struct shake_bond_t {
     int ai;
     int aj;
-    real_t dist2;
+    double dist2;
     bool ready;
 };
 
@@ -316,28 +316,28 @@ struct dvel_t {
 };
 
 struct E_bonded_t {
-    real_t Ubond;
-    real_t Uangle;
-    real_t Utor;
-    real_t Uimp;
+    double Ubond;
+    double Uangle;
+    double Utor;
+    double Uimp;
 };
 
 struct E_nonbonded_t {
-    real_t Ucoul;
-    real_t Uvdw;
+    double Ucoul;
+    double Uvdw;
 };
 
 struct E_restraint_t {
-    real_t Uradx;
-    real_t Upolx;
-    real_t Ufix;
-    real_t Ushell;
-    real_t Upres;
-    real_t Urestr;
+    double Uradx;
+    double Upolx;
+    double Ufix;
+    double Ushell;
+    double Upres;
+    double Urestr;
 };
 
 struct energy_t {
-    real_t Ukin;
-    real_t Upot;
-    real_t Utot;
+    double Ukin;
+    double Upot;
+    double Utot;
 };
diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
index b0978010..80b790f7 100644
--- a/src/core/common/include/precision.h
+++ b/src/core/common/include/precision.h
@@ -4,15 +4,14 @@
 using real_t = float;
 using nonbond_work_t = float;
 using force_accum_t = float;
-using energy_accum_t = float;
-using constraint_work_t = float;
 #else
 using real_t = double;
 using nonbond_work_t = double;
 using force_accum_t = double;
+#endif  // precision-dependent aliases end here; the accumulator aliases below are unconditionally double
+
 using energy_accum_t = double;
 using constraint_work_t = double;
-#endif
 
 #ifdef QDYN_SPFP
 constexpr double k_singular_sin_epsilon = 1.0e-6;
diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp
index e7c2b8c0..499c01cb 100644
--- a/src/core/common/src/init.cpp
+++ b/src/core/common/src/init.cpp
@@ -38,10 +38,10 @@ void initialize_catype_tables() {
     auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p;
 
     std::vector<catype_t> h_catype_table_all;
-    std::map<std::array<real_t, 4>, int> catype_to_type_host;
+    std::map<std::array<double, 4>, int> catype_to_type_host;
 
     auto add_catype = [&](catype_t catype) -> int {
-        const std::array<real_t, 4> key = {
+        const std::array<double, 4> key = {
             catype.aii_normal,
             catype.bii_normal,
             catype.aii_1_4,
@@ -91,7 +91,7 @@ void initialize_catype_tables() {
     for (int i = 0; i < static_cast<int>(ctx.p_atoms_list->length); i++) {
         const int id = p_atoms_cpu[i];
         const catype_t catype = catypes[atypes[id].code - 1];
-        const std::array<real_t, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
         p_catype_types_cpu[i] = catype_to_type_host[key];
     }
 
@@ -109,7 +109,7 @@ void initialize_catype_tables() {
             const int id = q_atoms_cpu[i];
             const atype_t& qat = ctx.q_atypes[q_idx[id] + ctx.n_qatoms * state];
             const catype_t& qcatype = ctx.q_catypes[qat.code - 1];
-            const std::array<real_t, 4> key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4};
+            const std::array<double, 4> key = {qcatype.aii_normal, qcatype.bii_normal, qcatype.aii_1_4, qcatype.bii_1_4};
             q_catype_types_cpu[state * ctx.q_atoms_list->length + i] = catype_to_type_host[key];
         }
     }
@@ -118,7 +118,7 @@ void initialize_catype_tables() {
     for (int i = 0; i < static_cast<int>(ctx.w_atoms_list->length); i++) {
         const int id = w_atoms_cpu[i];
         const catype_t catype = catypes[atypes[id].code - 1];
-        const std::array<real_t, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
+        const std::array<double, 4> key = {catype.aii_normal, catype.bii_normal, catype.aii_1_4, catype.bii_1_4};
         w_catype_types_cpu[i] = catype_to_type_host[key];
     }
     printf("Total water atom number: %lu, w_catype_types size: %lu\n", ctx.w_atoms_list->length, w_catype_types_cpu.size());
@@ -141,10 +141,10 @@ void initialize_charge_tables() {
     auto *w_atoms_cpu = ctx.w_atoms_list->cpu_data_p;
     auto *q_atoms_cpu = ctx.q_atoms_list->cpu_data_p;
 
-    std::map<real_t, int> charge_to_type_host;
+    std::map<double, int> charge_to_type_host;
     std::vector<ccharge_t> h_charge_table_all;
 
-    auto add_charge = [&](real_t charge) -> int {
+    auto add_charge = [&](double charge) -> int {
         if (charge_to_type_host.count(charge) == 0) {
             int sz = static_cast<int>(h_charge_table_all.size());
             ccharge_t new_ccharge = {};
@@ -161,7 +161,7 @@ void initialize_charge_tables() {
     }
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int i = 0; i < ctx.n_qatoms; i++) {
-            real_t charge = ctx.q_charges[i + ctx.n_qatoms * state].charge;
+            double charge = ctx.q_charges[i + ctx.n_qatoms * state].charge;
             add_charge(charge);
             add_charge(charge * lambda_values[state]);
         }
@@ -181,7 +181,7 @@ void initialize_charge_tables() {
     std::vector<int> p_charge_types_cpu(ctx.p_atoms_list->length);
     for (int i = 0; i < static_cast<int>(ctx.p_atoms_list->length); i++) {
         const int id = p_atoms_cpu[i];
-        const real_t charge = ccharges[charges[id].code - 1].charge;
+        const double charge = ccharges[charges[id].code - 1].charge;
         p_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
@@ -197,7 +197,7 @@ void initialize_charge_tables() {
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int i = 0; i < static_cast<int>(ctx.q_atoms_list->length); i++) {
             const int id = q_atoms_cpu[i];
-            const real_t charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge;
+            const double charge = ctx.q_charges[q_idx[id] + ctx.n_qatoms * state].charge;
             q_charge_types_cpu[state * ctx.q_atoms_list->length + i] = charge_to_type_host[charge];
         }
     }
@@ -205,7 +205,7 @@ void initialize_charge_tables() {
     std::vector<int> w_charge_types_cpu(ctx.w_atoms_list->length);
     for (int i = 0; i < static_cast<int>(ctx.w_atoms_list->length); i++) {
         const int id = w_atoms_cpu[i];
-        const real_t charge = ccharges[charges[id].code - 1].charge;
+        const double charge = ccharges[charges[id].code - 1].charge;
         w_charge_types_cpu[i] = charge_to_type_host[charge];
     }
 
@@ -493,8 +493,8 @@ void init_velocities() {
     auto& velocities = ctx.velocities->cpu_data_p;
 
     // If not previous value set, use a Maxwell distribution to fill velocities
-    real_t kT = Boltz * ctx.md.initial_temperature;
-    real_t sd, mass;
+    double kT = Boltz * ctx.md.initial_temperature;
+    double sd, mass;
     for (int i = 0; i < ctx.n_atoms; i++) {
         mass = catypes[atypes[i].code - 1].m;
         sd = sqrt(kT / mass);
@@ -514,7 +514,7 @@ void init_inv_mass() {
     auto& ctx = Context::instance();
     auto& atypes = ctx.atypes->cpu_data_p;
     auto& catypes = ctx.catypes->cpu_data_p;
-    ctx.winv = std::make_unique<HostDeviceBuffer<real_t>>(ctx.n_atoms, true, ctx.run_gpu);
+    ctx.winv = std::make_unique<HostDeviceBuffer<double>>(ctx.n_atoms, true, ctx.run_gpu);
     auto* winv = ctx.winv->cpu_data_p;
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
         winv[ai] = 1 / catypes[atypes[ai].code - 1].m;
@@ -539,7 +539,7 @@ void init_water_sphere() {
 void init_wshells() {
     auto& ctx = Context::instance();
     int n_inshell;
-    real_t drs, router, ri, dr, Vshell, rshell;
+    double drs, router, ri, dr, Vshell, rshell;
     auto& bonds = ctx.bonds->cpu_data_p;
     auto& cbonds = ctx.cbonds->cpu_data_p;
     auto& angles = ctx.angles->cpu_data_p;
@@ -547,8 +547,8 @@ void init_wshells() {
     // Get water properties from the first water molecule.
     cbond_t cbondw = cbonds[bonds[ctx.n_atoms_solute].code - 1];
     cangle_t canglew = cangles[angles[ctx.n_atoms_solute].code - 1];
-    const real_t crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge;
-    const real_t mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2);
+    const double crg_ow = ctx.unified_ccharge(ctx.n_atoms_solute, 0).charge;
+    const double mu_w = -crg_ow * cbondw.b0 * cos(canglew.th0 / 2);
 
     drs = wpolr_layer / drouter;
 
@@ -605,7 +605,7 @@ void init_pshells() {
     auto& catypes = ctx.catypes->cpu_data_p;
     auto& coords_init = ctx.coords_init->cpu_data_p;
     auto* excluded = ctx.excluded->cpu_data_p;
-    real_t mass, r2, rin2;
+    double mass, r2, rin2;
 
     ctx.heavy = std::make_unique<HostDeviceBuffer<bool>>(ctx.n_atoms, true, ctx.run_gpu);
     auto* heavy = ctx.heavy->cpu_data_p;
@@ -655,7 +655,7 @@ static int mark_heavy_atoms(Context& ctx) {
     auto* heavy = ctx.heavy->cpu_data_p;
     int n_heavy = 0;
     for (int i = 0; i < ctx.n_atoms; i++) {
-        real_t mass = catypes[atypes[i].code - 1].m;
+        double mass = catypes[atypes[i].code - 1].m;
         if (mass < 4.0) {
             heavy[i] = false;
         } else {
@@ -681,7 +681,7 @@ void init_pshells_from_charge_groups() {
     auto& ctx = Context::instance();
     auto& coords_init = ctx.coords_init->cpu_data_p;
     auto* excluded = ctx.excluded->cpu_data_p;
-    real_t r2, rin2;
+    double r2, rin2;
     auto& charge_groups = ctx.charge_group_config;
     const bool use_switch_atom = charge_groups.iuse_switch_atom == 1;
 
@@ -697,9 +697,9 @@ void init_pshells_from_charge_groups() {
         const auto& charge_group = charge_groups.charge_groups[grp];
         int i = charge_group.iswitch - 1;
         if (heavy[i] && !excluded[i] && i < ctx.n_atoms_solute) {
-            real_t cx = coords_init[i].x;
-            real_t cy = coords_init[i].y;
-            real_t cz = coords_init[i].z;
+            double cx = coords_init[i].x;
+            double cy = coords_init[i].y;
+            double cz = coords_init[i].z;
             if (!use_switch_atom) {
                 cx = 0.0;
                 cy = 0.0;
@@ -710,7 +710,7 @@ void init_pshells_from_charge_groups() {
                     cy += coords_init[ai].y;
                     cz += coords_init[ai].z;
                 }
-                real_t inv_atoms = 1.0 / static_cast<real_t>(charge_group.atoms.size());
+                double inv_atoms = 1.0 / static_cast<double>(charge_group.atoms.size());
                 cx *= inv_atoms;
                 cy *= inv_atoms;
                 cz *= inv_atoms;
@@ -748,7 +748,7 @@ void init_shake() {
     int mol = 0;
     int shake;
     int n_solute_shake_constraints = 0;
-    real_t excl_shake = 0;
+    double excl_shake = 0;
     auto& bonds = ctx.bonds->cpu_data_p;
     auto& cbonds = ctx.cbonds->cpu_data_p;
 
@@ -808,10 +808,10 @@ void init_shake() {
     ctx.Ndegf = 3 * ctx.n_atoms - ctx.n_shake_constraints;
     ctx.Ndegfree = ctx.Ndegf - 3 * ctx.n_excluded + excl_shake;
 
-    const real_t Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints;
+    const double Ndegf_solvent = ctx.Ndegf - 3 * ctx.n_atoms_solute + n_solute_shake_constraints;
 
-    const real_t Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints);
-    const real_t Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent;
+    const double Ndegfree_solvent = ctx.Ndegfree - (ctx.n_shake_constraints - n_solute_shake_constraints);
+    const double Ndegfree_solute = ctx.Ndegfree - Ndegfree_solvent;
 
     printf("n_shake_constrains = %d, n_solute_shake_constraints = %d, excl_shake = %f\n", ctx.n_shake_constraints, n_solute_shake_constraints, excl_shake);
 
diff --git a/src/core/common/src/parse.cpp b/src/core/common/src/parse.cpp
index 1b45a7a6..98e859ae 100644
--- a/src/core/common/src/parse.cpp
+++ b/src/core/common/src/parse.cpp
@@ -132,7 +132,7 @@ void parse_md(const char* filename) {
 #ifdef VERBOSE
     printf("reading in %d lambdas (%s in file)\n", ctx.n_lambdas, file.buffer[k][1]);
 #endif
-    ctx.lambdas = std::make_unique<HostDeviceBuffer<real_t>>(ctx.n_lambdas, true, ctx.run_gpu);
+    ctx.lambdas = std::make_unique<HostDeviceBuffer<double>>(ctx.n_lambdas, true, ctx.run_gpu);
     auto *lambdas = ctx.lambdas->cpu_data_p;
     k++;
     for (int i = 0; i < ctx.n_lambdas; i++) {
diff --git a/src/core/cpu/include/cpu_angle_force.h b/src/core/cpu/include/cpu_angle_force.h
index ea4f5ef6..df2a3a64 100644
--- a/src/core/cpu/include/cpu_angle_force.h
+++ b/src/core/cpu/include/cpu_angle_force.h
@@ -1,5 +1,3 @@
 #pragma once
 
-#include "common/include/precision.h"
-
-real_t calc_angle_forces(int start, int end);
+double calc_angle_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_bond_force.h b/src/core/cpu/include/cpu_bond_force.h
index 32775c6e..6a2f7f73 100644
--- a/src/core/cpu/include/cpu_bond_force.h
+++ b/src/core/cpu/include/cpu_bond_force.h
@@ -1,5 +1,3 @@
 #pragma once
 
-#include "common/include/precision.h"
-
-real_t calc_bond_forces(int start, int end);
+double calc_bond_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_improper2_force.h b/src/core/cpu/include/cpu_improper2_force.h
index b6606e57..26d694aa 100644
--- a/src/core/cpu/include/cpu_improper2_force.h
+++ b/src/core/cpu/include/cpu_improper2_force.h
@@ -1,5 +1,3 @@
 #pragma once
 
-#include "common/include/precision.h"
-
-real_t calc_improper2_forces(int start, int end);
+double calc_improper2_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_torsion_force.h b/src/core/cpu/include/cpu_torsion_force.h
index 309bd505..19089318 100644
--- a/src/core/cpu/include/cpu_torsion_force.h
+++ b/src/core/cpu/include/cpu_torsion_force.h
@@ -1,5 +1,3 @@
 #pragma once
 
-#include "common/include/precision.h"
-
-real_t calc_torsion_forces(int start, int end);
+double calc_torsion_forces(int start, int end);
diff --git a/src/core/cpu/include/cpu_utils.h b/src/core/cpu/include/cpu_utils.h
index 352d6b3c..e7be4557 100644
--- a/src/core/cpu/include/cpu_utils.h
+++ b/src/core/cpu/include/cpu_utils.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "common/include/precision.h"
-
-real_t gauss(real_t mean, real_t sd);
-real_t to_degrees(real_t radians);
-real_t to_radians(real_t degrees);
+double gauss(double mean, double sd);
+double to_degrees(double radians);
+double to_radians(double degrees);
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp
index 1f19390f..ae600561 100644
--- a/src/core/cpu/src/cpu_angle_force.cpp
+++ b/src/core/cpu/src/cpu_angle_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-real_t calc_angle_forces(int start, int end) {
+double calc_angle_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
@@ -15,11 +15,11 @@ real_t calc_angle_forces(int start, int end) {
     coord_t rji, rjk;
     coord_t di, dk;
 
-    real_t bji2inv, bjk2inv, bjiinv, bjkinv;
+    double bji2inv, bjk2inv, bjiinv, bjkinv;
     cangle_t cangle;
-    real_t cos_th, th, dth, dv, f1;
-    real_t ener;
-    real_t angle = 0;
+    double cos_th, th, dth, dv, f1;
+    double ener;
+    double angle = 0;
 
     auto &angles = ctx.angles->cpu_data_p;
     auto &cangles = ctx.cangles->cpu_data_p;
diff --git a/src/core/cpu/src/cpu_bond_force.cpp b/src/core/cpu/src/cpu_bond_force.cpp
index 0ab4baff..2a539f90 100644
--- a/src/core/cpu/src/cpu_bond_force.cpp
+++ b/src/core/cpu/src/cpu_bond_force.cpp
@@ -4,7 +4,7 @@
 
 #include "context.h"
 
-real_t calc_bond_forces(int start, int end) {
+double calc_bond_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &bonds = ctx.bonds->cpu_data_p;
     auto &cbonds = ctx.cbonds->cpu_data_p;
@@ -13,8 +13,8 @@ real_t calc_bond_forces(int start, int end) {
     int aii, aji;
     coord_t ai, aj, dx;
     cbond_t cbond;
-    real_t dx2, dx1, ddx, ener, ampl;
-    real_t bond = 0;
+    double dx2, dx1, ddx, ener, ampl;
+    double bond = 0;
 
     for (int i = start; i < end; i++) {
         aii = bonds[i].ai - 1;
diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp
index fea7c724..6e4faa60 100644
--- a/src/core/cpu/src/cpu_improper2_force.cpp
+++ b/src/core/cpu/src/cpu_improper2_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-real_t calc_improper2_forces(int start, int end) {
+double calc_improper2_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &impropers = ctx.impropers->cpu_data_p;
     auto &cimpropers = ctx.cimpropers->cpu_data_p;
@@ -15,13 +15,13 @@ real_t calc_improper2_forces(int start, int end) {
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
-    real_t bj2inv, bk2inv, bjinv, bkinv;
-    real_t cos_phi, phi, arg, ener, dv, f1;
+    double bj2inv, bk2inv, bjinv, bkinv;
+    double cos_phi, phi, arg, ener, dv, f1;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
     improper_t imp;
     cimproper_t cimp;
-    real_t improper = 0;
+    double improper = 0;
 
     for (int i = start; i < end; i++) {
         imp = impropers[i];
diff --git a/src/core/cpu/src/cpu_leapfrog.cpp b/src/core/cpu/src/cpu_leapfrog.cpp
index 0927e414..9d1ff43a 100644
--- a/src/core/cpu/src/cpu_leapfrog.cpp
+++ b/src/core/cpu/src/cpu_leapfrog.cpp
@@ -11,8 +11,8 @@ void calc_leapfrog() {
     auto &velocities = ctx.velocities->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
     auto *xcoords = ctx.xcoords->cpu_data_p;
-    real_t mass_i;
-    real_t winv_i;
+    double mass_i;
+    double winv_i;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
         mass_i = catypes[atypes[i].code - 1].m;
diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
index cbeb11f5..390c67eb 100644
--- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
@@ -13,7 +13,7 @@ void calc_nonbonded_pp_forces() {
     auto &LJ_matrix = ctx.LJ_matrix->cpu_data_p;
     auto *excluded = ctx.excluded->cpu_data_p;
     bool bond14, bond23;
-    real_t scaling;
+    double scaling;
     coord_t da;
     real_t r2a, ra, r6a;
     real_t V_a, V_b;
@@ -67,8 +67,8 @@ void calc_nonbonded_pp_forces() {
             dvelocities[j].y += dva * da.y;
             dvelocities[j].z += dva * da.z;
 
-            ctx.E_nonbond_pp.Ucoul += static_cast<real_t>(Vela);
-            ctx.E_nonbond_pp.Uvdw += static_cast<real_t>(V_a - V_b);
+            ctx.E_nonbond_pp.Ucoul += static_cast<double>(Vela);
+            ctx.E_nonbond_pp.Uvdw += static_cast<double>(V_a - V_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
index 52c9242b..030c1290 100644
--- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
@@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() {
             dvelocities[atom_j].y += scale * dy;
             dvelocities[atom_j].z += scale * dz;
 
-            ctx.E_nonbond_pw.Ucoul += static_cast<real_t>(ecoul);
-            ctx.E_nonbond_pw.Uvdw += static_cast<real_t>(v_a - v_b);
+            ctx.E_nonbond_pw.Ucoul += static_cast<double>(ecoul);
+            ctx.E_nonbond_pw.Uvdw += static_cast<double>(v_a - v_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
index b0df677d..7a81a516 100644
--- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
@@ -18,7 +18,7 @@ void calc_nonbonded_qp_forces() {
     real_t r2, r;
     real_t ai_aii, aj_aii, ai_bii, aj_bii;
     bool bond23, bond14;
-    real_t scaling;
+    double scaling;
     real_t Vel, V_a, V_b, dv;
 
     for (int qi = 0; qi < ctx.n_qatoms; qi++) {
@@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() {
                 dvelocities[j].z += dv * da.z;
 
                 // Update Q totals
-                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<real_t>(Vel);
-                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<real_t>(V_a - V_b);
+                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<double>(Vel);
+                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<double>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
index 96462795..006a3c0e 100644
--- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
@@ -16,7 +16,7 @@ void calc_nonbonded_qq_forces() {
     auto *q_elscales = ctx.q_elscales->cpu_data_p;
     int ai, aj;
     real_t crg_i, crg_j;
-    real_t elscale, scaling;
+    double elscale, scaling;
     bool bond23, bond14;
     coord_t da;
     real_t r2a, ra, r6a;
@@ -81,8 +81,8 @@ void calc_nonbonded_qq_forces() {
                 dvelocities[aj].y += dva * da.y;
                 dvelocities[aj].z += dva * da.z;
 
-                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<real_t>(Vela);
-                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<real_t>(V_a - V_b);
+                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<double>(Vela);
+                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<double>(V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
index 1ab0b469..8d18bc55 100644
--- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
@@ -75,8 +75,8 @@ void calc_nonbonded_qw_forces() {
                 dvH1 -= r2H1 * VelH1 * lambda;
                 dvH2 -= r2H2 * VelH2 * lambda;
 
-                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<real_t>(VelO + VelH1 + VelH2);
-                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<real_t>(V_a - V_b);
+                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<double>(VelO + VelH1 + VelH2);
+                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<double>(V_a - V_b);
             }
 
             // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
index f6d2ac98..3be5e6f0 100644
--- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
@@ -59,8 +59,8 @@ void accumulate_pair_force(Context& ctx,
     dvelocities[atom_j].y += scale * dy;
     dvelocities[atom_j].z += scale * dz;
 
-    energy.Ucoul += static_cast<real_t>(ecoul);
-    energy.Uvdw += static_cast<real_t>(evdw);
+    energy.Ucoul += static_cast<double>(ecoul);
+    energy.Uvdw += static_cast<double>(evdw);
 }
 
 void calc_nonbonded_ww_forces() {
diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp
index 440d03d2..5116dbbb 100644
--- a/src/core/cpu/src/cpu_polx_water_force.cpp
+++ b/src/core/cpu/src/cpu_polx_water_force.cpp
@@ -13,12 +13,12 @@ void calc_polx_w_forces(int iteration) {
     auto *wshells = ctx.wshells->cpu_data_p;
 
     int wi, imin, jw, ii, iis, jmin;
-    real_t tmin;
+    double tmin;
     coord_t rmu, rcu, f1O, f1H1, f1H2, f2;
-    real_t rm, rc;
-    real_t cos_th;
-    real_t avtdum, arg, f0, dv;
-    real_t ener;
+    double rm, rc;
+    double cos_th;
+    double avtdum, arg, f0, dv;
+    double ener;
 
     for (int is = 0; is < ctx.n_shells; is++) {
         wshells[is].n_inshell = 0;
@@ -93,8 +93,8 @@ void calc_polx_w_forces(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            wshells[is].avtheta /= (real_t)itdis_update;
-            wshells[is].avn_inshell /= (real_t)itdis_update;
+            wshells[is].avtheta /= (double)itdis_update;
+            wshells[is].avn_inshell /= (double)itdis_update;
             wshells[is].theta_corr =
                 wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
@@ -113,7 +113,7 @@ void calc_polx_w_forces(int iteration) {
         avtdum = 0;
         for (int il = 0; il < wshells[is].n_inshell; il++) {
             ii = ctx.nsort[il][is];
-            arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell);
+            arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell);
             ctx.theta0[il] = acos(arg);
             ctx.theta0[il] = ctx.theta0[il] - 3 * sin(ctx.theta0[il]) * wshells[is].cstb / 2;
             if (ctx.theta0[il] < 0) {
@@ -189,7 +189,7 @@ void calc_polx_w_forces(int iteration) {
             dvelocities[wi + 2].z += f0 * f1H2.z;
         }
 
-        wshells[is].avtheta += avtdum / (real_t)wshells[is].n_inshell;
+        wshells[is].avtheta += avtdum / (double)wshells[is].n_inshell;
         wshells[is].avn_inshell += wshells[is].n_inshell;
     }
 }
diff --git a/src/core/cpu/src/cpu_pshell_force.cpp b/src/core/cpu/src/cpu_pshell_force.cpp
index a547f16d..9ff083cc 100644
--- a/src/core/cpu/src/cpu_pshell_force.cpp
+++ b/src/core/cpu/src/cpu_pshell_force.cpp
@@ -13,7 +13,7 @@ void calc_pshell_forces() {
     auto *shell = ctx.shell->cpu_data_p;
 
     coord_t dr;
-    real_t k, r2, ener;
+    double k, r2, ener;
 
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
         if (shell[i] || excluded[i]) {
diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp
index c92c904d..14aa802c 100644
--- a/src/core/cpu/src/cpu_q_angle_force.cpp
+++ b/src/core/cpu/src/cpu_q_angle_force.cpp
@@ -14,8 +14,8 @@ void calc_qangle_forces(int state) {
     int ic;
     int ai, aj, ak;
     coord_t rji, rjk;
-    real_t bji, bjk;
-    real_t cos_th, th, dth, ener, dv, f1;
+    double bji, bjk;
+    double cos_th, th, dth, ener, dv, f1;
     coord_t di, dk;
 
     for (int i = 0; i < ctx.n_qangles; i++) {
diff --git a/src/core/cpu/src/cpu_q_bond_force.cpp b/src/core/cpu/src/cpu_q_bond_force.cpp
index 6b924c69..5f2f7203 100644
--- a/src/core/cpu/src/cpu_q_bond_force.cpp
+++ b/src/core/cpu/src/cpu_q_bond_force.cpp
@@ -11,7 +11,7 @@ void calc_qbond_forces(int state) {
     auto *lambdas = ctx.lambdas->cpu_data_p;
     int ic;
     int ai, aj;
-    real_t b, db, ener, dv;
+    double b, db, ener, dv;
     coord_t rij;
 
     for (int i = 0; i < ctx.n_qbonds; i++) {
diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp
index 2be495b0..7b7fb271 100644
--- a/src/core/cpu/src/cpu_q_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_q_torsion_force.cpp
@@ -15,10 +15,10 @@ void calc_qtorsion_forces(int state) {
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    real_t bj2inv, bk2inv, bjinv, bkinv;
-    real_t bj, bk, cos_phi, phi;
-    real_t arg, dv, f1;
-    real_t ener;
+    double bj2inv, bk2inv, bjinv, bkinv;
+    double bj, bk, cos_phi, phi;
+    double arg, dv, f1;
+    double ener;
 
     for (int i = 0; i < ctx.n_qtorsions; i++) {
         ic = ctx.q_torsions[i + ctx.n_qtorsions * state].code;
diff --git a/src/core/cpu/src/cpu_radix_water_force.cpp b/src/core/cpu/src/cpu_radix_water_force.cpp
index a85af35c..a887ad31 100644
--- a/src/core/cpu/src/cpu_radix_water_force.cpp
+++ b/src/core/cpu/src/cpu_radix_water_force.cpp
@@ -10,9 +10,9 @@ void calc_radix_w_forces() {
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
 
-    real_t b, db, ener, dv, fexp;
+    double b, db, ener, dv, fexp;
     coord_t dr;
-    real_t shift;
+    double shift;
 
     if (ctx.md.radial_force != 0) {
         shift = sqrt(Boltz * ctx.Tfree / ctx.md.radial_force);
diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp
index c2b9ed50..84f593b0 100644
--- a/src/core/cpu/src/cpu_restrang_force.cpp
+++ b/src/core/cpu/src/cpu_restrang_force.cpp
@@ -15,8 +15,8 @@ void calc_restrang_forces() {
 
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
-    real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th;
-    real_t dth, dv, ener, f1;
+    double lambda, r2ij, r2jk, rij, rjk, cos_th, th;
+    double dth, dv, ener, f1;
 
     for (int ir = 0; ir < ctx.n_restrangs; ir++) {
         state = restrangs[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrdis_force.cpp b/src/core/cpu/src/cpu_restrdis_force.cpp
index 859481f3..c15cbef7 100644
--- a/src/core/cpu/src/cpu_restrdis_force.cpp
+++ b/src/core/cpu/src/cpu_restrdis_force.cpp
@@ -14,7 +14,7 @@ void calc_restrdis_forces() {
 
     int state, i, j;
     coord_t dr;
-    real_t lambda, b, db, dv, ener;
+    double lambda, b, db, dv, ener;
 
     for (int ir = 0; ir < ctx.n_restrdists; ir++) {
         state = restrdists[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrpos_force.cpp b/src/core/cpu/src/cpu_restrpos_force.cpp
index a3e8710d..6db044b4 100644
--- a/src/core/cpu/src/cpu_restrpos_force.cpp
+++ b/src/core/cpu/src/cpu_restrpos_force.cpp
@@ -14,7 +14,7 @@ void calc_restrpos_forces() {
 
     int state, i;
     coord_t dr;
-    real_t lambda, ener, x2, y2, z2;
+    double lambda, ener, x2, y2, z2;
 
     for (int ir = 0; ir < ctx.n_restrspos; ir++) {
         state = restrspos[ir].ipsi - 1;
diff --git a/src/core/cpu/src/cpu_restrseq_force.cpp b/src/core/cpu/src/cpu_restrseq_force.cpp
index f9ff9fd0..296762e8 100644
--- a/src/core/cpu/src/cpu_restrseq_force.cpp
+++ b/src/core/cpu/src/cpu_restrseq_force.cpp
@@ -13,9 +13,9 @@ void calc_restrseq_forces() {
     auto &restrseqs = ctx.restrseqs->cpu_data_p;
     auto *heavy = ctx.heavy->cpu_data_p;
 
-    real_t k, mass, totmass;
+    double k, mass, totmass;
     coord_t dr;
-    real_t r2, ener;
+    double r2, ener;
 
     for (int s = 0; s < ctx.n_restrseqs; s++) {
         k = restrseqs[s].k;
diff --git a/src/core/cpu/src/cpu_restrwall_force.cpp b/src/core/cpu/src/cpu_restrwall_force.cpp
index 7da6faa6..fd49749a 100644
--- a/src/core/cpu/src/cpu_restrwall_force.cpp
+++ b/src/core/cpu/src/cpu_restrwall_force.cpp
@@ -11,7 +11,7 @@ void calc_restrwall_forces() {
     auto &restrwalls = ctx.restrwalls->cpu_data_p;
     auto *heavy = ctx.heavy->cpu_data_p;
 
-    real_t k, b, db, ener, dv, fexp;
+    double k, b, db, ener, dv, fexp;
     coord_t dr;
 
     for (int ir = 0; ir < ctx.n_restrwalls; ir++) {
diff --git a/src/core/cpu/src/cpu_shake.cpp b/src/core/cpu/src/cpu_shake.cpp
index 91162c98..cb29a0f0 100644
--- a/src/core/cpu/src/cpu_shake.cpp
+++ b/src/core/cpu/src/cpu_shake.cpp
@@ -34,7 +34,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                     const int aj = shake_bond.aj - 1;
                     coord_t xij;
                     coord_t xxij;
-                    real_t xij2, diff, corr, scp;
+                    double xij2, diff, corr, scp;
 
                     xij.x = coords[ai].x - coords[aj].x;
                     xij.y = coords[ai].y - coords[aj].y;
@@ -75,7 +75,7 @@ int calc_shake_constraints(coord_t* coords, coord_t* xcoords) {
                 const int ai = shake_bonds[shake + i].ai - 1;
                 const int aj = shake_bonds[shake + i].aj - 1;
                 coord_t xxij;
-                real_t xxij2;
+                double xxij2;
 
                 xxij.x = xcoords[ai].x - xcoords[aj].x;
                 xxij.y = xcoords[ai].y - xcoords[aj].y;
@@ -125,11 +125,12 @@ void stop_cm_translation() {
     auto &atypes = ctx.atypes->cpu_data_p;
     auto &catypes = ctx.catypes->cpu_data_p;
     auto &velocities = ctx.velocities->cpu_data_p;
-    real_t total_mass = 0;
+    double total_mass = 0;
     coord_t vcm = {};
 
     for (int ai = 0; ai < ctx.n_atoms; ai++) {
-        const real_t rmass = catypes[atypes[ai].code - 1].m;
+        const double rmass = catypes[atypes[ai].code - 1].m;
         total_mass += rmass;
+        // Mass-weight every component: vcm accumulates sum(m * v), matching the x term.
         vcm.x += velocities[ai].x * rmass;
-        vcm.y += velocities[ai].y;
+        vcm.y += velocities[ai].y * rmass;
diff --git a/src/core/cpu/src/cpu_temperature.cpp b/src/core/cpu/src/cpu_temperature.cpp
index 537dec77..6b76139f 100644
--- a/src/core/cpu/src/cpu_temperature.cpp
+++ b/src/core/cpu/src/cpu_temperature.cpp
@@ -17,11 +17,11 @@ void calc_temperature() {
     auto *excluded = ctx.excluded->cpu_data_p;
     ctx.Temp = 0;
     ctx.Tfree = 0;
-    real_t Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
-    real_t Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
-    real_t Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms;
-    real_t ener;
-    real_t mass_i;
+    double Temp_solute = 0, Tfree_solute = 0, Texcl_solute = 0;
+    double Tfree_solvent = 0, Temp_solvent = 0, Texcl_solvent = 0;
+    double Ekinmax = 1000.0 * ctx.Ndegf * Boltz * ctx.md.temperature / 2.0 / ctx.n_atoms;
+    double ener;
+    double mass_i;
 
     ctx.Temp = 0;
     for (int i = 0; i < ctx.n_atoms_solute; i++) {
diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp
index 37a68298..4ebb44b2 100644
--- a/src/core/cpu/src/cpu_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_torsion_force.cpp
@@ -5,7 +5,7 @@
 #include "context.h"
 #include "cpu_utils.h"
 
-real_t calc_torsion_forces(int start, int end) {
+double calc_torsion_forces(int start, int end) {
     auto& ctx = Context::instance();
     auto &torsions = ctx.torsions->cpu_data_p;
     auto &ctorsions = ctx.ctorsions->cpu_data_p;
@@ -17,11 +17,11 @@ real_t calc_torsion_forces(int start, int end) {
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    real_t bj2inv, bk2inv, bjinv, bkinv;
-    real_t cos_phi, phi;
-    real_t arg, dv, f1;
-    real_t ener;
-    real_t torsion = 0;
+    double bj2inv, bk2inv, bjinv, bkinv;
+    double cos_phi, phi;
+    double arg, dv, f1;
+    double ener;
+    double torsion = 0;
 
     torsion_t t;
     ctorsion_t ctors;
diff --git a/src/core/cpu/src/utils.cpp b/src/core/cpu/src/utils.cpp
index 00c37e41..ed680aa3 100644
--- a/src/core/cpu/src/utils.cpp
+++ b/src/core/cpu/src/utils.cpp
@@ -1,25 +1,26 @@
 #include <math.h>
 #include <stdio.h>
+#include <stdlib.h>  // rand(), RAND_MAX
 
-#include "common/include/precision.h"
-
 // Get a value from a gaussian distributed random variable with
 // mean mean and standard deviation sd
-real_t gauss(real_t mean, real_t sd) {
-    real_t v1, v2, nd10;
+double gauss(double mean, double sd) {
+    double v1, v2, nd10;
 
-    v1 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. );
-    v2 = ( (real_t)(rand()) + 1. )/( (real_t)(RAND_MAX) + 1. );
+    v1 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
+    v2 = ( (double)(rand()) + 1. )/( (double)(RAND_MAX) + 1. );
     nd10 = cos(2 * M_PI * v2) * sqrt(-2. * log(v1));
 
     return sd * nd10 + mean;
 }
 
 
-real_t to_degrees(real_t radians) {
+// Convert an angle from radians to degrees.
+double to_degrees(double radians) {
     return radians * (180.0 / M_PI);
 }
 
-real_t to_radians(real_t degrees) {
+// Convert an angle from degrees to radians.
+double to_radians(double degrees) {
     return degrees * (M_PI / 180.0);
 }
diff --git a/src/core/cuda/include/cuda_angle_force.cuh b/src/core/cuda/include/cuda_angle_force.cuh
index 63ebb011..c2e00e15 100644
--- a/src/core/cuda/include/cuda_angle_force.cuh
+++ b/src/core/cuda/include/cuda_angle_force.cuh
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "common/include/precision.h"
-
 void init_angle_force_kernel_data();
-real_t calc_angle_forces_host(int start, int end);
+double calc_angle_forces_host(int start, int end);
 void cleanup_angle_force();
diff --git a/src/core/cuda/include/cuda_bond_force.cuh b/src/core/cuda/include/cuda_bond_force.cuh
index bddc873c..83961ed5 100644
--- a/src/core/cuda/include/cuda_bond_force.cuh
+++ b/src/core/cuda/include/cuda_bond_force.cuh
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "common/include/precision.h"
-
 void init_bond_force_kernel_data();
-real_t calc_bond_forces_host(int start, int end);
+double calc_bond_forces_host(int start, int end);
 void cleanup_bond_force();
diff --git a/src/core/cuda/include/cuda_improper2_force.cuh b/src/core/cuda/include/cuda_improper2_force.cuh
index 9e0a2cfd..cb0a9635 100644
--- a/src/core/cuda/include/cuda_improper2_force.cuh
+++ b/src/core/cuda/include/cuda_improper2_force.cuh
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "common/include/precision.h"
-
 void init_improper2_force_kernel_data();
-real_t calc_improper2_forces_host(int start, int end);
+double calc_improper2_forces_host(int start, int end);
 void cleanup_improper2_force();
diff --git a/src/core/cuda/include/cuda_nonbonded_force.cuh b/src/core/cuda/include/cuda_nonbonded_force.cuh
index ee227088..f1a9b252 100644
--- a/src/core/cuda/include/cuda_nonbonded_force.cuh
+++ b/src/core/cuda/include/cuda_nonbonded_force.cuh
@@ -1,12 +1,10 @@
 #pragma once
 
 #include <utility>
 
-#include "common/include/precision.h"
-
 void init_nonbonded_force_kernel_data();
 
-std::pair<real_t, real_t> calc_nonbonded_force_host(
+std::pair<double, double> calc_nonbonded_force_host(
     int nx, 
     int ny, 
     int* x_idx_list, 
@@ -18,7 +16,7 @@ std::pair<real_t, real_t> calc_nonbonded_force_host(
     const int* x_atypes_types, 
     const int* y_atypes_types,
     const bool disable_water_h_lj = false,
-    const real_t lambda = 1.0
+    const double lambda = 1.0
 );
 
 void cleanup_nonbonded_force();
diff --git a/src/core/cuda/include/cuda_torsion_force.cuh b/src/core/cuda/include/cuda_torsion_force.cuh
index cac7e191..50315181 100644
--- a/src/core/cuda/include/cuda_torsion_force.cuh
+++ b/src/core/cuda/include/cuda_torsion_force.cuh
@@ -1,8 +1,6 @@
 #pragma once
 
-#include "common/include/precision.h"
-
 void init_torsion_force_kernel_data();
-real_t calc_torsion_forces_host(int start, int end);
+double calc_torsion_forces_host(int start, int end);
 
 void cleanup_torsion_force();
diff --git a/src/core/cuda/include/cuda_utility.cuh b/src/core/cuda/include/cuda_utility.cuh
index 9cbcefd5..36767be0 100644
--- a/src/core/cuda/include/cuda_utility.cuh
+++ b/src/core/cuda/include/cuda_utility.cuh
@@ -3,8 +3,7 @@
 #include <math.h>
 
 #include "common/include/cuda_runtime_utility.h"
-#include "common/include/precision.h"
 
-__device__ inline real_t to_radians_device(real_t degrees) {
+__device__ inline double to_radians_device(double degrees) {
     return degrees * (M_PI / 180.0);
 }
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index 445bed51..f20b039a 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaAngleForce {
 bool is_initialized = false;
-real_t* d_energy_sum;
+double* d_energy_sum;
 }  // namespace CudaAngleForce
 
-__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, real_t* energy_sum) {
+__global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* energy_sum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (idx >= end) return;
 
@@ -24,22 +24,21 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     coord_t rji = {ri.x - rj.x, ri.y - rj.y, ri.z - rj.z};
     coord_t rjk = {rk.x - rj.x, rk.y - rj.y, rk.z - rj.z};
 
-    real_t rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z);
-    real_t rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z);
+    double rji_length = sqrt(rji.x * rji.x + rji.y * rji.y + rji.z * rji.z);
+    double rjk_length = sqrt(rjk.x * rjk.x + rjk.y * rjk.y + rjk.z * rjk.z);
 
-    real_t cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length);
+    double cos_theta = (rji.x * rjk.x + rji.y * rjk.y + rji.z * rjk.z) / (rji_length * rjk_length);
 
-    cos_theta = cos_theta > static_cast<real_t>(1.0) ? static_cast<real_t>(1.0) : cos_theta;
-    cos_theta = cos_theta < static_cast<real_t>(-1.0) ? static_cast<real_t>(-1.0) : cos_theta;
-    real_t theta = acos(cos_theta);
+    cos_theta = fmax(fmin(cos_theta, 1.0), -1.0);  // Clamp value to avoid NaNs
+    double theta = acos(cos_theta);
 
-    real_t dtheta = theta - to_radians_device(cang.th0);
-    real_t energy = 0.5 * cang.kth * dtheta * dtheta;
+    double dtheta = theta - to_radians_device(cang.th0);
+    double energy = 0.5 * cang.kth * dtheta * dtheta;
 
     // calculate force magnitude
-    real_t dv = cang.kth * dtheta;
+    double dv = cang.kth * dtheta;
 
-    real_t f1 = sin(theta);
+    double f1 = sin(theta);
     if (fabs(f1) < k_singular_sin_epsilon) {
         f1 = -1.0 / k_singular_sin_epsilon;
     } else {
@@ -71,7 +70,7 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     atomicAdd(&dvelocities[j].z, -dv * (di.z + dk.z));
 }
 
-real_t calc_angle_forces_host(int start, int end) {
+double calc_angle_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaAngleForce;
@@ -86,8 +85,8 @@ real_t calc_angle_forces_host(int start, int end) {
     // todo: now have to do that, after moving all to CudaContext, can remove it
     // ctx.sync_all_to_device();
 
-    real_t h_energy_sum = 0.0;
-    cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(real_t), cudaMemcpyHostToDevice);
+    double h_energy_sum = 0.0;
+    cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice);
 
     // launch kernel
     calc_angle_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_angles, d_coords, d_cangles, d_dvelocities, d_energy_sum);
@@ -95,14 +94,14 @@ real_t calc_angle_forces_host(int start, int end) {
 
     // todo: Now have to do that, after moving all to CudaContext, can remove it
     // copy results back to host
-    cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
     return h_energy_sum;
 }
 
 void init_angle_force_kernel_data() {
     using namespace CudaAngleForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_bond_force.cu b/src/core/cuda/src/cuda_bond_force.cu
index 476d7209..9b31a660 100644
--- a/src/core/cuda/src/cuda_bond_force.cu
+++ b/src/core/cuda/src/cuda_bond_force.cu
@@ -3,9 +3,9 @@
 #include "cuda_utility.cuh"
 namespace CudaBondForce {
 bool is_initialized = false;
-real_t* d_energy_sum;
+double* d_energy_sum;
 }  // namespace CudaBondForce
-__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, real_t* energy_sum) {
+__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (idx >= end) return;
     bond_t bond = bonds[idx];
@@ -13,18 +13,18 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord
     coord_t rj = coords[bond.aj - 1];
     cbond_t cbond = cbonds[bond.code - 1];
 
-    real_t dx = rj.x - ri.x;
-    real_t dy = rj.y - ri.y;
-    real_t dz = rj.z - ri.z;
-    real_t r = sqrt(dx * dx + dy * dy + dz * dz);
+    double dx = rj.x - ri.x;
+    double dy = rj.y - ri.y;
+    double dz = rj.z - ri.z;
+    double r = sqrt(dx * dx + dy * dy + dz * dz);
 
-    real_t dr = r - cbond.b0;
-    real_t energy = 0.5 * cbond.kb * dr * dr;
+    double dr = r - cbond.b0;
+    double energy = 0.5 * cbond.kb * dr * dr;
 
     atomicAdd(energy_sum, energy);
 
     // update forces
-    real_t f = cbond.kb * dr / r;
+    double f = cbond.kb * dr / r;
     atomicAdd(&dvelocities[bond.aj - 1].x, f * dx);
     atomicAdd(&dvelocities[bond.aj - 1].y, f * dy);
     atomicAdd(&dvelocities[bond.aj - 1].z, f * dz);
@@ -33,15 +33,15 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord
     atomicAdd(&dvelocities[bond.ai - 1].z, -f * dz);
 }
 
-real_t calc_bond_forces_host(int start, int end) {
+double calc_bond_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaBondForce;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    real_t energy = 0.0;
-    cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice);
+    double energy = 0.0;
+    cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     bond_t* d_bonds = host_ctx.bonds->gpu_data_p;
@@ -51,7 +51,7 @@ real_t calc_bond_forces_host(int start, int end) {
 
     calc_bond_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
 
     return energy;
 }
@@ -59,7 +59,7 @@ real_t calc_bond_forces_host(int start, int end) {
 void init_bond_force_kernel_data() {
     using namespace CudaBondForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index f0f790db..dd7d91aa 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaImproper2Force {
 bool is_initialized = false;
-real_t* d_energy_sum;
+double* d_energy_sum;
 }  // namespace CudaImproper2Force
 
-__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) {
+__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) {
     int i = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (i >= end) return;
 
@@ -15,8 +15,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
 
     coord_t ai, aj, ak, al;
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
-    real_t bj2inv, bk2inv, bjinv, bkinv;
-    real_t cos_phi, phi, arg, ener, dv, f1;
+    double bj2inv, bk2inv, bjinv, bkinv;
+    double cos_phi, phi, arg, ener, dv, f1;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
     improper_t imp;
@@ -124,15 +124,15 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
     atomicAdd(&dvelocities[ali].z, dv * dpl.z);
 }
 
-real_t calc_improper2_forces_host(int start, int end) {
+double calc_improper2_forces_host(int start, int end) {
     int N = end - start;
     if (N <= 0) return 0.0;
     using namespace CudaImproper2Force;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    real_t energy = 0.0;
-    cudaMemcpy(d_energy_sum, &energy, sizeof(real_t), cudaMemcpyHostToDevice);
+    double energy = 0.0;
+    cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     coord_t* d_coords = host_ctx.coords->gpu_data_p;
@@ -142,14 +142,14 @@ real_t calc_improper2_forces_host(int start, int end) {
 
     calc_improper2_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&energy, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
     return energy;
 }
 
 void init_improper2_force_kernel_data() {
     using namespace CudaImproper2Force;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu
index 2ac8245a..1e010f7e 100644
--- a/src/core/cuda/src/cuda_leapfrog.cu
+++ b/src/core/cuda/src/cuda_leapfrog.cu
@@ -18,20 +18,20 @@ __global__ void calc_leapfrog_kernel(
     coord_t* xcoords,
     int n_atoms,
     int n_atoms_solute,
-    real_t Tscale_solute,
-    real_t Tscale_solvent,
-    real_t dt) {
+    double Tscale_solute,
+    double Tscale_solvent,
+    double dt) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
     int i = idx;
 
     // Kernel implementation goes here
-    real_t mass_i, winv_i;
+    double mass_i, winv_i;
 
     mass_i = catypes[atypes[i].code - 1].m;
 
     winv_i = 1 / mass_i;
-    real_t scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent;
+    double scale = (i < n_atoms_solute) ? Tscale_solute : Tscale_solvent;
     velocities[i].x = (velocities[i].x - dvelocities[i].x * dt * winv_i) * scale;
     velocities[i].y = (velocities[i].y - dvelocities[i].y * dt * winv_i) * scale;
     velocities[i].z = (velocities[i].z - dvelocities[i].z * dt * winv_i) * scale;
@@ -50,7 +50,7 @@ __global__ void update_velocities_from_positions_kernel(
     const coord_t* coords,
     const coord_t* xcoords,
     int n_atoms,
-    real_t dt) {
+    double dt) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
 
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index f925fe6f..78c4bc91 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -9,8 +9,8 @@ bool is_initialized = false;
 constexpr int kNonbonded14ModeCount = 3;
 
 int* d_atom_to_qi = nullptr;
-real_t* d_evdw_totals = nullptr;
-real_t* d_ecoul_totals = nullptr;
+double* d_evdw_totals = nullptr;
+double* d_ecoul_totals = nullptr;
 
 __device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) {
 #ifdef QDYN_SPFP
@@ -96,13 +96,13 @@ __global__ void calc_nonbonded_14_force_kernel(
     const catype_t* unified_catypes,
     const coord_t* d_coords,
     dvel_t* d_dvelocities,
-    real_t* evdw_totals,
-    real_t* ecoul_totals,
+    double* evdw_totals,
+    double* ecoul_totals,
     bool include_pp,
     int state,
     int n_atoms,
     int n_qatoms,
-    real_t lambda) {
+    double lambda) {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_pairs) return;
 
@@ -166,14 +166,14 @@ __global__ void calc_nonbonded_14_force_kernel(
 
 namespace {
 struct Nonbonded14EnergyBuckets {
-    real_t evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
-    real_t ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
+    double evdw[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
+    double ecoul[CudaNonbonded14Force::kNonbonded14ModeCount] = {};
 };
 }
 
 static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
     int state,
-    real_t lambda,
+    double lambda,
     bool include_pp) {
     using namespace CudaNonbonded14Force;
 
@@ -182,8 +182,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
     Nonbonded14EnergyBuckets energies = {};
     if (n_ngbrs_14 == 0) return energies;
 
-    cudaMemset(d_ecoul_totals, 0, sizeof(real_t) * kNonbonded14ModeCount);
-    cudaMemset(d_evdw_totals, 0, sizeof(real_t) * kNonbonded14ModeCount);
+    cudaMemset(d_ecoul_totals, 0, sizeof(double) * kNonbonded14ModeCount);
+    cudaMemset(d_evdw_totals, 0, sizeof(double) * kNonbonded14ModeCount);
 
     const int block_size = 256;
     const int num_blocks = (n_ngbrs_14 + block_size - 1) / block_size;
@@ -208,8 +208,8 @@ static Nonbonded14EnergyBuckets calc_nonbonded_14_force_state_host(
 
     cudaDeviceSynchronize();
 
-    cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
-    cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
+    cudaMemcpy(energies.evdw, d_evdw_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
+    cudaMemcpy(energies.ecoul, d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount, cudaMemcpyDeviceToHost);
 
     return energies;
 }
@@ -221,7 +221,7 @@ void calc_nonbonded_14_forces_host() {
     if (host.n_ngbrs14 == 0) return;
 
     for (int state = 0; state < host.n_lambdas; state++) {
-        const real_t lambda = lambdas[state];
+        const double lambda = lambdas[state];
         const bool include_pp = (state == 0);
         Nonbonded14EnergyBuckets energies = calc_nonbonded_14_force_state_host(state, lambda, include_pp);
 
@@ -248,8 +248,8 @@ void init_nonbonded_14_force_kernel_data() {
     check_cudaMalloc((void**)&d_atom_to_qi, sizeof(int) * host.atom_to_qi.size());
     check_cuda(cudaMemcpy(d_atom_to_qi, host.atom_to_qi.data(), sizeof(int) * host.atom_to_qi.size(), cudaMemcpyHostToDevice));
 
-    check_cudaMalloc((void**)&d_evdw_totals, sizeof(real_t) * kNonbonded14ModeCount);
-    check_cudaMalloc((void**)&d_ecoul_totals, sizeof(real_t) * kNonbonded14ModeCount);
+    check_cudaMalloc((void**)&d_evdw_totals, sizeof(double) * kNonbonded14ModeCount);
+    check_cudaMalloc((void**)&d_ecoul_totals, sizeof(double) * kNonbonded14ModeCount);
 
     is_initialized = true;
 }
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index d7f0719c..32b4077a 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -7,7 +7,7 @@
 
 namespace CudaNonbondedForce {
 bool is_initialized = false;
-real_t *d_evdw_total, *d_ecoul_total;
+double *d_evdw_total, *d_ecoul_total;
 
 template <typename WorkT>
 struct nonbond_vec_t {
@@ -20,11 +20,9 @@ __device__ __forceinline__ float nonbond_rsqrt(float value) {
     return rsqrtf(value);
 }
 
-#ifndef QDYN_SPFP
 __device__ __forceinline__ double nonbond_rsqrt(double value) {
     return rsqrt(value);
 }
-#endif
 
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
@@ -41,7 +39,6 @@ __device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffff
     return __shfl_sync(mask, v, srcLane);
 }
 
-#ifndef QDYN_SPFP
 template <>
 __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) {
     int2 a = *reinterpret_cast<int2*>(&v);
@@ -49,7 +46,6 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas
     a.y = __shfl_sync(mask, a.y, srcLane);
     return *reinterpret_cast<double*>(&a);
 }
-#endif
 
 __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
     v.x = shfl_value(v.x, srcLane, mask);
@@ -80,8 +76,8 @@ __device__ void calculate_unforce_bound(
     const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
     const WorkT r2 = r * r;
     const WorkT r6 = r2 * r2 * r2;
-    // real_t v_a = r6 * r6;
-    // real_t v_b = r6;
+    // double v_a = r6 * r6;
+    // double v_b = r6;
     // ecoul = r;
     // evdw = v_a - v_b;
     // dv = r2 * (-ecoul - v_a + v_b);
@@ -120,8 +116,8 @@ __global__ void calc_nonbonded_force_kernel(
 
     dvel_t* d_dvelocities,
 
-    real_t* evdw_tot,
-    real_t* ecoul_tot,
+    double* evdw_tot,
+    double* ecoul_tot,
 
     bool symmetric,
 
@@ -134,7 +130,7 @@ __global__ void calc_nonbonded_force_kernel(
     const int n_catype_types,
     const int zero_catype_type,
     const int n_qelscales,
-    const real_t lambda,
+    const double lambda,
     const q_elscale_t* d_qelscales  // todo: Now doesn't use it. Should optimize it later
 
 ) {
@@ -184,8 +180,8 @@ __global__ void calc_nonbonded_force_kernel(
     nonbond_vec_t<WorkT> x_force = {0.0, 0.0, 0.0};
     nonbond_vec_t<WorkT> y_force = {0.0, 0.0, 0.0};
 
-    real_t evdw_sum = 0.0;
-    real_t ecoul_sum = 0.0;
+    double evdw_sum = 0.0;
+    double ecoul_sum = 0.0;
 
     const unsigned mask = 0xffffffffu;
 
@@ -311,7 +307,7 @@ __global__ void calc_nonbonded_force_kernel(
 
 }  // namespace CudaNonbondedForce
 
-std::pair<real_t, real_t> calc_nonbonded_force_host(
+std::pair<double, double> calc_nonbonded_force_host(
     int nx,
     int ny,
     int* x_idx_list,
@@ -322,7 +318,7 @@ std::pair<real_t, real_t> calc_nonbonded_force_host(
     const int* y_charges_types,
     const int* x_atypes_types,
     const int* y_atypes_types,
-    const bool disable_water_h_lj, const real_t lambda) {
+    const bool disable_water_h_lj, const double lambda) {
     using namespace CudaNonbondedForce;
     Context& host = Context::instance();
     const int thread_num = 256;
@@ -338,8 +334,8 @@ std::pair<real_t, real_t> calc_nonbonded_force_host(
 
     dim3 grid = dim3(grid_sz);
 
-    cudaMemset(d_ecoul_total, 0, sizeof(real_t));
-    cudaMemset(d_evdw_total, 0, sizeof(real_t));
+    cudaMemset(d_ecoul_total, 0, sizeof(double));
+    cudaMemset(d_evdw_total, 0, sizeof(double));
 
     auto launch_kernel = [&](auto work_tag) {
         using WorkT = decltype(work_tag);
@@ -377,9 +373,9 @@ std::pair<real_t, real_t> calc_nonbonded_force_host(
 
     cudaDeviceSynchronize();
 
-    real_t evdw_tot = 0, ecoul_tot = 0;
-    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double evdw_tot = 0, ecoul_tot = 0;
+    cudaMemcpy(&evdw_tot, d_evdw_total, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ecoul_tot, d_ecoul_total, sizeof(double), cudaMemcpyDeviceToHost);
 
     return {evdw_tot, ecoul_tot};
 }
@@ -387,8 +383,8 @@ std::pair<real_t, real_t> calc_nonbonded_force_host(
 void init_nonbonded_force_kernel_data() {
     using namespace CudaNonbondedForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_evdw_total, sizeof(real_t));
-        check_cudaMalloc((void**)&d_ecoul_total, sizeof(real_t));
+        check_cudaMalloc((void**)&d_evdw_total, sizeof(double));
+        check_cudaMalloc((void**)&d_ecoul_total, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index bdb35608..7be0656f 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -14,11 +14,11 @@ int* water_shell = nullptr;
 int* water_rank = nullptr;
 int* polx_list_sh = nullptr;  // use 1d array to simulate 2d array
 
-real_t* d_energy;
+double* d_energy;
 int* d_list_sh = nullptr;
-real_t* d_theta = nullptr;
-real_t* d_theta0 = nullptr;
-real_t* d_tdum = nullptr;
+double* d_theta = nullptr;
+double* d_theta0 = nullptr;
+double* d_tdum = nullptr;
 int* d_water_shell = nullptr;
 int* d_water_rank = nullptr;
 
@@ -27,15 +27,15 @@ int* d_water_rank = nullptr;
 __global__ void calc_polx_theta_and_shells(
     int n_waters, int n_shells, int n_atoms_solute,
     coord_t* coords, topo_t topo, shell_t* wshells, int* list_sh,
-    real_t* theta, real_t* theta0, real_t* tdum) {
+    double* theta, double* theta0, double* tdum) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_waters) return;
     int i = idx;
 
     int wi, iis;
     coord_t rmu, rcu;
-    real_t rm, rc;
-    real_t cos_th;
+    double rm, rc;
+    double cos_th;
 
     theta[i] = 0;
     theta0[i] = 0;
@@ -81,7 +81,7 @@ __global__ void calc_polx_theta_and_shells(
 __global__ void calc_polx_water_forces_kernel(
     int n_waters, int n_atoms_solute, shell_t* wshells,
     coord_t* coords, dvel_t* dvelocities, topo_t topo,
-    real_t* theta, md_t md, real_t* energy,
+    double* theta, md_t md, double* energy,
     int* water_rank, int* water_shell) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_waters) return;
@@ -92,21 +92,21 @@ __global__ void calc_polx_water_forces_kernel(
 
     int wi, ii;
     coord_t rmu, rcu, f1O, f1H1, f1H2, f2;
-    real_t rm, rc;
-    real_t cos_th;
-    real_t avtdum, arg, f0, dv;
-    real_t ener;
+    double rm, rc;
+    double cos_th;
+    double avtdum, arg, f0, dv;
+    double ener;
 
     avtdum = 0;
     ii = idx;
-    arg = 1 + ((1 - 2 * (real_t)(il + 1)) / (real_t)wshells[is].n_inshell);
-    real_t theta_val = acos(arg);
+    arg = 1 + ((1 - 2 * (double)(il + 1)) / (double)wshells[is].n_inshell);
+    double theta_val = acos(arg);
     theta_val = theta_val - 3 * sin(theta_val) * wshells[is].cstb / 2;
     if (theta_val < 0) theta_val = 0;
     if (theta_val > M_PI) theta_val = M_PI;
 
     avtdum += theta[ii];
-    const real_t dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
+    const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
     ener = .5 * md.polarisation_force * dtheta * dtheta;
     // E_restraint.Upolx += ener;
     atomicAdd(energy, ener);
@@ -164,7 +164,7 @@ __global__ void calc_polx_water_forces_kernel(
     atomicAdd(&dvelocities[wi + 2].y, f0 * (f1H2.y));
     atomicAdd(&dvelocities[wi + 2].z, f0 * (f1H2.z));
 
-    atomicAdd(&wshells[is].avtheta, avtdum / (real_t)wshells[is].n_inshell);
+    atomicAdd(&wshells[is].avtheta, avtdum / (double)wshells[is].n_inshell);
     atomicAdd(&wshells[is].avn_inshell, wshells[is].n_inshell);
 }
 
@@ -174,7 +174,7 @@ void sort_waters() {
     auto *wshells = ctx.wshells->cpu_data_p;
 
     int imin, jmin, jw;
-    real_t tmin;
+    double tmin;
     // Sort the waters according to theta
     for (int is = 0; is < ctx.n_shells; is++) {
         imin = 0;
@@ -224,7 +224,7 @@ void calc_polx_water_forces_host(int iteration) {
 
     // todo: sort in cpu now..
     ctx.wshells->download();
-    cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(ctx.tdum.data(), d_tdum, ctx.n_waters * sizeof(double), cudaMemcpyDeviceToHost);
     cudaMemcpy(polx_list_sh, d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int), cudaMemcpyDeviceToHost);
 
     // Reset per-water metadata; only waters placed in shells will be overwritten in sort_waters().
@@ -244,8 +244,8 @@ void calc_polx_water_forces_host(int iteration) {
     if (iteration != 0 && iteration % itdis_update == 0) {
         for (int is = 0; is < ctx.n_shells; is++) {
             printf("SHELL %d\n", is);
-            wshells[is].avtheta /= (real_t)itdis_update;
-            wshells[is].avn_inshell /= (real_t)itdis_update;
+            wshells[is].avtheta /= (double)itdis_update;
+            wshells[is].avn_inshell /= (double)itdis_update;
             wshells[is].theta_corr = wshells[is].theta_corr + wshells[is].avtheta - acos(wshells[is].cstb);
             printf("average theta = %f, average in shell = %f, theta_corr = %f\n",
                    wshells[is].avtheta * 180 / M_PI, wshells[is].avn_inshell, wshells[is].theta_corr * 180 / M_PI);
@@ -256,12 +256,12 @@ void calc_polx_water_forces_host(int iteration) {
     }
 
     // Calculate energy and force
-    cudaMemset(d_energy, 0, sizeof(real_t));
+    cudaMemset(d_energy, 0, sizeof(double));
     calc_polx_water_forces_kernel<<<numBlocks, blockSize>>>(
         ctx.n_waters, ctx.n_atoms_solute, d_wshells, d_coords, d_dvelocities, ctx.topo,
         d_theta, ctx.md, d_energy, d_water_rank, d_water_shell);
-    real_t energy;
-    cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double energy;
+    cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost);
     ctx.E_restraint.Upolx += energy;
     ctx.wshells->download();
     // Copy back forces for all atoms (solute + solvent); water forces were being dropped.
@@ -275,11 +275,11 @@ void init_polx_water_force_kernel_data() {
         water_shell = new int[ctx.n_waters];
         polx_list_sh = new int[ctx.n_max_inshell * ctx.n_shells];
 
-        check_cudaMalloc((void**)&d_energy, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy, sizeof(double));
         check_cudaMalloc((void**)&d_list_sh, ctx.n_max_inshell * ctx.n_shells * sizeof(int));
-        check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(real_t));
-        check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(real_t));
-        check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(real_t));
+        check_cudaMalloc((void**)&d_theta, ctx.n_waters * sizeof(double));
+        check_cudaMalloc((void**)&d_theta0, ctx.n_waters * sizeof(double));
+        check_cudaMalloc((void**)&d_tdum, ctx.n_waters * sizeof(double));
         check_cudaMalloc((void**)&d_water_rank, ctx.n_waters * sizeof(int));
         check_cudaMalloc((void**)&d_water_shell, ctx.n_waters * sizeof(int));
 
diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu
index b6ef257e..5221cb9e 100644
--- a/src/core/cuda/src/cuda_pshell_force.cu
+++ b/src/core/cuda/src/cuda_pshell_force.cu
@@ -5,8 +5,8 @@
 #include <iostream>
 namespace CudaPshellForce {
 bool is_initialized = false;
-real_t* d_ufix_energy;
-real_t* d_ushell_energy;
+double* d_ufix_energy;
+double* d_ushell_energy;
 
 }  // namespace CudaPshellForce
 __global__ void calc_pshell_force_kernel(
@@ -15,14 +15,14 @@ __global__ void calc_pshell_force_kernel(
     bool* excluded,
     coord_t* coords,
     coord_t* coords_init,
-    real_t* ufix_energy,
-    real_t* ushell_energy,
+    double* ufix_energy,
+    double* ushell_energy,
     dvel_t* dvelocities) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= n_atoms_solute) return;
 
     coord_t dr;
-    real_t k, r2, ener;
+    double k, r2, ener;
 
     if (shell[i] || excluded[i]) {
         // printf("i = %d excluded = %s shell = %s\n", i, excluded[i] ? "True" : "False", shell[i] ? "True" : "False");
@@ -57,8 +57,8 @@ void calc_pshell_forces_host() {
     auto d_coords_init = host.coords_init->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
 
-    cudaMemset(d_ufix_energy, 0, sizeof(real_t));
-    cudaMemset(d_ushell_energy, 0, sizeof(real_t));
+    cudaMemset(d_ufix_energy, 0, sizeof(double));
+    cudaMemset(d_ushell_energy, 0, sizeof(double));
 
     int blockSize = 256;
     int numBlocks = (host.n_atoms_solute + blockSize - 1) / blockSize;
@@ -72,10 +72,10 @@ void calc_pshell_forces_host() {
         d_ushell_energy,
         d_dvelocities);
     cudaDeviceSynchronize();
-    real_t ufix_energy;
-    real_t ushell_energy;
-    cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double ufix_energy;
+    double ushell_energy;
+    cudaMemcpy(&ufix_energy, d_ufix_energy, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost);
 
     host.E_restraint.Ufix += ufix_energy;
     host.E_restraint.Ushell += ushell_energy;
@@ -85,8 +85,8 @@ void calc_pshell_forces_host() {
 void init_pshell_force_kernel_data() {
     using namespace CudaPshellForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_ufix_energy, sizeof(real_t));
-        check_cudaMalloc((void**)&d_ushell_energy, sizeof(real_t));
+        check_cudaMalloc((void**)&d_ufix_energy, sizeof(double));
+        check_cudaMalloc((void**)&d_ushell_energy, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu
index 26c8c94f..f037e9db 100644
--- a/src/core/cuda/src/cuda_radix_water_force.cu
+++ b/src/core/cuda/src/cuda_radix_water_force.cu
@@ -6,20 +6,20 @@
 #include "cuda/include/cuda_utility.cuh"
 namespace CudaRadixWaterForce {
 bool is_initialized = false;
-real_t* d_energy;
+double* d_energy;
 }  // namespace CudaRadixWaterForce
 
 __global__ void calc_radix_water_forces_kernel(
     coord_t* coords,
-    real_t shift,
+    double shift,
     int n_atoms_solute,
     int n_atoms,
     topo_t topo,
     md_t md,
-    real_t Dwmz,
-    real_t awmz,
+    double Dwmz,
+    double awmz,
     dvel_t* dvelocities,
-    real_t* energy) {
+    double* energy) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     i = n_atoms_solute + i * 3;  // Process only oxygen atoms of water molecules
     if (i >= n_atoms) return;
@@ -29,16 +29,16 @@ __global__ void calc_radix_water_forces_kernel(
     dr.x = coords[i].x - topo.solvent_center.x;
     dr.y = coords[i].y - topo.solvent_center.y;
     dr.z = coords[i].z - topo.solvent_center.z;
-    real_t b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
-    real_t db = b - (topo.solvent_radius - shift);
+    double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
+    double db = b - (topo.solvent_radius - shift);
 
-    real_t ener, dv;
+    double ener, dv;
     if (db > 0) {
         ener = 0.5 * md.radial_force * db * db - Dwmz;
         dv = md.radial_force * db / b;
     } else {
         if (b > 0.0) {
-            real_t fexp = exp(awmz * db);
+            double fexp = exp(awmz * db);
             ener = Dwmz * (fexp * fexp - 2 * fexp);
             dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b;
         } else {
@@ -70,16 +70,16 @@ void calc_radix_water_forces_host() {
 
     auto d_coords = host.coords->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
-    check_cuda(cudaMemset(d_energy, 0, sizeof(real_t)));
+    check_cuda(cudaMemset(d_energy, 0, sizeof(double)));
 
-    real_t shift;
+    double shift;
     if (host.md.radial_force != 0) {
         shift = sqrt(Boltz * host.Tfree / host.md.radial_force);
     } else {
         shift = 0;
     }
 
-    real_t energy = 0.0;
+    double energy = 0.0;
     calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords,
                                                              shift,
                                                              host.n_atoms_solute,
@@ -91,14 +91,14 @@ void calc_radix_water_forces_host() {
                                                              d_dvelocities,
                                                              d_energy);
     check_cuda(cudaDeviceSynchronize());
-    check_cuda(cudaMemcpy(&energy, d_energy, sizeof(real_t), cudaMemcpyDeviceToHost));
+    check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost));
     host.E_restraint.Uradx += energy;
 }
 
 void init_radix_water_force_kernel_data() {
     using namespace CudaRadixWaterForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index e32872b7..567a78df 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -3,26 +3,26 @@
 #include "common/include/context.h"
 namespace CudaRestrangForce {
 bool is_initialized = false;
-real_t* d_E_restraint;
+double* d_E_restraint;
 }  // namespace CudaRestrangForce
 
 __global__ void calc_restrang_force_kernel(
     restrang_t* restrangs,
     int n_restrangs,
     coord_t* coords,
-    real_t* lambdas,
+    double* lambdas,
     int n_lambdas,
     dvel_t* dvelocities,
     E_restraint_t* EQ_restraint,
-    real_t* E_restraint) {
+    double* E_restraint) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrangs) return;
     int ir = idx;
 
     int state, i, j, k;
     coord_t dr, dr2, di, dk;
-    real_t lambda, r2ij, r2jk, rij, rjk, cos_th, th;
-    real_t dth, dv, ener, f1;
+    double lambda, r2ij, r2jk, rij, rjk, cos_th, th;
+    double dth, dv, ener, f1;
 
     state = restrangs[ir].ipsi - 1;
     i = restrangs[ir].ai - 1;
@@ -110,8 +110,8 @@ void calc_restrang_force_host() {
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
-    real_t val = 0;
-    cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice);
+    double val = 0;
+    cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
 
     int blockSize = 256;
     int numBlocks = (host.n_restrangs + blockSize - 1) / blockSize;
@@ -126,14 +126,14 @@ void calc_restrang_force_host() {
         d_E_restraint);
     cudaDeviceSynchronize();
     host.EQ_restraint->download();
-    cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
 }
 
 void init_restrang_force_kernel_data() {
     using namespace CudaRestrangForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu
index cdb035b4..14f9b466 100644
--- a/src/core/cuda/src/cuda_restrdis_force.cu
+++ b/src/core/cuda/src/cuda_restrdis_force.cu
@@ -5,24 +5,24 @@
 #include "common/include/context.h"
 namespace CudaRestrdisForce {
 bool is_initialized = false;
-real_t* d_E_restraint;
+double* d_E_restraint;
 }  // namespace CudaRestrdisForce
 
 __global__ void calc_restrdis_forces_kernel(
     restrdis_t* restrdists,
     int n_restrdists,
     coord_t* coords,
-    real_t* lambdas,
+    double* lambdas,
     int n_lambdas,
     dvel_t* dvelocities,
     E_restraint_t* EQ_restraint,
-    real_t* E_restraint) {
+    double* E_restraint) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrdists) return;
 
     int state, i, j;
     coord_t dr;
-    real_t lambda, b, db, dv, ener;
+    double lambda, b, db, dv, ener;
 
     int ir = idx;
 
@@ -82,7 +82,7 @@ void calc_restrdis_forces_host() {
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_EQ_restraint = host.EQ_restraint->gpu_data_p;
 
-    cudaMemset(d_E_restraint, 0, sizeof(real_t));
+    cudaMemset(d_E_restraint, 0, sizeof(double));
 
     int blockSize = 256;
     int numBlocks = (host.n_restrdists + blockSize - 1) / blockSize;
@@ -97,8 +97,8 @@ void calc_restrdis_forces_host() {
         d_E_restraint);
     cudaDeviceSynchronize();
     host.EQ_restraint->download();
-    real_t ener;
-    cudaMemcpy(&ener, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double ener;
+    cudaMemcpy(&ener, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     printf("Energy restraint: %f\n", ener);
     host.E_restraint.Upres += ener;
 }
@@ -106,7 +106,7 @@ void calc_restrdis_forces_host() {
 void init_restrdis_force_kernel_data() {
     using namespace CudaRestrdisForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu
index 5307bff5..695e2b33 100644
--- a/src/core/cuda/src/cuda_restrpos_force.cu
+++ b/src/core/cuda/src/cuda_restrpos_force.cu
@@ -6,17 +6,17 @@
 
 namespace CudaRestrposForce {
 bool is_initialized = false;
-real_t* d_E_restraint;
+double* d_E_restraint;
 }  // namespace CudaRestrposForce
 
 __global__ void calc_restrpos_forces_kernel(
     restrpos_t* restrspos,
     int n_restrspos,
     coord_t* coords,
-    real_t* lambdas,
+    double* lambdas,
     int n_lambdas,
     E_restraint_t* EQ_restraint,
-    real_t* E_restraint,
+    double* E_restraint,
     dvel_t* dvelocities) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrspos) return;
@@ -24,7 +24,7 @@ __global__ void calc_restrpos_forces_kernel(
 
     int state, i;
     coord_t dr;
-    real_t lambda, ener, x2, y2, z2;
+    double lambda, ener, x2, y2, z2;
 
     state = restrspos[ir].ipsi - 1;
     i = restrspos[ir].a - 1;
@@ -64,8 +64,8 @@ void calc_restrpos_forces_host() {
     auto& host = Context::instance();
     if (host.n_restrspos == 0) return;
     using namespace CudaRestrposForce;
-    real_t val = 0.0;
-    cudaMemcpy(d_E_restraint, &val, sizeof(real_t), cudaMemcpyHostToDevice);
+    double val = 0.0;
+    cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
 
     auto d_restrspos = host.restrspos->gpu_data_p;
     auto d_coords = host.coords->gpu_data_p;
@@ -85,7 +85,7 @@ void calc_restrpos_forces_host() {
         d_E_restraint,
         d_dvelocities);
     cudaDeviceSynchronize();
-    cudaMemcpy(&val, d_E_restraint, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres += val;
     host.EQ_restraint->download();
 }
@@ -93,7 +93,7 @@ void calc_restrpos_forces_host() {
 void init_restrpos_force_kernel_data() {
     using namespace CudaRestrposForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_E_restraint, sizeof(real_t));
+        check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu
index e5951303..71835e4e 100644
--- a/src/core/cuda/src/cuda_restrseq_force.cu
+++ b/src/core/cuda/src/cuda_restrseq_force.cu
@@ -4,7 +4,7 @@
 
 namespace CudaRestrseqForce {
 bool is_initialized = false;
-real_t* d_upres_energy;
+double* d_upres_energy;
 }  // namespace CudaRestrseqForce
 __global__ void calc_restrseq_forces_kernel(
     int n_restrseqs,
@@ -15,13 +15,13 @@ __global__ void calc_restrseq_forces_kernel(
     catype_t* catypes,
     bool* heavy,
     dvel_t* dvelocities,
-    real_t* upres_energy) {
+    double* upres_energy) {
     int s = blockIdx.x * blockDim.x + threadIdx.x;
     if (s >= n_restrseqs) return;
 
-    real_t k, mass, totmass;
+    double k, mass, totmass;
     coord_t dr;
-    real_t r2, ener;
+    double r2, ener;
 
     k = restrseqs[s].k;
 
@@ -123,7 +123,7 @@ void calc_restrseq_forces_host() {
     auto d_catypes = host.catypes->gpu_data_p;
     auto d_heavy = host.heavy->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
-    cudaMemset(d_upres_energy, 0, sizeof(real_t));
+    cudaMemset(d_upres_energy, 0, sizeof(double));
     // ctx.sync_all_to_device();
 
     int blockSize = 256;
@@ -139,8 +139,8 @@ void calc_restrseq_forces_host() {
         d_dvelocities,
         d_upres_energy);
     cudaDeviceSynchronize();
-    real_t upres_energy;
-    cudaMemcpy(&upres_energy, d_upres_energy, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double upres_energy;
+    cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost);
     host.E_restraint.Upres = upres_energy;
     printf("Restrseq U_upres: %f\n", upres_energy);
 }
@@ -148,7 +148,7 @@ void calc_restrseq_forces_host() {
 void init_restrseq_force_kernel_data() {
     using namespace CudaRestrseqForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_upres_energy, sizeof(real_t));
+        check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu
index 2ca01839..c928bb71 100644
--- a/src/core/cuda/src/cuda_restrwall_force.cu
+++ b/src/core/cuda/src/cuda_restrwall_force.cu
@@ -5,20 +5,20 @@
 
 namespace CudaRestrwallForce {
 bool is_initialized = false;
-real_t* d_energies;
+double* d_energies;
 }  // namespace CudaRestrwallForce
 
 __global__ void calc_restrwall_forces_kernel(
     restrwall_t* restrwalls,
     int n_restrwalls,
     coord_t* coords,
-    real_t* energies,
+    double* energies,
     dvel_t* dvelocities,
     bool* heavy, topo_t topo) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_restrwalls) return;
 
-    real_t k, b, db, ener, dv, fexp;
+    double k, b, db, ener, dv, fexp;
     coord_t dr;
 
     int ir = idx;
@@ -58,7 +58,7 @@ void calc_restrwall_forces_host() {
     auto d_coords = host.coords->gpu_data_p;
     auto d_dvelocities = host.dvelocities->gpu_data_p;
     auto d_heavy = host.heavy->gpu_data_p;
-    cudaMemset(d_energies, 0, sizeof(real_t));
+    cudaMemset(d_energies, 0, sizeof(double));
 
     int blockSize = 256;
     int numBlocks = (host.n_restrwalls + blockSize - 1) / blockSize;
@@ -69,8 +69,8 @@ void calc_restrwall_forces_host() {
         d_energies,
         d_dvelocities, d_heavy, host.topo);
     cudaDeviceSynchronize();
-    real_t h_energy;
-    cudaMemcpy(&h_energy, d_energies, sizeof(real_t), cudaMemcpyDeviceToHost);
+    double h_energy;
+    cudaMemcpy(&h_energy, d_energies, sizeof(double), cudaMemcpyDeviceToHost);
     printf("Restrwall energy: %f\n", h_energy);
     host.E_restraint.Upres += h_energy;
 }
@@ -78,7 +78,7 @@ void calc_restrwall_forces_host() {
 void init_restrwall_force_kernel_data() {
     using namespace CudaRestrwallForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energies, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energies, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu
index 03fa76cd..bda47e50 100644
--- a/src/core/cuda/src/cuda_shake_constraints.cu
+++ b/src/core/cuda/src/cuda_shake_constraints.cu
@@ -17,7 +17,7 @@ __global__ void calc_shake_constraints_kernel(
     shake_bond_t* shake_bonds,
     coord_t* coords,
     coord_t* xcoords,
-    real_t* winv,
+    double* winv,
     int* total_iterations,
     int* mol_shake_offset) {
     int idx = blockIdx.x;
@@ -26,7 +26,7 @@ __global__ void calc_shake_constraints_kernel(
     int mol = idx;
 
     int ai, aj, n_iterations, shake;
-    real_t xij2, diff, corr, scp, xxij2;
+    double xij2, diff, corr, scp, xxij2;
     coord_t xij, xxij;
 
     if (mol_n_shakes[mol] == 0) return;
diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu
index 46c4c373..baba687e 100644
--- a/src/core/cuda/src/cuda_temperature.cu
+++ b/src/core/cuda/src/cuda_temperature.cu
@@ -6,23 +6,23 @@
 
 namespace CudaTemperature {
 bool is_initialized = false;
-real_t* d_Temp_solute;
-real_t* d_Tfree_solute;
-real_t* d_Texcl_solute;
-real_t* d_Temp_solvent;
-real_t* d_Tfree_solvent;
-real_t* d_Texcl_solvent;
+double* d_Temp_solute;
+double* d_Tfree_solute;
+double* d_Texcl_solute;
+double* d_Temp_solvent;
+double* d_Tfree_solvent;
+double* d_Texcl_solvent;
 }  // namespace CudaTemperature
 
-__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, real_t boltz, real_t ekinmax,
-                                        real_t* Temp_solute, real_t* Tfree_solute, real_t* Texcl_solute, real_t* Temp_solvent, real_t* Tfree_solvent, real_t* Texcl_solvent) {
+__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t* catypes, vel_t* velocities, bool* excluded, double boltz, double ekinmax,
+                                        double* Temp_solute, double* Tfree_solute, double* Texcl_solute, double* Temp_solvent, double* Tfree_solvent, double* Texcl_solvent) {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
-    real_t mass_i = catypes[atypes[idx].code - 1].m;
-    const real_t vx = velocities[idx].x;
-    const real_t vy = velocities[idx].y;
-    const real_t vz = velocities[idx].z;
-    real_t ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
+    double mass_i = catypes[atypes[idx].code - 1].m;
+    const double vx = velocities[idx].x;
+    const double vy = velocities[idx].y;
+    const double vz = velocities[idx].z;
+    double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
     bool is_solute = (idx < n_atoms_solute);
     bool is_excluded = excluded[idx];
 
@@ -49,14 +49,14 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t
 void calc_temperature_host() {
     auto& host = Context::instance();
     using namespace CudaTemperature;
-    real_t h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
+    double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
 
-    cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(real_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(real_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(real_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(real_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Tfree_solute, &h_Tfree_solute, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Texcl_solute, &h_Texcl_solute, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Temp_solvent, &h_Temp_solvent, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice);
 
     atype_t* d_atypes = host.atypes->gpu_data_p;
     catype_t* d_catypes = host.catypes->gpu_data_p;
@@ -66,17 +66,17 @@ void calc_temperature_host() {
     int blockSize = 256;
     int numBlocks = (host.n_atoms + blockSize - 1) / blockSize;
 
-    real_t Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms;
+    double Ekinmax = 1000.0 * host.Ndegf * Boltz * host.md.temperature / 2.0 / host.n_atoms;
     calc_temperature_kernel<<<numBlocks, blockSize>>>(host.n_atoms, host.n_atoms_solute, d_atypes, d_catypes, d_velocities, d_excluded, Boltz, Ekinmax,
                                                       d_Temp_solute, d_Tfree_solute, d_Texcl_solute, d_Temp_solvent, d_Tfree_solvent, d_Texcl_solvent);
 
     cudaDeviceSynchronize();
-    cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
-    cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Temp_solute, d_Temp_solute, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Tfree_solute, d_Tfree_solute, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Texcl_solute, d_Texcl_solute, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Temp_solvent, d_Temp_solvent, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Tfree_solvent, d_Tfree_solvent, sizeof(double), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&h_Texcl_solvent, d_Texcl_solvent, sizeof(double), cudaMemcpyDeviceToHost);
     host.Tfree = h_Tfree_solute + h_Tfree_solvent;
     host.Temp = h_Temp_solute + h_Temp_solvent;
 
@@ -98,12 +98,12 @@ void calc_temperature_host() {
 void init_temperature_kernel_data() {
     using namespace CudaTemperature;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_Temp_solute, sizeof(real_t));
-        check_cudaMalloc((void**)&d_Tfree_solute, sizeof(real_t));
-        check_cudaMalloc((void**)&d_Texcl_solute, sizeof(real_t));
-        check_cudaMalloc((void**)&d_Temp_solvent, sizeof(real_t));
-        check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(real_t));
-        check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(real_t));
+        check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
+        check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
+        check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
+        check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
+        check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
+        check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
         is_initialized = true;
     }
 }
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 1c0692ae..5baffbde 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -4,10 +4,10 @@
 
 namespace CudaTorsionForce {
 bool is_initialized = false;
-real_t* d_energy_sum = nullptr;
+double* d_energy_sum = nullptr;
 }  // namespace CudaTorsionForce
 
-__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, real_t* energy_sum) {
+__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum) {
     int i = blockIdx.x * blockDim.x + threadIdx.x + start;
     if (i >= end) return;
     int aii, aji, aki, ali;
@@ -16,10 +16,10 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     coord_t rji, rjk, rkl, rnj, rnk, rki, rlj;
     coord_t di, dl, dpi, dpj, dpk, dpl;
 
-    real_t bj2inv, bk2inv, bjinv, bkinv;
-    real_t cos_phi, phi;
-    real_t arg, dv, f1;
-    real_t ener;
+    double bj2inv, bk2inv, bjinv, bkinv;
+    double cos_phi, phi;
+    double arg, dv, f1;
+    double ener;
 
     torsion_t t;
     ctorsion_t ctors;
@@ -63,8 +63,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     bkinv = sqrt(bk2inv);
 
     cos_phi = (rnj.x * rnk.x + rnj.y * rnk.y + rnj.z * rnk.z) * (bjinv * bkinv);
-    cos_phi = cos_phi > static_cast<real_t>(1.0) ? static_cast<real_t>(1.0) : cos_phi;
-    cos_phi = cos_phi < static_cast<real_t>(-1.0) ? static_cast<real_t>(-1.0) : cos_phi;
+    cos_phi = fmin(fmax(cos_phi, -1.0), 1.0);
     phi = acos(cos_phi);
     if (rjk.x * (rnj.y * rnk.z - rnj.z * rnk.y) + rjk.y * (rnj.z * rnk.x - rnj.x * rnk.z) + rjk.z * (rnj.x * rnk.y - rnj.y * rnk.x) < 0) {
         phi = -phi;
@@ -124,15 +123,15 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     atomicAdd(&dvelocities[ali].z, dv * dpl.z);
 }
 
-real_t calc_torsion_forces_host(int start, int end) {
+double calc_torsion_forces_host(int start, int end) {
     using namespace CudaTorsionForce;
     int N = end - start;
     if (N <= 0) return 0.0;
     int blockSize = 256;
     int numBlocks = (N + blockSize - 1) / blockSize;
 
-    real_t zero = 0.0;
-    cudaMemcpy(d_energy_sum, &zero, sizeof(real_t), cudaMemcpyHostToDevice);
+    double zero = 0.0;
+    cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice);
 
     auto& host_ctx = Context::instance();
     coord_t* d_coords = host_ctx.coords->gpu_data_p;
@@ -142,7 +141,7 @@ real_t calc_torsion_forces_host(int start, int end) {
 
     calc_torsion_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum);
     cudaDeviceSynchronize();
-    cudaMemcpy(&zero, d_energy_sum, sizeof(real_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
     return zero;
 }
 
@@ -151,7 +150,7 @@ real_t calc_torsion_forces_host(int start, int end) {
 void init_torsion_force_kernel_data() {
     using namespace CudaTorsionForce;
     if (!is_initialized) {
-        check_cudaMalloc((void**)&d_energy_sum, sizeof(real_t));
+        check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
         is_initialized = true;
     }
 }

From b67d111424d837c7b084d2863158c824e5ccb852 Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 30 Apr 2026 16:10:03 +0200
Subject: [PATCH 19/20] Revert "Merge branch 'feature/qgpu_mixed_precision'
 into feature/qgpu_benchmark_script"

This reverts commit e6eee26979200b22c1cbc7ee6851dd1653aebfb0, reversing
changes made to edd65a3725ac4edd6a218716ee9fb6eb8dd3ce15.
---
 src/core/common/include/md_types.h         |   6 +-
 src/core/common/include/precision.h        |   9 +-
 src/core/cpu/src/cpu_angle_force.cpp       |   4 +-
 src/core/cpu/src/cpu_improper2_force.cpp   |   4 +-
 src/core/cpu/src/cpu_polx_water_force.cpp  |   4 +-
 src/core/cpu/src/cpu_q_angle_force.cpp     |   4 +-
 src/core/cpu/src/cpu_q_torsion_force.cpp   |   4 +-
 src/core/cpu/src/cpu_restrang_force.cpp    |   4 +-
 src/core/cpu/src/cpu_torsion_force.cpp     |   4 +-
 src/core/cuda/src/cuda_angle_force.cu      |   4 +-
 src/core/cuda/src/cuda_improper2_force.cu  |   2 +-
 src/core/cuda/src/cuda_nonbonded_force.cu  | 126 ++++++++++-----------
 src/core/cuda/src/cuda_polx_water_force.cu |   2 +-
 src/core/cuda/src/cuda_restrang_force.cu   |   4 +-
 src/core/cuda/src/cuda_torsion_force.cu    |   2 +-
 15 files changed, 84 insertions(+), 99 deletions(-)

diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index 27c20cef..6a4d2865 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -310,9 +310,9 @@ struct vel_t {
 };
 
 struct dvel_t {
-    force_accum_t x;
-    force_accum_t y;
-    force_accum_t z;
+    double x;
+    double y;
+    double z;
 };
 
 struct E_bonded_t {
diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
index 80b790f7..fc633f45 100644
--- a/src/core/common/include/precision.h
+++ b/src/core/common/include/precision.h
@@ -3,18 +3,11 @@
 #ifdef QDYN_SPFP
 using real_t = float;
 using nonbond_work_t = float;
-using force_accum_t = float;
 #else
 using real_t = double;
 using nonbond_work_t = double;
-using force_accum_t = double;
 #endif
 
 using energy_accum_t = double;
+using force_accum_t = double;
 using constraint_work_t = double;
-
-#ifdef QDYN_SPFP
-constexpr double k_singular_sin_epsilon = 1.0e-6;
-#else
-constexpr double k_singular_sin_epsilon = 1.0e-12;
-#endif
diff --git a/src/core/cpu/src/cpu_angle_force.cpp b/src/core/cpu/src/cpu_angle_force.cpp
index ae600561..a9c29c1e 100644
--- a/src/core/cpu/src/cpu_angle_force.cpp
+++ b/src/core/cpu/src/cpu_angle_force.cpp
@@ -64,9 +64,9 @@ double calc_angle_forces(int start, int end) {
         dv = cangle.kth * dth;
 
         f1 = sin(th);
-        if (std::fabs(f1) < k_singular_sin_epsilon) {
+        if (std::fabs(f1) < 1.0E-12) {
             // Avoid division by zero
-            f1 = -1.0 / k_singular_sin_epsilon;
+            f1 = -1.0E12;
         } else {
             f1 = -1.0 / f1;
         }
diff --git a/src/core/cpu/src/cpu_improper2_force.cpp b/src/core/cpu/src/cpu_improper2_force.cpp
index 6e4faa60..af73a9cc 100644
--- a/src/core/cpu/src/cpu_improper2_force.cpp
+++ b/src/core/cpu/src/cpu_improper2_force.cpp
@@ -79,8 +79,8 @@ double calc_improper2_forces(int start, int end) {
 
         // Forces
         f1 = sin(phi);
-        if (std::fabs(f1) < k_singular_sin_epsilon) {
-            f1 = std::copysign(k_singular_sin_epsilon, f1);
+        if (std::fabs(f1) < 1E-12) {
+            f1 = std::copysign(1E-12, f1);  // clamp magnitude only; keep the sign of sin(phi)
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cpu/src/cpu_polx_water_force.cpp b/src/core/cpu/src/cpu_polx_water_force.cpp
index 5116dbbb..9d0e4711 100644
--- a/src/core/cpu/src/cpu_polx_water_force.cpp
+++ b/src/core/cpu/src/cpu_polx_water_force.cpp
@@ -158,8 +158,8 @@ void calc_polx_w_forces(int iteration) {
                 cos_th = -1;
             }
             f0 = sin(acos(cos_th));
-            if (fabs(f0) < k_singular_sin_epsilon) {
-                f0 = k_singular_sin_epsilon;
+            if (fabs(f0) < 1.0E-12) {
+                f0 = 1.0E-12;
             }
             f0 = -1.0 / f0;
             f0 *= dv;
diff --git a/src/core/cpu/src/cpu_q_angle_force.cpp b/src/core/cpu/src/cpu_q_angle_force.cpp
index 14aa802c..c9c2ea65 100644
--- a/src/core/cpu/src/cpu_q_angle_force.cpp
+++ b/src/core/cpu/src/cpu_q_angle_force.cpp
@@ -56,8 +56,8 @@ void calc_qangle_forces(int state) {
 
         dv = ctx.q_cangles[ic].kth * dth * lambdas[state];
         f1 = sin(th);
-        if (fabs(f1) < k_singular_sin_epsilon) {
-            f1 = k_singular_sin_epsilon;
+        if (fabs(f1) < 1E-12) {  // fabs, not abs: unqualified abs can bind to int abs(int)
+            f1 = 1E-12;
         }
         f1 = -1.0 / f1;
 
diff --git a/src/core/cpu/src/cpu_q_torsion_force.cpp b/src/core/cpu/src/cpu_q_torsion_force.cpp
index 7b7fb271..be309347 100644
--- a/src/core/cpu/src/cpu_q_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_q_torsion_force.cpp
@@ -76,8 +76,8 @@ void calc_qtorsion_forces(int state) {
 
         // Forces
         f1 = sin(phi);
-        if (fabs(f1) < k_singular_sin_epsilon) {
-            f1 = copysign(k_singular_sin_epsilon, f1);
+        if (fabs(f1) < 1E-12) {  // fabs, not abs: unqualified abs can bind to int abs(int)
+            f1 = copysign(1E-12, f1);  // clamp magnitude only; keep the sign of sin(phi)
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cpu/src/cpu_restrang_force.cpp b/src/core/cpu/src/cpu_restrang_force.cpp
index 84f593b0..d809a9c1 100644
--- a/src/core/cpu/src/cpu_restrang_force.cpp
+++ b/src/core/cpu/src/cpu_restrang_force.cpp
@@ -61,8 +61,8 @@ void calc_restrang_forces() {
         dv = lambda * restrangs[ir].k * dth;
 
         f1 = sin(th);
-        if (fabs(f1) < k_singular_sin_epsilon) {
-            f1 = -1.0 / k_singular_sin_epsilon;
+        if (fabs(f1) < 1E-12) {
+            f1 = -1E12;  // f1 holds the reciprocal (-1 / 1E-12), same quantity as the else branch
         } else {
             f1 = -1 / f1;
         }
diff --git a/src/core/cpu/src/cpu_torsion_force.cpp b/src/core/cpu/src/cpu_torsion_force.cpp
index 4ebb44b2..e8aaa2a3 100644
--- a/src/core/cpu/src/cpu_torsion_force.cpp
+++ b/src/core/cpu/src/cpu_torsion_force.cpp
@@ -88,8 +88,8 @@ double calc_torsion_forces(int start, int end) {
 
         // Forces
         f1 = sin(phi);
-        if (std::fabs(f1) < k_singular_sin_epsilon) {
-            f1 = std::copysign(k_singular_sin_epsilon, f1);
+        if (std::fabs(f1) < 1E-12) {
+            f1 = std::copysign(1E-12, f1);  // clamp magnitude only; keep the sign of sin(phi)
         }
         f1 = -1 / f1;
 
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index f20b039a..dcd044ce 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -39,8 +39,8 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     double dv = cang.kth * dtheta;
 
     double f1 = sin(theta);
-    if (fabs(f1) < k_singular_sin_epsilon) {
-        f1 = -1.0 / k_singular_sin_epsilon;
+    if (fabs(f1) < 1e-12) {
+        f1 = -1.0e12;
     } else {
         f1 = -1.0 / f1;
     }
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index dd7d91aa..78707b12 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -76,7 +76,7 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
 
     // Forces
     f1 = sin(phi);
-    if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1);
+    if (fabs(f1) < 1E-12) f1 = copysign(1E-12, f1);  // clamp magnitude only; keep the sign of sin(phi)
     f1 = -1 / f1;
     // printf("f1 = %f phi = %f cos_phi = %f\n", f1, phi, cos_phi);
 
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index 32b4077a..ce3f73ae 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -9,19 +9,18 @@ namespace CudaNonbondedForce {
 bool is_initialized = false;
 double *d_evdw_total, *d_ecoul_total;
 
-template <typename WorkT>
 struct nonbond_vec_t {
-    WorkT x;
-    WorkT y;
-    WorkT z;
+    nonbond_work_t x;
+    nonbond_work_t y;
+    nonbond_work_t z;
 };
 
-__device__ __forceinline__ float nonbond_rsqrt(float value) {
+__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) {
+#ifdef QDYN_SPFP
     return rsqrtf(value);
-}
-
-__device__ __forceinline__ double nonbond_rsqrt(double value) {
+#else
     return rsqrt(value);
+#endif
 }
 
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
@@ -54,7 +53,6 @@ __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned m
     return v;
 }
 
-template <typename WorkT>
 __device__ void calculate_unforce_bound(
     const coord_t& x,
     const coord_t& y,
@@ -62,20 +60,20 @@ __device__ void calculate_unforce_bound(
     const real_t charge_product,
     const vdw_pair_param_t& pair_param,
 
-    const WorkT coulomb_constant,
+    const nonbond_work_t coulomb_constant,
 
-    const WorkT scaling,
-    const WorkT lambda,
+    const nonbond_work_t scaling,
+    const nonbond_work_t lambda,
 
-    WorkT& evdw,
-    WorkT& ecoul,
-    WorkT& dv) {
-    const WorkT dx = static_cast<WorkT>(x.x - y.x);
-    const WorkT dy = static_cast<WorkT>(x.y - y.y);
-    const WorkT dz = static_cast<WorkT>(x.z - y.z);
-    const WorkT r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
-    const WorkT r2 = r * r;
-    const WorkT r6 = r2 * r2 * r2;
+    nonbond_work_t& evdw,
+    nonbond_work_t& ecoul,
+    nonbond_work_t& dv) {
+    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
+    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
+    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
+    const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
+    const nonbond_work_t r2 = r * r;
+    const nonbond_work_t r6 = r2 * r2 * r2;
     // double v_a = r6 * r6;
     // double v_b = r6;
     // ecoul = r;
@@ -84,13 +82,12 @@ __device__ void calculate_unforce_bound(
 
     ecoul = scaling * coulomb_constant * charge_product * r * lambda;
 
-    const WorkT v_a = static_cast<WorkT>(pair_param.a) * r6 * r6 * lambda;
-    const WorkT v_b = static_cast<WorkT>(pair_param.b) * r6 * lambda;
+    const nonbond_work_t v_a = static_cast<nonbond_work_t>(pair_param.a) * r6 * r6 * lambda;
+    const nonbond_work_t v_b = static_cast<nonbond_work_t>(pair_param.b) * r6 * lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - static_cast<WorkT>(12.0) * v_a + static_cast<WorkT>(6.0) * v_b);
+    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
 }
 
-template <typename WorkT>
 __global__ void calc_nonbonded_force_kernel(
     const int nx,
     const int ny,
@@ -177,8 +174,8 @@ __global__ void calc_nonbonded_force_kernel(
     int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1;
     int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1;
 
-    nonbond_vec_t<WorkT> x_force = {0.0, 0.0, 0.0};
-    nonbond_vec_t<WorkT> y_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t x_force = {0.0, 0.0, 0.0};
+    nonbond_vec_t y_force = {0.0, 0.0, 0.0};
 
     double evdw_sum = 0.0;
     double ecoul_sum = 0.0;
@@ -233,14 +230,14 @@ __global__ void calc_nonbonded_force_kernel(
         }
     }
 
-    const WorkT kernel_lambda = static_cast<WorkT>(lambda);
-    const WorkT coulomb_constant = static_cast<WorkT>(d_topo.coulomb_constant);
+    const nonbond_work_t kernel_lambda = static_cast<nonbond_work_t>(lambda);
+    const nonbond_work_t coulomb_constant = static_cast<nonbond_work_t>(d_topo.coulomb_constant);
     const int charge_pair_row = x_charge_type_idx * n_charge_types;
     const int pair_row = (x_catype_type_idx >= 0) ? x_catype_type_idx * n_catype_types : 0;
 
     for (int i = 0; i < 32; i++) {
         if (is_valid()) {
-            WorkT scaling = static_cast<WorkT>(1.0);
+            nonbond_work_t scaling = static_cast<nonbond_work_t>(1.0);
             real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
             vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx];
 
@@ -252,7 +249,7 @@ __global__ void calc_nonbonded_force_kernel(
             //     }
             // }
 
-            WorkT evdw = 0, ecoul = 0, dv = 0;
+            nonbond_work_t evdw = 0, ecoul = 0, dv = 0;
 
             calculate_unforce_bound(
                 x_coord,
@@ -269,9 +266,9 @@ __global__ void calc_nonbonded_force_kernel(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
 
-            const WorkT dx = static_cast<WorkT>(x_coord.x - y_coord.x);
-            const WorkT dy = static_cast<WorkT>(x_coord.y - y_coord.y);
-            const WorkT dz = static_cast<WorkT>(x_coord.z - y_coord.z);
+            const nonbond_work_t dx = static_cast<nonbond_work_t>(x_coord.x - y_coord.x);
+            const nonbond_work_t dy = static_cast<nonbond_work_t>(x_coord.y - y_coord.y);
+            const nonbond_work_t dz = static_cast<nonbond_work_t>(x_coord.z - y_coord.z);
             y_force.x -= dv * dx;
             y_force.y -= dv * dy;
             y_force.z -= dv * dz;
@@ -337,39 +334,34 @@ std::pair<double, double> calc_nonbonded_force_host(
     cudaMemset(d_ecoul_total, 0, sizeof(double));
     cudaMemset(d_evdw_total, 0, sizeof(double));
 
-    auto launch_kernel = [&](auto work_tag) {
-        using WorkT = decltype(work_tag);
-        calc_nonbonded_force_kernel<WorkT><<<grid, block_sz>>>(
-            nx,
-            ny,
-            x_charges_types,
-            y_charges_types,
-            host.charge_pair_products->gpu_data_p,
-            x_atypes_types,
-            y_atypes_types,
-            host.catype_pair_params->gpu_data_p,
-            host.topo,
-            host.excluded->gpu_data_p,
-            host.LJ_matrix->gpu_data_p,
-            x_idx_list,
-            y_idx_list,
-            host.coords->gpu_data_p,
-            host.dvelocities->gpu_data_p,
-            d_evdw_total,
-            d_ecoul_total,
-            symmetric,
-            disable_water_h_lj,
-            host.n_atoms_solute,
-            host.n_charge_types,
-            host.zero_charge_type,
-            host.n_catype_types,
-            host.zero_catype_type,
-            host.n_qelscales,
-            lambda,
-            host.q_elscales->gpu_data_p);
-    };
-
-    launch_kernel(nonbond_work_t{});
+    calc_nonbonded_force_kernel<<<grid, block_sz>>>(
+        nx,
+        ny,
+        x_charges_types,
+        y_charges_types,
+        host.charge_pair_products->gpu_data_p,
+        x_atypes_types,
+        y_atypes_types,
+        host.catype_pair_params->gpu_data_p,
+        host.topo,
+        host.excluded->gpu_data_p,
+        host.LJ_matrix->gpu_data_p,
+        x_idx_list,
+        y_idx_list,
+        host.coords->gpu_data_p,
+        host.dvelocities->gpu_data_p,
+        d_evdw_total,
+        d_ecoul_total,
+        symmetric,
+        disable_water_h_lj,
+        host.n_atoms_solute,
+        host.n_charge_types,
+        host.zero_charge_type,
+        host.n_catype_types,
+        host.zero_catype_type,
+        host.n_qelscales,
+        lambda,
+        host.q_elscales->gpu_data_p);
 
     cudaDeviceSynchronize();
 
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index 7be0656f..13c37fbc 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -136,7 +136,7 @@ __global__ void calc_polx_water_forces_kernel(
     if (cos_th > 1) cos_th = 1;
     if (cos_th < -1) cos_th = -1;
     f0 = sin(acos(cos_th));
-    if (abs(f0) < k_singular_sin_epsilon) f0 = k_singular_sin_epsilon;
+    if (abs(f0) < 1.0E-12) f0 = 1.0E-12;
     f0 = -1.0 / f0;
     f0 *= dv;
 
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index 567a78df..b214aee9 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -64,8 +64,8 @@ __global__ void calc_restrang_force_kernel(
     dv = lambda * restrangs[ir].k * dth;
 
     f1 = sin(th);
-    if (fabs(f1) < k_singular_sin_epsilon) {
-        f1 = -1.0 / k_singular_sin_epsilon;
+    if (fabs(f1) < 1E-12) {
+        f1 = -1E12;  // f1 holds the reciprocal (-1 / 1E-12), same quantity as the else branch
     } else {
         f1 = -1 / f1;
     }
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 5baffbde..97b687a6 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -76,7 +76,7 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
 
     // Forces
     f1 = sin(phi);
-    if (fabs(f1) < k_singular_sin_epsilon) f1 = copysign(k_singular_sin_epsilon, f1);
+    if (fabs(f1) < 1E-12) f1 = copysign(1E-12, f1);  // clamp magnitude only; keep the sign of sin(phi)
     f1 = -1 / f1;
 
     di.x = f1 * (rnk.x * (bjinv * bkinv) - cos_phi * rnj.x * bj2inv);

From 3c39af706d38ed50b490a7c269ee3379ce3e25fd Mon Sep 17 00:00:00 2001
From: "shen.guo" <g.shen@rug.nl>
Date: Thu, 30 Apr 2026 16:12:34 +0200
Subject: [PATCH 20/20] revert all

---
 src/core/Makefile                            |  32 ++----
 src/core/common/include/context.h            |   2 +-
 src/core/common/include/md_types.h           |  28 +++---
 src/core/common/include/vdw_rules.h          |  26 +++--
 src/core/common/src/handler.cpp              |   5 -
 src/core/common/src/init.cpp                 |  12 +--
 src/core/cpu/src/cpu_nonbonded_pp_force.cpp  |  21 ++--
 src/core/cpu/src/cpu_nonbonded_pw_force.cpp  |  28 +++---
 src/core/cpu/src/cpu_nonbonded_qp_force.cpp  |  24 ++---
 src/core/cpu/src/cpu_nonbonded_qq_force.cpp  |  23 ++---
 src/core/cpu/src/cpu_nonbonded_qw_force.cpp  |  47 +++++----
 src/core/cpu/src/cpu_nonbonded_ww_force.cpp  |  49 +++++----
 src/core/cuda/src/cuda_angle_force.cu        |  12 +--
 src/core/cuda/src/cuda_improper2_force.cu    |   4 +-
 src/core/cuda/src/cuda_leapfrog.cu           |  35 +++----
 src/core/cuda/src/cuda_nonbonded_14_force.cu |  94 +++++++----------
 src/core/cuda/src/cuda_nonbonded_force.cu    | 100 +++++++------------
 src/core/cuda/src/cuda_polx_water_force.cu   |  13 ++-
 src/core/cuda/src/cuda_pshell_force.cu       |   2 +-
 src/core/cuda/src/cuda_radix_water_force.cu  |   9 +-
 src/core/cuda/src/cuda_restrang_force.cu     |   6 +-
 src/core/cuda/src/cuda_restrdis_force.cu     |   4 +-
 src/core/cuda/src/cuda_restrpos_force.cu     |   6 +-
 src/core/cuda/src/cuda_restrseq_force.cu     |   6 +-
 src/core/cuda/src/cuda_restrwall_force.cu    |   4 +-
 src/core/cuda/src/cuda_shake_constraints.cu  |   5 +-
 src/core/cuda/src/cuda_temperature.cu        |   5 +-
 src/core/cuda/src/cuda_torsion_force.cu      |   4 +-
 28 files changed, 261 insertions(+), 345 deletions(-)

diff --git a/src/core/Makefile b/src/core/Makefile
index 6acc5da1..367be528 100644
--- a/src/core/Makefile
+++ b/src/core/Makefile
@@ -1,44 +1,30 @@
 CC = nvcc
-SPFPFLAGS =
-ifeq ($(QDYN_SPFP),1)
-SPFPFLAGS += -DQDYN_SPFP
-endif
-CFLAGS = -O3 -std=c++17 -arch=sm_86 $(SPFPFLAGS) -I./cuda/include -I./common/include -I./cpu/include -I.
+CFLAGS = -O3 -std=c++17 -arch=sm_89 -I./cuda/include -I./common/include -I./cpu/include -I.
 DEPFLAGS = -MMD -MF $(@:.o=.d)
-BUILD_MODE = $(if $(filter 1,$(QDYN_SPFP)),spfp,dpfp)
-OBJDIR = .build/$(BUILD_MODE)
-TARGET = $(OBJDIR)/qdyn
 
 # collect all .cu files except main.cu
 SRCS = $(filter-out main.cu, $(wildcard *.cu cuda/src/*.cu))
 CPPSRCS = $(wildcard common/*.cpp common/src/*.cpp cpu/*.cpp cpu/src/*.cpp)
-MAIN_OBJ = $(OBJDIR)/main.o
-OBJS = $(addprefix $(OBJDIR)/,$(SRCS:.cu=.o)) $(addprefix $(OBJDIR)/,$(CPPSRCS:.cpp=.o))
+MAIN_OBJ = main.o
+OBJS = $(SRCS:.cu=.o) $(CPPSRCS:.cpp=.o)
 DEPS = $(MAIN_OBJ:.o=.d) $(OBJS:.o=.d)
 
 all: qdyn move
 
-qdyn: $(TARGET)
-	cp $< $@
-
-$(TARGET): $(MAIN_OBJ) $(OBJS)
+qdyn: $(MAIN_OBJ) $(OBJS)
 	$(CC) $(CFLAGS) -o $@ $^
 
-$(OBJDIR)/%.o: %.cu
-	mkdir -p $(@D)
+%.o: %.cu
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
-$(OBJDIR)/%.o: %.cpp
-	mkdir -p $(@D)
+%.o: %.cpp
 	$(CC) $(CFLAGS) $(DEPFLAGS) -c $< -o $@
 
 clean:
-	rm -rf .build qdyn ../../bin/qdyn
+	rm -f *.o *.d cuda/src/*.o cuda/src/*.d common/*.o common/*.d common/src/*.o common/src/*.d cpu/*.o cpu/*.d cpu/src/*.o cpu/src/*.d ../../bin/qdyn
 
-move: $(TARGET)
+move:
 	mkdir -p ../../bin
-	cp $< ../../bin/qdyn
-
-.PHONY: all qdyn clean move
+	mv qdyn ../../bin/
 
 -include $(DEPS)
diff --git a/src/core/common/include/context.h b/src/core/common/include/context.h
index 83817bb8..c77a2c91 100644
--- a/src/core/common/include/context.h
+++ b/src/core/common/include/context.h
@@ -187,7 +187,7 @@ class Context {
     std::unique_ptr<HostDeviceBuffer<int>> p_atoms_list;
     std::unique_ptr<HostDeviceBuffer<int>> w_atoms_list;
     std::unique_ptr<HostDeviceBuffer<int>> q_atoms_list;
-    std::unique_ptr<HostDeviceBuffer<real_t>> charge_pair_products;
+    std::unique_ptr<HostDeviceBuffer<double>> charge_pair_products;
     std::unique_ptr<HostDeviceBuffer<int>> p_charge_types;
     std::unique_ptr<HostDeviceBuffer<int>> w_charge_types;
     std::unique_ptr<HostDeviceBuffer<int>> q_charge_types;
diff --git a/src/core/common/include/md_types.h b/src/core/common/include/md_types.h
index 6a4d2865..60f1f56a 100644
--- a/src/core/common/include/md_types.h
+++ b/src/core/common/include/md_types.h
@@ -2,8 +2,6 @@
 
 #include <string>
 #include <vector>
-
-#include "common/include/precision.h"
 /* =============================================
  * == FROM MD FILE
  * =============================================
@@ -49,9 +47,9 @@ struct md_t {
 };
 
 struct coord_t {
-    real_t x;
-    real_t y;
-    real_t z;
+    double x;
+    double y;
+    double z;
 };
 
 struct bond_t {
@@ -116,7 +114,7 @@ struct charge_t {
 
 struct ccharge_t {
     int code;
-    real_t charge;
+    double charge;
 };
 
 struct atype_t {
@@ -127,17 +125,17 @@ struct atype_t {
 struct catype_t {
     int code;
     double m;
-    real_t aii_normal;
-    real_t bii_normal;
+    double aii_normal;
+    double bii_normal;
     // double aii_polar;
     // double bii_polar;
-    real_t aii_1_4;
-    real_t bii_1_4;
+    double aii_1_4;
+    double bii_1_4;
 };
 
 struct vdw_pair_param_t {
-    real_t a;
-    real_t b;
+    double a;
+    double b;
 };
 
 struct topo_t {
@@ -304,9 +302,9 @@ struct shake_bond_t {
  */
 
 struct vel_t {
-    real_t x;
-    real_t y;
-    real_t z;
+    double x;
+    double y;
+    double z;
 };
 
 struct dvel_t {
diff --git a/src/core/common/include/vdw_rules.h b/src/core/common/include/vdw_rules.h
index 5b8e8604..ca7bd762 100644
--- a/src/core/common/include/vdw_rules.h
+++ b/src/core/common/include/vdw_rules.h
@@ -4,10 +4,15 @@
 
 #include <math.h>
 
-template <typename Real>
+
+// Geometric rule: A_ij = sqrt(A_i) * sqrt(A_j), B_ij = sqrt(B_i) * sqrt(B_j)
+// Energy: V = A_ij * r^-12 - B_ij * r^-6
+// Parameters: ai_aii, aj_aii are sqrt(A_i), sqrt(A_j)
+//             ai_bii, aj_bii are sqrt(B_i), sqrt(B_j)
+//             r6 is 1/r^6
 __device__ __host__ inline void calc_vdw_geometric(
-    Real ai_aii, Real aj_aii, Real ai_bii, Real aj_bii,
-    Real r6, Real* V_a, Real* V_b) {
+    double ai_aii, double aj_aii, double ai_bii, double aj_bii,
+    double r6, double* V_a, double* V_b) {
     *V_a = r6 * r6 * ai_aii * aj_aii;
     *V_b = r6 * ai_bii * aj_bii;
 }
@@ -19,17 +24,16 @@ __device__ __host__ inline void calc_vdw_geometric(
 //             ai_aii, aj_aii store R*_i, R*_j (vdW radius)
 //             ai_bii, aj_bii store sqrt(eps_i), sqrt(eps_j) (after preprocessing)
 //             r6 is 1/r^6
-template <typename Real>
 __device__ __host__ inline void calc_vdw_arithmetic(
-    Real Rstar_i, Real Rstar_j, Real sqrt_eps_i, Real sqrt_eps_j,
-    Real r6, Real* V_a, Real* V_b) {
-    Real Rstar_ij = Rstar_i + Rstar_j;           // Arithmetic combination
-    Real sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j;  // Geometric combination (already sqrt)
+    double Rstar_i, double Rstar_j, double sqrt_eps_i, double sqrt_eps_j,
+    double r6, double* V_a, double* V_b) {
+    double Rstar_ij = Rstar_i + Rstar_j;           // Arithmetic combination
+    double sqrt_eps_ij = sqrt_eps_i * sqrt_eps_j;  // Geometric combination (already sqrt)
 
     // Compute R6 = (R*_ij)^6
-    Real R2 = Rstar_ij * Rstar_ij;
-    Real R6 = R2 * R2 * R2;
+    double R2 = Rstar_ij * Rstar_ij;
+    double R6 = R2 * R2 * R2;
 
     *V_a = sqrt_eps_ij * R6 * R6 * r6 * r6;  // sqrt(eps_i * eps_j) * R^12 * r^-12
-    *V_b = static_cast<Real>(2.0) * sqrt_eps_ij * R6 * r6;  // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6
+    *V_b = 2.0 * sqrt_eps_ij * R6 * r6;      // 2 * sqrt(eps_i * eps_j) * R^6 * r^-6
 }
diff --git a/src/core/common/src/handler.cpp b/src/core/common/src/handler.cpp
index b462b2c7..3fdd1341 100644
--- a/src/core/common/src/handler.cpp
+++ b/src/core/common/src/handler.cpp
@@ -88,11 +88,6 @@ void Handler::update_energy_totals() {
 }
 
 void Handler::print_outputs(int iteration) {
-    auto& host = Context::instance();
-    if (host.run_gpu && host.md.trajectory != 0 && iteration % host.md.trajectory == 0) {
-        host.coords->download();
-        host.velocities->download();
-    }
     print_energies();
     write_coords(iteration);
     write_velocities(iteration);
diff --git a/src/core/common/src/init.cpp b/src/core/common/src/init.cpp
index 499c01cb..dc519a9f 100644
--- a/src/core/common/src/init.cpp
+++ b/src/core/common/src/init.cpp
@@ -77,11 +77,9 @@ void initialize_catype_tables() {
             const catype_t& cj = h_catype_table_all[j];
             vdw_pair_param_t pair_param = {};
             if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
-                calc_vdw_geometric(
-                    ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast<real_t>(1.0), &pair_param.a, &pair_param.b);
+                calc_vdw_geometric(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
             } else {
-                calc_vdw_arithmetic(
-                    ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, static_cast<real_t>(1.0), &pair_param.a, &pair_param.b);
+                calc_vdw_arithmetic(ci.aii_normal, cj.aii_normal, ci.bii_normal, cj.bii_normal, 1.0, &pair_param.a, &pair_param.b);
             }
             h_catype_pair_params[i * ctx.n_catype_types + j] = pair_param;
         }
@@ -170,11 +168,10 @@ void initialize_charge_tables() {
     ctx.zero_charge_type = add_charge(0.0);
     ctx.n_charge_types = static_cast<int>(h_charge_table_all.size());
 
-    std::vector<real_t> h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types);
+    std::vector<double> h_charge_pair_products(ctx.n_charge_types * ctx.n_charge_types);
     for (int i = 0; i < ctx.n_charge_types; i++) {
         for (int j = 0; j < ctx.n_charge_types; j++) {
-            h_charge_pair_products[i * ctx.n_charge_types + j] =
-                static_cast<real_t>(h_charge_table_all[i].charge * h_charge_table_all[j].charge);
+            h_charge_pair_products[i * ctx.n_charge_types + j] = h_charge_table_all[i].charge * h_charge_table_all[j].charge;
         }
     }
 
@@ -916,3 +913,4 @@ void write_headers() {
     write_header("velocities.csv");
     write_energy_header();
 }
+
diff --git a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
index 390c67eb..ce744ad0 100644
--- a/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pp_force.cpp
@@ -15,10 +15,11 @@ void calc_nonbonded_pp_forces() {
     bool bond14, bond23;
     double scaling;
     coord_t da;
-    real_t r2a, ra, r6a;
-    real_t V_a, V_b;
-    real_t crg_i, crg_j;
-    real_t ai_aii, aj_aii, ai_bii, aj_bii;
+    double r2a, ra, r6a;
+    double Vela, V_a, V_b;
+    double dva;
+    double crg_i, crg_j;
+    double ai_aii, aj_aii, ai_bii, aj_bii;
     int i, j;
     for (int pi = 0; pi < ctx.n_patoms; pi++) {
         for (int pj = pi + 1; pj < ctx.n_patoms; pj++) {
@@ -41,11 +42,11 @@ void calc_nonbonded_pp_forces() {
             da.x = coords[j].x - coords[i].x;
             da.y = coords[j].y - coords[i].y;
             da.z = coords[j].z - coords[i].z;
-            r2a = static_cast<real_t>(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z);
-            ra = static_cast<real_t>(std::sqrt(r2a));
+            r2a = 1 / (std::pow(da.x, 2) + std::pow(da.y, 2) + std::pow(da.z, 2));
+            ra = sqrt(r2a);
             r6a = r2a * r2a * r2a;
 
-            const real_t Vela = static_cast<real_t>(scaling * ctx.topo.coulomb_constant) * crg_i * crg_j * ra;
+            Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra;
 
             ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal;
             aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal;
@@ -57,7 +58,7 @@ void calc_nonbonded_pp_forces() {
             } else {
                 calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b);
             }
-            const real_t dva = r2a * (-Vela - static_cast<real_t>(12.0) * V_a + static_cast<real_t>(6.0) * V_b);
+            dva = r2a * (-Vela - 12 * V_a + 6 * V_b);
 
             dvelocities[i].x -= dva * da.x;
             dvelocities[i].y -= dva * da.y;
@@ -67,8 +68,8 @@ void calc_nonbonded_pp_forces() {
             dvelocities[j].y += dva * da.y;
             dvelocities[j].z += dva * da.z;
 
-            ctx.E_nonbond_pp.Ucoul += static_cast<double>(Vela);
-            ctx.E_nonbond_pp.Uvdw += static_cast<double>(V_a - V_b);
+            ctx.E_nonbond_pp.Ucoul += Vela;
+            ctx.E_nonbond_pp.Uvdw += (V_a - V_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
index 030c1290..6bf2c27e 100644
--- a/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_pw_force.cpp
@@ -22,21 +22,21 @@ void calc_nonbonded_pw_forces() {
                 continue;
             }
 
-            const real_t qi = ctx.unified_ccharge(atom_i, 0).charge;
-            const real_t qj = ctx.unified_ccharge(atom_j, 0).charge;
+            const double qi = ctx.unified_ccharge(atom_i, 0).charge;
+            const double qj = ctx.unified_ccharge(atom_j, 0).charge;
 
             const catype_t& atom_i_type = ctx.unified_catype(atom_i, 0);
             const catype_t& atom_j_type = ctx.unified_catype(atom_j, 0);
 
-            real_t v_a = 0.0;
-            real_t v_b = 0.0;
-            const real_t dx = coords[atom_j].x - coords[atom_i].x;
-            const real_t dy = coords[atom_j].y - coords[atom_i].y;
-            const real_t dz = coords[atom_j].z - coords[atom_i].z;
-            const real_t r2inv = static_cast<real_t>(1.0) / (dx * dx + dy * dy + dz * dz);
-            const real_t rinv = static_cast<real_t>(std::sqrt(r2inv));
-            const real_t r6inv = r2inv * r2inv * r2inv;
-            const real_t ecoul = static_cast<real_t>(ctx.topo.coulomb_constant) * qi * qj * rinv;
+            double v_a = 0.0;
+            double v_b = 0.0;
+            const double dx = coords[atom_j].x - coords[atom_i].x;
+            const double dy = coords[atom_j].y - coords[atom_i].y;
+            const double dz = coords[atom_j].z - coords[atom_i].z;
+            const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
+            const double rinv = std::sqrt(r2inv);
+            const double r6inv = r2inv * r2inv * r2inv;
+            const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv;
 
             if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
                 calc_vdw_geometric(atom_i_type.aii_normal,
@@ -56,7 +56,7 @@ void calc_nonbonded_pw_forces() {
                                     &v_b);
             }
 
-            const real_t scale = r2inv * (-ecoul - static_cast<real_t>(12.0) * v_a + static_cast<real_t>(6.0) * v_b);
+            const double scale = r2inv * (-ecoul - 12.0 * v_a + 6.0 * v_b);
 
             dvelocities[atom_i].x -= scale * dx;
             dvelocities[atom_i].y -= scale * dy;
@@ -66,8 +66,8 @@ void calc_nonbonded_pw_forces() {
             dvelocities[atom_j].y += scale * dy;
             dvelocities[atom_j].z += scale * dz;
 
-            ctx.E_nonbond_pw.Ucoul += static_cast<double>(ecoul);
-            ctx.E_nonbond_pw.Uvdw += static_cast<double>(v_a - v_b);
+            ctx.E_nonbond_pw.Ucoul += ecoul;
+            ctx.E_nonbond_pw.Uvdw += (v_a - v_b);
         }
     }
 }
diff --git a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
index 7a81a516..65a74a6c 100644
--- a/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qp_force.cpp
@@ -15,11 +15,10 @@ void calc_nonbonded_qp_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     int i, j;
     coord_t da;
-    real_t r2, r;
-    real_t ai_aii, aj_aii, ai_bii, aj_bii;
+    double r2, r6, r;
+    double ai_aii, aj_aii, ai_bii, aj_bii;
     bool bond23, bond14;
-    double scaling;
-    real_t Vel, V_a, V_b, dv;
+    double scaling, Vel, V_a, V_b, dv;
 
     for (int qi = 0; qi < ctx.n_qatoms; qi++) {
         for (int pj = 0; pj < ctx.n_patoms; pj++) {
@@ -38,10 +37,12 @@ void calc_nonbonded_qp_forces() {
             da.y = coords[j].y - coords[i].y;
             da.z = coords[j].z - coords[i].z;
 
-            r2 = da.x * da.x + da.y * da.y + da.z * da.z;
-            r2 = static_cast<real_t>(1.0) / r2;
-            r = static_cast<real_t>(std::sqrt(r2));
-            const real_t r6inv = r2 * r2 * r2;  // 1/r^6 for vdW calculation
+            r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2);
+
+            r6 = r2 * r2 * r2;
+            r2 = 1 / r2;
+            r = sqrt(r2);
+            double r6inv = r2 * r2 * r2;  // 1/r^6 for vdW calculation
 
             for (int state = 0; state < ctx.n_lambdas; state++) {
                 const catype_t& qi_type = ctx.unified_catype(i, state);
@@ -52,8 +53,7 @@ void calc_nonbonded_qp_forces() {
                 ai_bii = bond14 ? qi_type.bii_1_4 : qi_type.bii_normal;
                 aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal;
 
-                Vel = static_cast<real_t>(ctx.topo.coulomb_constant * scaling) *
-                      ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r;
+                Vel = ctx.topo.coulomb_constant * scaling * ctx.unified_ccharge(i, state).charge * ctx.unified_ccharge(j, state).charge * r;
                 if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
                     calc_vdw_geometric(ai_aii, aj_aii, ai_bii, aj_bii, r6inv, &V_a, &V_b);
                 } else {
@@ -70,8 +70,8 @@ void calc_nonbonded_qp_forces() {
                 dvelocities[j].z += dv * da.z;
 
                 // Update Q totals
-                ctx.EQ_nonbond_qp[state].Ucoul += static_cast<double>(Vel);
-                ctx.EQ_nonbond_qp[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qp[state].Ucoul += Vel;
+                ctx.EQ_nonbond_qp[state].Uvdw += (V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
index 006a3c0e..2b062d48 100644
--- a/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qq_force.cpp
@@ -15,14 +15,14 @@ void calc_nonbonded_qq_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     auto *q_elscales = ctx.q_elscales->cpu_data_p;
     int ai, aj;
-    real_t crg_i, crg_j;
+    double crg_i, crg_j;
     double elscale, scaling;
     bool bond23, bond14;
     coord_t da;
-    real_t r2a, ra, r6a;
-    real_t Vela, V_a, V_b;
-    real_t dva;
-    real_t ai_aii, aj_aii, ai_bii, aj_bii;
+    double r2a, ra, r6a;
+    double Vela, V_a, V_b;
+    double dva;
+    double ai_aii, aj_aii, ai_bii, aj_bii;
 
     for (int state = 0; state < ctx.n_lambdas; state++) {
         for (int qi = 0; qi < ctx.n_qatoms; qi++) {
@@ -54,11 +54,11 @@ void calc_nonbonded_qq_forces() {
                 da.x = coords[aj].x - coords[ai].x;
                 da.y = coords[aj].y - coords[ai].y;
                 da.z = coords[aj].z - coords[ai].z;
-                r2a = static_cast<real_t>(1.0) / (da.x * da.x + da.y * da.y + da.z * da.z);
-                ra = static_cast<real_t>(std::sqrt(r2a));
+                r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2));
+                ra = sqrt(r2a);
                 r6a = r2a * r2a * r2a;
 
-                Vela = static_cast<real_t>(scaling * ctx.topo.coulomb_constant * elscale) * crg_i * crg_j * ra;
+                Vela = scaling * ctx.topo.coulomb_constant * crg_i * crg_j * ra * elscale;
 
                 ai_aii = bond14 ? qi_type.aii_1_4 : qi_type.aii_normal;
                 aj_aii = bond14 ? qj_type.aii_1_4 : qj_type.aii_normal;
@@ -70,8 +70,7 @@ void calc_nonbonded_qq_forces() {
                 } else {
                     calc_vdw_arithmetic(ai_aii, aj_aii, ai_bii, aj_bii, r6a, &V_a, &V_b);
                 }
-                dva = r2a * (-Vela - static_cast<real_t>(12.0) * V_a + static_cast<real_t>(6.0) * V_b) *
-                      static_cast<real_t>(lambdas[state]);
+                dva = r2a * (-Vela - 12 * V_a + 6 * V_b) * lambdas[state];
 
                 dvelocities[ai].x -= dva * da.x;
                 dvelocities[ai].y -= dva * da.y;
@@ -81,8 +80,8 @@ void calc_nonbonded_qq_forces() {
                 dvelocities[aj].y += dva * da.y;
                 dvelocities[aj].z += dva * da.z;
 
-                ctx.EQ_nonbond_qq[state].Ucoul += static_cast<double>(Vela);
-                ctx.EQ_nonbond_qq[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qq[state].Ucoul += Vela;
+                ctx.EQ_nonbond_qq[state].Uvdw += (V_a - V_b);
             }
         }
     }
diff --git a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
index 8d18bc55..17530a16 100644
--- a/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_qw_force.cpp
@@ -13,17 +13,17 @@ void calc_nonbonded_qw_forces() {
     auto *excluded = ctx.excluded->cpu_data_p;
     int i;
     coord_t dO, dH1, dH2;
-    real_t r2O, rH1, rH2, rO, r2H1, r2H2;
-    real_t dvO, dvH1, dvH2;
-    real_t V_a, V_b, VelO, VelH1, VelH2;
-    real_t ai_aii, ai_bii;
+    double r2O, rH1, rH2, r6O, rO, r2H1, r2H2;
+    double dvO, dvH1, dvH2;
+    double V_a, V_b, VelO, VelH1, VelH2;
+    double ai_aii, ai_bii;
 
     // Loop over O-atoms, q-atoms
     for (int j = ctx.n_atoms_solute; j < ctx.n_atoms; j += 3) {
         const catype_t& ow_type = ctx.unified_catype(j, 0);
-        const real_t ow_charge = ctx.unified_ccharge(j, 0).charge;
-        const real_t hw1_charge = ctx.unified_ccharge(j + 1, 0).charge;
-        const real_t hw2_charge = ctx.unified_ccharge(j + 2, 0).charge;
+        const double ow_charge = ctx.unified_ccharge(j, 0).charge;
+        const double hw1_charge = ctx.unified_ccharge(j + 1, 0).charge;
+        const double hw2_charge = ctx.unified_ccharge(j + 2, 0).charge;
         for (int qi = 0; qi < ctx.n_qatoms; qi++) {
             i = ctx.q_atoms[qi];
             if (excluded[i] || excluded[j]) continue;
@@ -36,12 +36,13 @@ void calc_nonbonded_qw_forces() {
             dH2.x = coords[j + 2].x - coords[i].x;
             dH2.y = coords[j + 2].y - coords[i].y;
             dH2.z = coords[j + 2].z - coords[i].z;
-            r2O = dO.x * dO.x + dO.y * dO.y + dO.z * dO.z;
-            rH1 = static_cast<real_t>(std::sqrt(static_cast<real_t>(1.0) / (dH1.x * dH1.x + dH1.y * dH1.y + dH1.z * dH1.z)));
-            rH2 = static_cast<real_t>(std::sqrt(static_cast<real_t>(1.0) / (dH2.x * dH2.x + dH2.y * dH2.y + dH2.z * dH2.z)));
-            r2O = static_cast<real_t>(1.0) / r2O;
-            rO = static_cast<real_t>(std::sqrt(r2O));
-            const real_t r6Oinv = r2O * r2O * r2O;  // 1/r^6 for vdW calculation
+            r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2);
+            rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2)));
+            rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2)));
+            r6O = r2O * r2O * r2O;
+            r2O = 1.0 / r2O;
+            rO = sqrt(r2O);
+            double r6Oinv = r2O * r2O * r2O;  // 1/r^6 for vdW calculation
             r2H1 = rH1 * rH1;
             r2H2 = rH2 * rH2;
 
@@ -62,21 +63,19 @@ void calc_nonbonded_qw_forces() {
                     calc_vdw_arithmetic(ai_aii, ow_type.aii_normal, ai_bii, ow_type.bii_normal, r6Oinv, &V_a, &V_b);
                 }
 
-                const real_t q_charge = ctx.unified_ccharge(i, state).charge;
-                const real_t coulomb_constant = static_cast<real_t>(ctx.topo.coulomb_constant);
-                VelO = coulomb_constant * ow_charge * q_charge * rO;
-                VelH1 = coulomb_constant * hw1_charge * q_charge * rH1;
-                VelH2 = coulomb_constant * hw2_charge * q_charge * rH2;
+                const double q_charge = ctx.unified_ccharge(i, state).charge;
+                VelO = ctx.topo.coulomb_constant * ow_charge * q_charge * rO;
+                VelH1 = ctx.topo.coulomb_constant * hw1_charge * q_charge * rH1;
+                VelH2 = ctx.topo.coulomb_constant * hw2_charge * q_charge * rH2;
 
                 // if (state == 0 && qi == 1) printf("j = %d ai__aii = %f A_O = %f B_O = %f V_a = %f V_b = %f r6O = %f\n", j, ai_aii, A_O, B_O, V_a, V_b, r6O);
 
-                const real_t lambda = static_cast<real_t>(lambdas[state]);
-                dvO += r2O * (-VelO - (static_cast<real_t>(12.0) * V_a - static_cast<real_t>(6.0) * V_b)) * lambda;
-                dvH1 -= r2H1 * VelH1 * lambda;
-                dvH2 -= r2H2 * VelH2 * lambda;
+                dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * lambdas[state];
+                dvH1 -= r2H1 * VelH1 * lambdas[state];
+                dvH2 -= r2H2 * VelH2 * lambdas[state];
 
-                ctx.EQ_nonbond_qw[state].Ucoul += static_cast<double>(VelO + VelH1 + VelH2);
-                ctx.EQ_nonbond_qw[state].Uvdw += static_cast<double>(V_a - V_b);
+                ctx.EQ_nonbond_qw[state].Ucoul += (VelO + VelH1 + VelH2);
+                ctx.EQ_nonbond_qw[state].Uvdw += (V_a - V_b);
             }
 
             // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!!
diff --git a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
index 3be5e6f0..505dd45a 100644
--- a/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
+++ b/src/core/cpu/src/cpu_nonbonded_ww_force.cpp
@@ -1,21 +1,18 @@
 #include "cpu_nonbonded_ww_force.h"
 
-#include <cmath>
-
 #include "constants.h"
 #include "context.h"
 #include "vdw_rules.h"
 
 namespace {
-void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, real_t* vdw_a, real_t* vdw_b) {
+void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j, double* vdw_a, double* vdw_b) {
     const catype_t& oi_type = ctx.unified_catype(oxygen_i, 0);
     const catype_t& oj_type = ctx.unified_catype(oxygen_j, 0);
     if (ctx.topo.vdw_rule == VDW_GEOMETRIC) {
         *vdw_a = oi_type.aii_normal * oj_type.aii_normal;
         *vdw_b = oi_type.bii_normal * oj_type.bii_normal;
     } else {
-        calc_vdw_arithmetic(
-            oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, static_cast<real_t>(1.0), vdw_a, vdw_b);
+        calc_vdw_arithmetic(oi_type.aii_normal, oj_type.aii_normal, oi_type.bii_normal, oj_type.bii_normal, 1.0, vdw_a, vdw_b);
     }
 }
 }  // namespace
@@ -23,33 +20,33 @@ void calc_oxygen_vdw_parameters(const Context& ctx, int oxygen_i, int oxygen_j,
 void accumulate_pair_force(Context& ctx,
                            int atom_i,
                            int atom_j,
-                           real_t qi,
-                           real_t qj,
+                           double qi,
+                           double qj,
                            bool include_vdw,
-                           real_t vdw_a,
-                           real_t vdw_b,
+                           double vdw_a,
+                           double vdw_b,
                            E_nonbonded_t& energy) {
     auto &coords = ctx.coords->cpu_data_p;
     auto &dvelocities = ctx.dvelocities->cpu_data_p;
-    const real_t dx = coords[atom_j].x - coords[atom_i].x;
-    const real_t dy = coords[atom_j].y - coords[atom_i].y;
-    const real_t dz = coords[atom_j].z - coords[atom_i].z;
+    const double dx = coords[atom_j].x - coords[atom_i].x;
+    const double dy = coords[atom_j].y - coords[atom_i].y;
+    const double dz = coords[atom_j].z - coords[atom_i].z;
 
-    const real_t r2inv = static_cast<real_t>(1.0) / (dx * dx + dy * dy + dz * dz);
-    const real_t rinv = static_cast<real_t>(std::sqrt(r2inv));
-    const real_t ecoul = static_cast<real_t>(ctx.topo.coulomb_constant) * qi * qj * rinv;
+    const double r2inv = 1.0 / (dx * dx + dy * dy + dz * dz);
+    const double rinv = std::sqrt(r2inv);
+    const double ecoul = ctx.topo.coulomb_constant * qi * qj * rinv;
 
-    real_t evdw = 0.0;
-    real_t dva = -ecoul;
+    double evdw = 0.0;
+    double dva = -ecoul;
     if (include_vdw) {
-        const real_t r6inv = r2inv * r2inv * r2inv;
-        const real_t v_a = vdw_a * r6inv * r6inv;
-        const real_t v_b = vdw_b * r6inv;
+        const double r6inv = r2inv * r2inv * r2inv;
+        const double v_a = vdw_a * r6inv * r6inv;
+        const double v_b = vdw_b * r6inv;
         evdw = v_a - v_b;
-        dva -= static_cast<real_t>(12.0) * v_a - static_cast<real_t>(6.0) * v_b;
+        dva -= 12.0 * v_a - 6.0 * v_b;
     }
 
-    const real_t scale = r2inv * dva;
+    const double scale = r2inv * dva;
 
     dvelocities[atom_i].x -= scale * dx;
     dvelocities[atom_i].y -= scale * dy;
@@ -59,8 +56,8 @@ void accumulate_pair_force(Context& ctx,
     dvelocities[atom_j].y += scale * dy;
     dvelocities[atom_j].z += scale * dz;
 
-    energy.Ucoul += static_cast<double>(ecoul);
-    energy.Uvdw += static_cast<double>(evdw);
+    energy.Ucoul += ecoul;
+    energy.Uvdw += evdw;
 }
 
 void calc_nonbonded_ww_forces() {
@@ -73,8 +70,8 @@ void calc_nonbonded_ww_forces() {
         const int base_i = ctx.n_atoms_solute + 3 * water_i;
         for (int water_j = water_i + 1; water_j < ctx.n_waters; ++water_j) {
             const int base_j = ctx.n_atoms_solute + 3 * water_j;
-            real_t oxygen_vdw_a = 0.0;
-            real_t oxygen_vdw_b = 0.0;
+            double oxygen_vdw_a = 0.0;
+            double oxygen_vdw_b = 0.0;
             calc_oxygen_vdw_parameters(ctx, base_i, base_j, &oxygen_vdw_a, &oxygen_vdw_b);
             for (int atom_i = 0; atom_i < 3; ++atom_i) {
                 for (int atom_j = 0; atom_j < 3; ++atom_j) {
diff --git a/src/core/cuda/src/cuda_angle_force.cu b/src/core/cuda/src/cuda_angle_force.cu
index dcd044ce..7c49cffb 100644
--- a/src/core/cuda/src/cuda_angle_force.cu
+++ b/src/core/cuda/src/cuda_angle_force.cu
@@ -48,14 +48,14 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co
     atomicAdd(energy_sum, energy);
 
     coord_t di = {
-        static_cast<real_t>(f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length))),
-        static_cast<real_t>(f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length))),
-        static_cast<real_t>(f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length)))};
+        f1 * (rjk.x / (rji_length * rjk_length) - cos_theta * rji.x / (rji_length * rji_length)),
+        f1 * (rjk.y / (rji_length * rjk_length) - cos_theta * rji.y / (rji_length * rji_length)),
+        f1 * (rjk.z / (rji_length * rjk_length) - cos_theta * rji.z / (rji_length * rji_length))};
 
     coord_t dk = {
-        static_cast<real_t>(f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length))),
-        static_cast<real_t>(f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length))),
-        static_cast<real_t>(f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length)))};
+        f1 * (rji.x / (rji_length * rjk_length) - cos_theta * rjk.x / (rjk_length * rjk_length)),
+        f1 * (rji.y / (rji_length * rjk_length) - cos_theta * rjk.y / (rjk_length * rjk_length)),
+        f1 * (rji.z / (rji_length * rjk_length) - cos_theta * rjk.z / (rjk_length * rjk_length))};
 
     atomicAdd(&dvelocities[i].x, dv * di.x);
     atomicAdd(&dvelocities[i].y, dv * di.y);
diff --git a/src/core/cuda/src/cuda_improper2_force.cu b/src/core/cuda/src/cuda_improper2_force.cu
index 78707b12..e44678e0 100644
--- a/src/core/cuda/src/cuda_improper2_force.cu
+++ b/src/core/cuda/src/cuda_improper2_force.cu
@@ -51,8 +51,8 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp
     rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z;
     rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x;
 
-    bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z);
-    bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z);
+    bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2));
+    bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2));
     bjinv = sqrt(bj2inv);
     bkinv = sqrt(bk2inv);
 
diff --git a/src/core/cuda/src/cuda_leapfrog.cu b/src/core/cuda/src/cuda_leapfrog.cu
index 1e010f7e..49312337 100644
--- a/src/core/cuda/src/cuda_leapfrog.cu
+++ b/src/core/cuda/src/cuda_leapfrog.cu
@@ -45,20 +45,6 @@ __global__ void calc_leapfrog_kernel(
     coords[i].z += velocities[i].z * dt;
 }
 
-__global__ void update_velocities_from_positions_kernel(
-    vel_t* velocities,
-    const coord_t* coords,
-    const coord_t* xcoords,
-    int n_atoms,
-    double dt) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= n_atoms) return;
-
-    velocities[idx].x = (coords[idx].x - xcoords[idx].x) / dt;
-    velocities[idx].y = (coords[idx].y - xcoords[idx].y) / dt;
-    velocities[idx].z = (coords[idx].z - xcoords[idx].z) / dt;
-}
-
 void calc_leapfrog_host() {
     auto& host = Context::instance();
     auto d_atypes = host.atypes->gpu_data_p;
@@ -84,17 +70,24 @@ void calc_leapfrog_host() {
         host.dt);
     check_cuda(cudaDeviceSynchronize());
 
+    host.velocities->download();
+    host.dvelocities->download();
+    host.coords->download();
+    host.xcoords->download();
+
     // shake
+    // TODO: there is a problem here: this writes into CPU memory, but we use the GPU.
     printf("n_shake_constraints: %d\n", host.n_shake_constraints);
     if (host.n_shake_constraints > 0) {
         calc_shake_constraints_host();
-        update_velocities_from_positions_kernel<<<numBlocks, blockSize>>>(
-            d_velocities,
-            d_coords,
-            d_xcoords,
-            host.n_atoms,
-            host.dt);
-        check_cuda(cudaDeviceSynchronize());
+        auto &velocities = host.velocities->cpu_data_p;
+        auto &coords = host.coords->cpu_data_p;
+        auto *xcoords = host.xcoords->cpu_data_p;
+        for (int i = 0; i < host.n_atoms; i++) {
+            velocities[i].x = (coords[i].x - xcoords[i].x) / host.dt;
+            velocities[i].y = (coords[i].y - xcoords[i].y) / host.dt;
+            velocities[i].z = (coords[i].z - xcoords[i].z) / host.dt;
+        }
     }
 }
 
diff --git a/src/core/cuda/src/cuda_nonbonded_14_force.cu b/src/core/cuda/src/cuda_nonbonded_14_force.cu
index 78c4bc91..fa404ee7 100644
--- a/src/core/cuda/src/cuda_nonbonded_14_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_14_force.cu
@@ -12,14 +12,6 @@ int* d_atom_to_qi = nullptr;
 double* d_evdw_totals = nullptr;
 double* d_ecoul_totals = nullptr;
 
-__device__ __forceinline__ nonbond_work_t nonbond14_rsqrt(nonbond_work_t value) {
-#ifdef QDYN_SPFP
-    return rsqrtf(value);
-#else
-    return rsqrt(value);
-#endif
-}
-
 __device__ __forceinline__ int unified_parameter_index(
     int atom_idx,
     int state,
@@ -37,53 +29,37 @@ __device__ __forceinline__ int unified_parameter_index(
 __device__ void calculate_nonbonded_14_pair(
     const coord_t& x,
     const coord_t& y,
-    real_t x_charge,
-    real_t y_charge,
-    real_t x_aii,
-    real_t y_aii,
-    real_t x_bii,
-    real_t y_bii,
-    nonbond_work_t coulomb_constant,
-    nonbond_work_t scaling,
+    double x_charge,
+    double y_charge,
+    double x_aii,
+    double y_aii,
+    double x_bii,
+    double y_bii,
+    double coulomb_constant,
+    double scaling,
     int vdw_rule,
-    nonbond_work_t lambda,
-    nonbond_work_t& evdw,
-    nonbond_work_t& ecoul,
-    nonbond_work_t& dv) {
-    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
-    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
-    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
-    const nonbond_work_t r = nonbond14_rsqrt(dx * dx + dy * dy + dz * dz);
-    const nonbond_work_t r2 = r * r;
-    const nonbond_work_t r6 = r2 * r2 * r2;
+    double lambda,
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    const double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
+    const double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
+    const double r2 = r * r;
+    const double r6 = r2 * r2 * r2;
 
     ecoul = scaling * coulomb_constant * x_charge * y_charge * r * lambda;
 
-    nonbond_work_t v_a = 0.0;
-    nonbond_work_t v_b = 0.0;
+    double v_a = 0.0;
+    double v_b = 0.0;
     if (vdw_rule == VDW_GEOMETRIC) {
-        calc_vdw_geometric(
-            static_cast<nonbond_work_t>(x_aii),
-            static_cast<nonbond_work_t>(y_aii),
-            static_cast<nonbond_work_t>(x_bii),
-            static_cast<nonbond_work_t>(y_bii),
-            r6,
-            &v_a,
-            &v_b);
+        calc_vdw_geometric(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b);
     } else {
-        calc_vdw_arithmetic(
-            static_cast<nonbond_work_t>(x_aii),
-            static_cast<nonbond_work_t>(y_aii),
-            static_cast<nonbond_work_t>(x_bii),
-            static_cast<nonbond_work_t>(y_bii),
-            r6,
-            &v_a,
-            &v_b);
+        calc_vdw_arithmetic(x_aii, y_aii, x_bii, y_bii, r6, &v_a, &v_b);
     }
     v_a *= lambda;
     v_b *= lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
 }
 
 __global__ void calc_nonbonded_14_force_kernel(
@@ -126,10 +102,10 @@ __global__ void calc_nonbonded_14_force_kernel(
     const coord_t ri = d_coords[ai];
     const coord_t rj = d_coords[aj];
 
-    nonbond_work_t evdw = 0.0;
-    nonbond_work_t ecoul = 0.0;
-    nonbond_work_t dv = 0.0;
-    const nonbond_work_t pair_lambda = static_cast<nonbond_work_t>((mode == NONBONDED_14_PP) ? 1.0 : lambda);
+    double evdw = 0.0;
+    double ecoul = 0.0;
+    double dv = 0.0;
+    const double pair_lambda = (mode == NONBONDED_14_PP) ? 1.0 : lambda;
 
     calculate_nonbonded_14_pair(
         ri,
@@ -140,23 +116,21 @@ __global__ void calc_nonbonded_14_force_kernel(
         aj_type.aii_1_4,
         ai_type.bii_1_4,
         aj_type.bii_1_4,
-        static_cast<nonbond_work_t>(d_topo.coulomb_constant),
-        static_cast<nonbond_work_t>(d_topo.el14_scale),
+        d_topo.coulomb_constant,
+        d_topo.el14_scale,
         d_topo.vdw_rule,
         pair_lambda,
         evdw,
         ecoul,
         dv);
 
-    const nonbond_work_t dx = static_cast<nonbond_work_t>(rj.x - ri.x);
-    const nonbond_work_t dy = static_cast<nonbond_work_t>(rj.y - ri.y);
-    const nonbond_work_t dz = static_cast<nonbond_work_t>(rj.z - ri.z);
-    atomicAdd(&d_dvelocities[ai].x, -dv * dx);
-    atomicAdd(&d_dvelocities[ai].y, -dv * dy);
-    atomicAdd(&d_dvelocities[ai].z, -dv * dz);
-    atomicAdd(&d_dvelocities[aj].x, dv * dx);
-    atomicAdd(&d_dvelocities[aj].y, dv * dy);
-    atomicAdd(&d_dvelocities[aj].z, dv * dz);
+    const double3 d = {rj.x - ri.x, rj.y - ri.y, rj.z - ri.z};
+    atomicAdd(&d_dvelocities[ai].x, -dv * d.x);
+    atomicAdd(&d_dvelocities[ai].y, -dv * d.y);
+    atomicAdd(&d_dvelocities[ai].z, -dv * d.z);
+    atomicAdd(&d_dvelocities[aj].x, dv * d.x);
+    atomicAdd(&d_dvelocities[aj].y, dv * d.y);
+    atomicAdd(&d_dvelocities[aj].z, dv * d.z);
 
     atomicAdd(&evdw_totals[mode], evdw);
     atomicAdd(&ecoul_totals[mode], ecoul);
diff --git a/src/core/cuda/src/cuda_nonbonded_force.cu b/src/core/cuda/src/cuda_nonbonded_force.cu
index ce3f73ae..432a7137 100644
--- a/src/core/cuda/src/cuda_nonbonded_force.cu
+++ b/src/core/cuda/src/cuda_nonbonded_force.cu
@@ -9,20 +9,6 @@ namespace CudaNonbondedForce {
 bool is_initialized = false;
 double *d_evdw_total, *d_ecoul_total;
 
-struct nonbond_vec_t {
-    nonbond_work_t x;
-    nonbond_work_t y;
-    nonbond_work_t z;
-};
-
-__device__ __forceinline__ nonbond_work_t nonbond_rsqrt(nonbond_work_t value) {
-#ifdef QDYN_SPFP
-    return rsqrtf(value);
-#else
-    return rsqrt(value);
-#endif
-}
-
 __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     x = (int)floorf((2 * n + 1 - sqrtf((2 * n + 1) * (2 * n + 1) - 8 * t)) * 0.5f);
     y = t - (x * n - (x * (x - 1) >> 1));
@@ -33,13 +19,7 @@ __device__ __forceinline__ void idx2xy(int n, int t, int& x, int& y) {
     y += x;
 }
 
-template <typename T>
-__device__ __forceinline__ T shfl_value(T v, int srcLane, unsigned mask = 0xffffffffu) {
-    return __shfl_sync(mask, v, srcLane);
-}
-
-template <>
-__device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mask) {
+__device__ __forceinline__ double shfl(double v, int srcLane, unsigned mask = 0xffffffffu) {
     int2 a = *reinterpret_cast<int2*>(&v);
     a.x = __shfl_sync(mask, a.x, srcLane);
     a.y = __shfl_sync(mask, a.y, srcLane);
@@ -47,9 +27,9 @@ __device__ __forceinline__ double shfl_value(double v, int srcLane, unsigned mas
 }
 
 __device__ __forceinline__ coord_t shfl_coord(coord_t v, int srcLane, unsigned mask = 0xffffffffu) {
-    v.x = shfl_value(v.x, srcLane, mask);
-    v.y = shfl_value(v.y, srcLane, mask);
-    v.z = shfl_value(v.z, srcLane, mask);
+    v.x = shfl(v.x, srcLane, mask);
+    v.y = shfl(v.y, srcLane, mask);
+    v.z = shfl(v.z, srcLane, mask);
     return v;
 }
 
@@ -57,23 +37,21 @@ __device__ void calculate_unforce_bound(
     const coord_t& x,
     const coord_t& y,
 
-    const real_t charge_product,
+    const double charge_product,
     const vdw_pair_param_t& pair_param,
 
-    const nonbond_work_t coulomb_constant,
+    const double coulomb_constant,
 
-    const nonbond_work_t scaling,
-    const nonbond_work_t lambda,
+    const double scaling,
+    const double lambda,
 
-    nonbond_work_t& evdw,
-    nonbond_work_t& ecoul,
-    nonbond_work_t& dv) {
-    const nonbond_work_t dx = static_cast<nonbond_work_t>(x.x - y.x);
-    const nonbond_work_t dy = static_cast<nonbond_work_t>(x.y - y.y);
-    const nonbond_work_t dz = static_cast<nonbond_work_t>(x.z - y.z);
-    const nonbond_work_t r = nonbond_rsqrt(dx * dx + dy * dy + dz * dz);
-    const nonbond_work_t r2 = r * r;
-    const nonbond_work_t r6 = r2 * r2 * r2;
+    double& evdw,
+    double& ecoul,
+    double& dv) {
+    double3 d = {x.x - y.x, x.y - y.y, x.z - y.z};
+    double r = rsqrt(d.x * d.x + d.y * d.y + d.z * d.z);
+    double r2 = r * r;
+    double r6 = r2 * r2 * r2;
     // double v_a = r6 * r6;
     // double v_b = r6;
     // ecoul = r;
@@ -82,10 +60,10 @@ __device__ void calculate_unforce_bound(
 
     ecoul = scaling * coulomb_constant * charge_product * r * lambda;
 
-    const nonbond_work_t v_a = static_cast<nonbond_work_t>(pair_param.a) * r6 * r6 * lambda;
-    const nonbond_work_t v_b = static_cast<nonbond_work_t>(pair_param.b) * r6 * lambda;
+    double v_a = pair_param.a * r6 * r6 * lambda;
+    double v_b = pair_param.b * r6 * lambda;
     evdw = v_a - v_b;
-    dv = r2 * (-ecoul - static_cast<nonbond_work_t>(12.0) * v_a + static_cast<nonbond_work_t>(6.0) * v_b);
+    dv = r2 * (-ecoul - 12.0 * v_a + 6.0 * v_b);
 }
 
 __global__ void calc_nonbonded_force_kernel(
@@ -94,7 +72,7 @@ __global__ void calc_nonbonded_force_kernel(
 
     const int* x_charges_types,
     const int* y_charges_types,
-    const real_t* charge_pair_products,
+    const double* charge_pair_products,
 
     const int* x_atypes_types,
     const int* y_atypes_types,
@@ -161,7 +139,7 @@ __global__ void calc_nonbonded_force_kernel(
     int x_atom_idx = (x_idx < nx) ? x_idx_list[x_idx] : -1;
     int y_atom_idx = (y_idx < ny) ? y_idx_list[y_idx] : -1;
 
-    coord_t invalid = {static_cast<real_t>(-1e9), static_cast<real_t>(-1e9), static_cast<real_t>(-1e9)};
+    coord_t invalid = {-1e9, -1e9, -1e9};
     coord_t x_coord = (x_atom_idx >= 0) ? d_coords[x_atom_idx] : invalid;
     coord_t y_coord = (y_atom_idx >= 0) ? d_coords[y_atom_idx] : invalid;
 
@@ -174,8 +152,8 @@ __global__ void calc_nonbonded_force_kernel(
     int x_catype_type_idx = (x_idx < nx) ? x_atypes_types[x_idx] : -1;
     int y_catype_type_idx = (y_idx < ny) ? y_atypes_types[y_idx] : -1;
 
-    nonbond_vec_t x_force = {0.0, 0.0, 0.0};
-    nonbond_vec_t y_force = {0.0, 0.0, 0.0};
+    double3 x_force = {0.0, 0.0, 0.0};
+    double3 y_force = {0.0, 0.0, 0.0};
 
     double evdw_sum = 0.0;
     double ecoul_sum = 0.0;
@@ -216,9 +194,9 @@ __global__ void calc_nonbonded_force_kernel(
         y_charge_type_idx = __shfl_sync(mask, y_charge_type_idx, src);
         y_catype_type_idx = __shfl_sync(mask, y_catype_type_idx, src);
 
-        y_force.x = shfl_value(y_force.x, src, mask);
-        y_force.y = shfl_value(y_force.y, src, mask);
-        y_force.z = shfl_value(y_force.z, src, mask);
+        y_force.x = shfl(y_force.x, src, mask);
+        y_force.y = shfl(y_force.y, src, mask);
+        y_force.z = shfl(y_force.z, src, mask);
     };
 
     if (disable_water_h_lj) {
@@ -230,15 +208,13 @@ __global__ void calc_nonbonded_force_kernel(
         }
     }
 
-    const nonbond_work_t kernel_lambda = static_cast<nonbond_work_t>(lambda);
-    const nonbond_work_t coulomb_constant = static_cast<nonbond_work_t>(d_topo.coulomb_constant);
     const int charge_pair_row = x_charge_type_idx * n_charge_types;
     const int pair_row = (x_catype_type_idx >= 0) ? x_catype_type_idx * n_catype_types : 0;
 
     for (int i = 0; i < 32; i++) {
         if (is_valid()) {
-            nonbond_work_t scaling = static_cast<nonbond_work_t>(1.0);
-            real_t charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
+            double scaling = 1.0;
+            double charge_product = charge_pair_products[charge_pair_row + y_charge_type_idx];
             vdw_pair_param_t pair_param = catype_pair_params[pair_row + y_catype_type_idx];
 
             // todo: Now the idx is wrong, should optimize it later
@@ -249,16 +225,16 @@ __global__ void calc_nonbonded_force_kernel(
             //     }
             // }
 
-            nonbond_work_t evdw = 0, ecoul = 0, dv = 0;
+            double evdw = 0, ecoul = 0, dv = 0;
 
             calculate_unforce_bound(
                 x_coord,
                 y_coord,
                 charge_product,
                 pair_param,
-                coulomb_constant,
+                d_topo.coulomb_constant,
                 scaling,
-                kernel_lambda,
+                lambda,
                 evdw,
                 ecoul,
                 dv);
@@ -266,16 +242,14 @@ __global__ void calc_nonbonded_force_kernel(
             evdw_sum += evdw;
             ecoul_sum += ecoul;
 
-            const nonbond_work_t dx = static_cast<nonbond_work_t>(x_coord.x - y_coord.x);
-            const nonbond_work_t dy = static_cast<nonbond_work_t>(x_coord.y - y_coord.y);
-            const nonbond_work_t dz = static_cast<nonbond_work_t>(x_coord.z - y_coord.z);
-            y_force.x -= dv * dx;
-            y_force.y -= dv * dy;
-            y_force.z -= dv * dz;
+            double3 d = {x_coord.x - y_coord.x, x_coord.y - y_coord.y, x_coord.z - y_coord.z};
+            y_force.x -= dv * d.x;
+            y_force.y -= dv * d.y;
+            y_force.z -= dv * d.z;
 
-            x_force.x += dv * dx;
-            x_force.y += dv * dy;
-            x_force.z += dv * dz;
+            x_force.x += dv * d.x;
+            x_force.y += dv * d.y;
+            x_force.z += dv * d.z;
         }
         do_shuffle();
     }
diff --git a/src/core/cuda/src/cuda_polx_water_force.cu b/src/core/cuda/src/cuda_polx_water_force.cu
index 13c37fbc..9b0eb667 100644
--- a/src/core/cuda/src/cuda_polx_water_force.cu
+++ b/src/core/cuda/src/cuda_polx_water_force.cu
@@ -46,7 +46,7 @@ __global__ void calc_polx_theta_and_shells(
     rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
     rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
-    rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z);
+    rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
 
     rmu.x /= rm;
     rmu.y /= rm;
@@ -55,7 +55,7 @@ __global__ void calc_polx_theta_and_shells(
     rcu.x = coords[wi].x - topo.solvent_center.x;
     rcu.y = coords[wi].y - topo.solvent_center.y;
     rcu.z = coords[wi].z - topo.solvent_center.z;
-    rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z);
+    rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
     rcu.x /= rc;
     rcu.y /= rc;
     rcu.z /= rc;
@@ -106,19 +106,18 @@ __global__ void calc_polx_water_forces_kernel(
     if (theta_val > M_PI) theta_val = M_PI;
 
     avtdum += theta[ii];
-    const double dtheta = theta[ii] - theta_val + wshells[is].theta_corr;
-    ener = .5 * md.polarisation_force * dtheta * dtheta;
+    ener = .5 * md.polarisation_force * pow(theta[ii] - theta_val + wshells[is].theta_corr, 2);
     // E_restraint.Upolx += ener;
     atomicAdd(energy, ener);
 
-    dv = md.polarisation_force * dtheta;
+    dv = md.polarisation_force * (theta[ii] - theta_val + wshells[is].theta_corr);
     wi = n_atoms_solute + 3 * ii;
 
     rmu.x = coords[wi + 1].x + coords[wi + 2].x - 2 * coords[wi].x;
     rmu.y = coords[wi + 1].y + coords[wi + 2].y - 2 * coords[wi].y;
     rmu.z = coords[wi + 1].z + coords[wi + 2].z - 2 * coords[wi].z;
 
-    rm = sqrt(rmu.x * rmu.x + rmu.y * rmu.y + rmu.z * rmu.z);
+    rm = sqrt(pow(rmu.x, 2) + pow(rmu.y, 2) + pow(rmu.z, 2));
 
     rmu.x /= rm;
     rmu.y /= rm;
@@ -127,7 +126,7 @@ __global__ void calc_polx_water_forces_kernel(
     rcu.x = coords[wi].x - topo.solvent_center.x;
     rcu.y = coords[wi].y - topo.solvent_center.y;
     rcu.z = coords[wi].z - topo.solvent_center.z;
-    rc = sqrt(rcu.x * rcu.x + rcu.y * rcu.y + rcu.z * rcu.z);
+    rc = sqrt(pow(rcu.x, 2) + pow(rcu.y, 2) + pow(rcu.z, 2));
     rcu.x /= rc;
     rcu.y /= rc;
     rcu.z /= rc;
diff --git a/src/core/cuda/src/cuda_pshell_force.cu b/src/core/cuda/src/cuda_pshell_force.cu
index 5221cb9e..a01fb536 100644
--- a/src/core/cuda/src/cuda_pshell_force.cu
+++ b/src/core/cuda/src/cuda_pshell_force.cu
@@ -34,7 +34,7 @@ __global__ void calc_pshell_force_kernel(
         dr.x = coords[i].x - coords_init[i].x;
         dr.y = coords[i].y - coords_init[i].y;
         dr.z = coords[i].z - coords_init[i].z;
-        r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
+        r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
         ener = 0.5 * k * r2;
         // printf("dr = %f %f %f\n", dr.x, dr.y, dr.z);
 
diff --git a/src/core/cuda/src/cuda_radix_water_force.cu b/src/core/cuda/src/cuda_radix_water_force.cu
index f037e9db..06f5f5a3 100644
--- a/src/core/cuda/src/cuda_radix_water_force.cu
+++ b/src/core/cuda/src/cuda_radix_water_force.cu
@@ -29,18 +29,18 @@ __global__ void calc_radix_water_forces_kernel(
     dr.x = coords[i].x - topo.solvent_center.x;
     dr.y = coords[i].y - topo.solvent_center.y;
     dr.z = coords[i].z - topo.solvent_center.z;
-    double b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
+    double b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
     double db = b - (topo.solvent_radius - shift);
 
     double ener, dv;
     if (db > 0) {
-        ener = 0.5 * md.radial_force * db * db - Dwmz;
+        ener = 0.5 * md.radial_force * pow(db, 2) - Dwmz;
         dv = md.radial_force * db / b;
     } else {
         if (b > 0.0) {
             double fexp = exp(awmz * db);
-            ener = Dwmz * (fexp * fexp - 2 * fexp);
-            dv = -2 * Dwmz * awmz * (fexp - fexp * fexp) / b;
+            ener = Dwmz * (pow(fexp, 2) - 2 * fexp);
+            dv = -2 * Dwmz * awmz * (fexp - pow(fexp, 2)) / b;
         } else {
             dv = 0;
             ener = 0;
@@ -91,6 +91,7 @@ void calc_radix_water_forces_host() {
                                                              d_dvelocities,
                                                              d_energy);
     check_cuda(cudaDeviceSynchronize());
+    host.dvelocities->download();
     check_cuda(cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost));
     host.E_restraint.Uradx += energy;
 }
diff --git a/src/core/cuda/src/cuda_restrang_force.cu b/src/core/cuda/src/cuda_restrang_force.cu
index b214aee9..eb0813f5 100644
--- a/src/core/cuda/src/cuda_restrang_force.cu
+++ b/src/core/cuda/src/cuda_restrang_force.cu
@@ -45,8 +45,8 @@ __global__ void calc_restrang_force_kernel(
         lambda = 1;
     }
 
-    r2ij = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
-    r2jk = dr2.x * dr2.x + dr2.y * dr2.y + dr2.z * dr2.z;
+    r2ij = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
+    r2jk = pow(dr2.x, 2) + pow(dr2.y, 2) + pow(dr2.z, 2);
 
     rij = sqrt(r2ij);
     rjk = sqrt(r2jk);
@@ -60,7 +60,7 @@ __global__ void calc_restrang_force_kernel(
     th = acos(cos_th);
     dth = th - to_radians_device(restrangs[ir].ang);
 
-    ener = .5 * restrangs[ir].k * dth * dth;
+    ener = .5 * restrangs[ir].k * pow(dth, 2);
     dv = lambda * restrangs[ir].k * dth;
 
     f1 = sin(th);
diff --git a/src/core/cuda/src/cuda_restrdis_force.cu b/src/core/cuda/src/cuda_restrdis_force.cu
index 14f9b466..9aacf977 100644
--- a/src/core/cuda/src/cuda_restrdis_force.cu
+++ b/src/core/cuda/src/cuda_restrdis_force.cu
@@ -40,7 +40,7 @@ __global__ void calc_restrdis_forces_kernel(
         lambda = 1;
     }
 
-    b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
+    b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
     if (b < restrdists[ir].d1) {
         db = b - restrdists[ir].d1;
     } else if (b > restrdists[ir].d2) {
@@ -50,7 +50,7 @@ __global__ void calc_restrdis_forces_kernel(
         return;
     }
 
-    ener = .5 * restrdists[ir].k * db * db;
+    ener = .5 * restrdists[ir].k * pow(db, 2);
     dv = lambda * restrdists[ir].k * db / b;
 
     atomicAdd(&dvelocities[j].x, dr.x * dv);
diff --git a/src/core/cuda/src/cuda_restrpos_force.cu b/src/core/cuda/src/cuda_restrpos_force.cu
index 695e2b33..5f479364 100644
--- a/src/core/cuda/src/cuda_restrpos_force.cu
+++ b/src/core/cuda/src/cuda_restrpos_force.cu
@@ -39,9 +39,9 @@ __global__ void calc_restrpos_forces_kernel(
         lambda = 1;
     }
 
-    x2 = dr.x * dr.x;
-    y2 = dr.y * dr.y;
-    z2 = dr.z * dr.z;
+    x2 = pow(dr.x, 2);
+    y2 = pow(dr.y, 2);
+    z2 = pow(dr.z, 2);
 
     ener = .5 * restrspos[ir].k.x * x2 + .5 * restrspos[ir].k.y * y2 + .5 * restrspos[ir].k.z * z2;
 
diff --git a/src/core/cuda/src/cuda_restrseq_force.cu b/src/core/cuda/src/cuda_restrseq_force.cu
index 71835e4e..b5db3552 100644
--- a/src/core/cuda/src/cuda_restrseq_force.cu
+++ b/src/core/cuda/src/cuda_restrseq_force.cu
@@ -46,7 +46,7 @@ __global__ void calc_restrseq_forces_kernel(
             dr.x /= n_ctr;
             dr.y /= n_ctr;
             dr.z /= n_ctr;
-            r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
+            r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
             ener = .5 * k * r2;
             atomicAdd(upres_energy, ener);
 
@@ -77,7 +77,7 @@ __global__ void calc_restrseq_forces_kernel(
             dr.x /= totmass;
             dr.y /= totmass;
             dr.z /= totmass;
-            r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
+            r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
             ener = .5 * k * r2;
             atomicAdd(upres_energy, ener);
 
@@ -100,7 +100,7 @@ __global__ void calc_restrseq_forces_kernel(
                 dr.y = coords[i].y - coords_init[i].y;
                 dr.z = coords[i].z - coords_init[i].z;
 
-                r2 = dr.x * dr.x + dr.y * dr.y + dr.z * dr.z;
+                r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
                 ener = .5 * k * r2;
                 atomicAdd(upres_energy, ener);
 
diff --git a/src/core/cuda/src/cuda_restrwall_force.cu b/src/core/cuda/src/cuda_restrwall_force.cu
index c928bb71..12d890ad 100644
--- a/src/core/cuda/src/cuda_restrwall_force.cu
+++ b/src/core/cuda/src/cuda_restrwall_force.cu
@@ -29,11 +29,11 @@ __global__ void calc_restrwall_forces_kernel(
             dr.y = coords[i].y - topo.solvent_center.y;
             dr.z = coords[i].z - topo.solvent_center.z;
 
-            b = sqrt(dr.x * dr.x + dr.y * dr.y + dr.z * dr.z);
+            b = sqrt(pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2));
             db = b - restrwalls[ir].d;
 
             if (db > 0) {
-                ener = .5 * k * db * db - restrwalls[ir].dMorse;
+                ener = .5 * k * pow(db, 2) - restrwalls[ir].dMorse;
                 dv = k * db / b;
             } else {
                 fexp = exp(restrwalls[ir].aMorse * db);
diff --git a/src/core/cuda/src/cuda_shake_constraints.cu b/src/core/cuda/src/cuda_shake_constraints.cu
index bda47e50..e9dfd051 100644
--- a/src/core/cuda/src/cuda_shake_constraints.cu
+++ b/src/core/cuda/src/cuda_shake_constraints.cu
@@ -48,7 +48,7 @@ __global__ void calc_shake_constraints_kernel(
                     xij.x = coords[ai].x - coords[aj].x;
                     xij.y = coords[ai].y - coords[aj].y;
                     xij.z = coords[ai].z - coords[aj].z;
-                    xij2 = xij.x * xij.x + xij.y * xij.y + xij.z * xij.z;
+                    xij2 = pow(xij.x, 2) + pow(xij.y, 2) + pow(xij.z, 2);
                     diff = shake_bonds[shake + i].dist2 - xij2;
                     if (fabs(diff) < shake_tol * shake_bonds[shake + i].dist2) {
                         shake_bonds[shake + i].ready = true;
@@ -86,7 +86,7 @@ __global__ void calc_shake_constraints_kernel(
                 xxij.x = xcoords[ai].x - xcoords[aj].x;
                 xxij.y = xcoords[ai].y - xcoords[aj].y;
                 xxij.z = xcoords[ai].z - xcoords[aj].z;
-                xxij2 = xxij.x * xxij.x + xxij.y * xxij.y + xxij.z * xxij.z;
+                xxij2 = pow(xxij.x, 2) + pow(xxij.y, 2) + pow(xxij.z, 2);
                 printf(">>> Shake failed, i = %d,j = %d, d = %f, d0 = %f", ai, aj, sqrt(xxij2), shake_bonds[shake + i].dist2);
             }
             return;
@@ -154,5 +154,6 @@ int calc_shake_constraints_host() {
         d_mol_shake_offset);
     cudaDeviceSynchronize();
     cudaMemcpy(&total_iterations_host, d_total_iterations, sizeof(int), cudaMemcpyDeviceToHost);
+    host.coords->download();
     return host.n_molecules == 0 ? 0 : total_iterations_host / host.n_molecules;
 }
diff --git a/src/core/cuda/src/cuda_temperature.cu b/src/core/cuda/src/cuda_temperature.cu
index baba687e..a02c6cf7 100644
--- a/src/core/cuda/src/cuda_temperature.cu
+++ b/src/core/cuda/src/cuda_temperature.cu
@@ -19,10 +19,7 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= n_atoms) return;
     double mass_i = catypes[atypes[idx].code - 1].m;
-    const double vx = velocities[idx].x;
-    const double vy = velocities[idx].y;
-    const double vz = velocities[idx].z;
-    double ener = .5 * mass_i * (vx * vx + vy * vy + vz * vz);
+    double ener = .5 * mass_i * (pow(velocities[idx].x, 2) + pow(velocities[idx].y, 2) + pow(velocities[idx].z, 2));
     bool is_solute = (idx < n_atoms_solute);
     bool is_excluded = excluded[idx];
 
diff --git a/src/core/cuda/src/cuda_torsion_force.cu b/src/core/cuda/src/cuda_torsion_force.cu
index 97b687a6..6ef7cd45 100644
--- a/src/core/cuda/src/cuda_torsion_force.cu
+++ b/src/core/cuda/src/cuda_torsion_force.cu
@@ -57,8 +57,8 @@ __global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsio
     rnk.y = -rjk.z * rkl.x + rjk.x * rkl.z;
     rnk.z = -rjk.x * rkl.y + rjk.y * rkl.x;
 
-    bj2inv = 1 / (rnj.x * rnj.x + rnj.y * rnj.y + rnj.z * rnj.z);
-    bk2inv = 1 / (rnk.x * rnk.x + rnk.y * rnk.y + rnk.z * rnk.z);
+    bj2inv = 1 / (pow(rnj.x, 2) + pow(rnj.y, 2) + pow(rnj.z, 2));
+    bk2inv = 1 / (pow(rnk.x, 2) + pow(rnk.y, 2) + pow(rnk.z, 2));
     bjinv = sqrt(bj2inv);
     bkinv = sqrt(bk2inv);