diff --git a/benchmark-qgpu/benchmark_correctness.py b/benchmark-qgpu/benchmark_correctness.py new file mode 100644 index 00000000..f8f4fd5c --- /dev/null +++ b/benchmark-qgpu/benchmark_correctness.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import io +import json +import math +import os +import shutil +import sys +from contextlib import redirect_stdout +from datetime import datetime +from pathlib import Path +from statistics import mean + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + +from benchmark_test import ( + ROOT, + command_text, + prepare_qgpu_input, + prepare_restart_with_qdyn_test, + resolve_fortran_bin, + resolve_qgpu_bin, + resolve_test_data, + run_timed, + write_md_input, +) + +sys.path.insert(0, str(ROOT / "src" / "Qgpu")) + +import compare # noqa: E402 +import energy as ENERGY # noqa: E402 + + +def default_collect_out(test_name): + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{test_name}_correctness" + + +def run_qgpu_once(qgpu_bin, prepared_data_dir, run_dir): + if run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True) + data_dir = run_dir / prepared_data_dir.name + shutil.copytree(prepared_data_dir, data_dir) + + stdout_path = run_dir / "qgpu.log" + stderr_path = run_dir / "qgpu.err" + args = [str(qgpu_bin), "--gpu", str(data_dir)] + return_code, wall_seconds = run_timed(args, ROOT, stdout_path, stderr_path) + if return_code != 0: + raise RuntimeError( + "QGPU correctness run failed. 
" + f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}" + ) + return data_dir, { + "command": command_text(args), + "return_code": return_code, + "wall_seconds": wall_seconds, + "stdout": str(stdout_path), + "stderr": str(stderr_path), + } + + +def load_qgpu_energy(qgpu_data_dir): + energy_path = Path(qgpu_data_dir) / "output" / "energies.csv" + if not energy_path.exists(): + raise FileNotFoundError(f"QGPU energy file not found: {energy_path}") + return ENERGY.Read_Energy(str(energy_path), 0).QDYN(), energy_path + + +def load_fortran_energy(fortran_dir): + q_data_path = Path(fortran_dir) / "Q_data.json" + if not q_data_path.exists(): + raise FileNotFoundError(f"Fortran energy JSON not found: {q_data_path}") + with open(q_data_path, encoding="utf-8") as json_f: + return json.load(json_f), q_data_path + + +def find_prepared_qgpu_dir(reference_dir): + prepare_root = Path(reference_dir) / "qgpu_prepare" / "TEST" + if not prepare_root.is_dir(): + raise FileNotFoundError(f"Prepared QGPU TEST directory not found: {prepare_root}") + candidates = sorted(path for path in prepare_root.iterdir() if path.is_dir() and (path / "md.csv").exists()) + if len(candidates) != 1: + shown = ", ".join(str(path) for path in candidates) + raise RuntimeError(f"Expected exactly one prepared QGPU directory under {prepare_root}; found: {shown}") + return candidates[0] + + +def copy_reference_inputs(reference_dir, out_dir): + reference_dir = Path(reference_dir).expanduser().resolve() + source_fortran_dir = reference_dir / "fortran_reference" + if not (source_fortran_dir / "Q_data.json").exists(): + raise FileNotFoundError(f"Fortran reference Q_data.json not found: {source_fortran_dir / 'Q_data.json'}") + + source_prepared_dir = find_prepared_qgpu_dir(reference_dir) + fortran_dir = out_dir / "fortran_reference" + prep_dir = out_dir / "qgpu_prepare" + prepared_data_dir = prep_dir / "TEST" / source_prepared_dir.name + + if fortran_dir.exists(): + 
shutil.rmtree(fortran_dir) + if prep_dir.exists(): + shutil.rmtree(prep_dir) + + shutil.copytree(source_fortran_dir, fortran_dir) + shutil.copytree(source_prepared_dir, prepared_data_dir) + return fortran_dir, prep_dir, prepared_data_dir, reference_dir + + +def build_correctness_rows(fortran_data, qgpu_data, tolerance): + compare.ENERGY_TOLERANCE = tolerance + rows = [] + frames = sorted(int(key) for key in fortran_data.keys() if key.isdigit()) + for frame in frames: + if frame >= len(qgpu_data): + continue + with redirect_stdout(io.StringIO()): + passed, fortran_values, qgpu_values = compare.compare_energies( + fortran_data[str(frame)], + qgpu_data[frame], + ) + for term, fortran_value, qgpu_value in zip(compare.header, fortran_values, qgpu_values): + if math.isnan(fortran_value) or math.isnan(qgpu_value): + continue + abs_error = abs(fortran_value - qgpu_value) + rel_error = abs_error / abs(fortran_value) if fortran_value != 0 else "" + rows.append( + { + "frame": frame, + "term": term, + "fortran": fortran_value, + "qgpu": qgpu_value, + "abs_error": abs_error, + "rel_error": rel_error, + "passed_tolerance": abs_error <= tolerance, + "frame_passed": passed, + } + ) + if not rows: + raise RuntimeError("No comparable energy rows were produced.") + return rows + + +def summarize_rows(rows, tolerance): + abs_errors = [float(row["abs_error"]) for row in rows] + by_term = {} + for row in rows: + by_term.setdefault(row["term"], []).append(float(row["abs_error"])) + + term_summary = [] + for term, values in sorted(by_term.items()): + term_summary.append( + { + "term": term, + "max_abs_error": max(values), + "mean_abs_error": mean(values), + "rmse": math.sqrt(mean([value * value for value in values])), + } + ) + + return { + "tolerance": tolerance, + "frames": sorted({int(row["frame"]) for row in rows}), + "terms": len(by_term), + "rows": len(rows), + "max_abs_error": max(abs_errors), + "mean_abs_error": mean(abs_errors), + "rmse": math.sqrt(mean([value * value for value 
in abs_errors])), + "passed": all(float(row["abs_error"]) <= tolerance for row in rows), + "term_summary": term_summary, + } + + +def write_outputs(rows, summary, out_dir, metadata): + terms_csv = out_dir / "correctness_terms.csv" + summary_json = out_dir / "correctness_summary.json" + + with open(terms_csv, "w", newline="", encoding="utf-8") as csv_f: + fieldnames = [ + "frame", + "term", + "fortran", + "qgpu", + "abs_error", + "rel_error", + "passed_tolerance", + "frame_passed", + ] + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + + payload = { + "created_at": datetime.now().isoformat(timespec="seconds"), + "metadata": metadata, + "summary": summary, + } + with open(summary_json, "w", encoding="utf-8") as json_f: + json.dump(payload, json_f, indent=2) + + return terms_csv, summary_json + + +def collect(args): + out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out(args.test) + out_dir.mkdir(parents=True, exist_ok=True) + + qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) + qgpu_run_dir = out_dir / "qgpu_run" + + if args.reference_dir: + print(f"Reusing Fortran/QGPU prepared reference from {args.reference_dir}") + fortran_dir, prep_dir, prepared_data_dir, reference_dir = copy_reference_inputs(args.reference_dir, out_dir) + prep_fortran_bin = None + else: + default_prep_fortran_bin = ( + ROOT / "src" / "q6" / "bin" / "q6" / "qdynp" + if args.prep_fortran_mpi_procs is not None + else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test" + ) + prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin or default_prep_fortran_bin) + data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) + + fortran_dir = out_dir / "fortran_reference" + prep_dir = out_dir / "qgpu_prepare" + fortran_dir.mkdir(parents=True, exist_ok=True) + + if args.prep_fortran_mpi_procs is None: + print(f"Preparing Fortran reference for {args.test}") + else: + print( + f"Preparing Fortran reference 
for {args.test} " + f"with {args.prep_fortran_mpi_procs} MPI rank(s)" + ) + write_md_input(data, fortran_dir) + prepare_restart_with_qdyn_test( + data, + prep_fortran_bin, + fortran_dir, + mpi_procs=args.prep_fortran_mpi_procs, + mpirun_bin=args.mpirun_bin, + mpirun_args=args.mpirun_args, + ) + + print("Preparing QGPU input") + prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir) + reference_dir = None + + print("Running QGPU correctness simulation") + qgpu_data_dir, qgpu_run = run_qgpu_once(qgpu_bin, prepared_data_dir, qgpu_run_dir) + + fortran_data, fortran_energy_path = load_fortran_energy(fortran_dir) + qgpu_data, qgpu_energy_path = load_qgpu_energy(qgpu_data_dir) + rows = build_correctness_rows(fortran_data, qgpu_data, args.tolerance) + summary = summarize_rows(rows, args.tolerance) + + terms_csv, summary_json = write_outputs( + rows, + summary, + out_dir, + { + "test": args.test, + "steps": args.steps, + "lambda": args.lambda_name, + "shake": args.shake, + "qgpu_bin": str(qgpu_bin), + "prep_fortran_bin": str(prep_fortran_bin) if prep_fortran_bin is not None else None, + "prep_fortran_mpi_procs": args.prep_fortran_mpi_procs, + "mpirun_bin": args.mpirun_bin, + "mpirun_args": args.mpirun_args, + "reference_dir": str(reference_dir) if reference_dir is not None else None, + "prepared_qgpu_input": str(prepared_data_dir), + "fortran_energy": str(fortran_energy_path), + "qgpu_energy": str(qgpu_energy_path), + "qgpu_run": qgpu_run, + }, + ) + + print(f"Terms CSV: {terms_csv}") + print(f"Summary JSON: {summary_json}") + print( + f"max |delta E| = {summary['max_abs_error']:.6g} kcal/mol; " + f"RMSE = {summary['rmse']:.6g}; passed = {summary['passed']}" + ) + return 0 + + +def load_rows(csv_path): + rows = [] + with open(csv_path, newline="", encoding="utf-8") as csv_f: + reader = csv.DictReader(csv_f) + for row in reader: + row["frame"] = int(row["frame"]) + row["fortran"] = float(row["fortran"]) + row["qgpu"] = float(row["qgpu"]) + row["abs_error"] = 
float(row["abs_error"]) + rows.append(row) + if not rows: + raise RuntimeError(f"No rows found in {csv_path}") + return rows + + +def select_term_rows(rows, term): + selected = [row for row in rows if row["term"] == term] + if not selected: + terms = ", ".join(sorted({row["term"] for row in rows})) + raise ValueError(f"Term '{term}' not found. Available terms: {terms}") + return sorted(selected, key=lambda row: row["frame"]) + + +def plot(args): + rows = load_rows(Path(args.csv).expanduser().resolve()) + selected = select_term_rows(rows, args.term) + + frames = [row["frame"] for row in selected] + fortran_values = [row["fortran"] for row in selected] + qgpu_values = [row["qgpu"] for row in selected] + abs_errors = [row["abs_error"] for row in selected] + rel_errors_pct = [ + (row["abs_error"] / abs(row["fortran"]) * 100.0) if row["fortran"] != 0 else 0.0 + for row in selected + ] + max_abs_error = max(abs_errors) + mean_abs_error = mean(abs_errors) + rmse = math.sqrt(mean([value * value for value in abs_errors])) + max_rel_error = max(rel_errors_pct) + mean_rel_error = mean(rel_errors_pct) + + if args.error_mode == "relative": + plotted_errors = rel_errors_pct + error_ylabel = "Relative error (%)" + tolerance = args.tolerance + tolerance_label = "rel. 
tolerance" + else: + plotted_errors = abs_errors + error_ylabel = "|delta E|" + tolerance = args.tolerance + tolerance_label = "tolerance" + + out_path = Path(args.out).expanduser().resolve() + out_path.parent.mkdir(parents=True, exist_ok=True) + + fig = plt.figure(figsize=(9.8, 4.2)) + grid = fig.add_gridspec(2, 2, width_ratios=[4.2, 1.45], height_ratios=[2.3, 1.3]) + ax_energy = fig.add_subplot(grid[0, 0]) + ax_error = fig.add_subplot(grid[1, 0], sharex=ax_energy) + ax_panel = fig.add_subplot(grid[:, 1]) + + ax_energy.plot(frames, fortran_values, color="#4a4a4a", linewidth=1.8, label="Fortran") + ax_energy.plot(frames, qgpu_values, color="#0b71c8", linestyle="--", linewidth=1.6, label="QGPU") + ax_energy.set_title(args.title, loc="left", fontsize=13, weight="bold", color="#113b5f") + ax_energy.set_ylabel(f"{args.term} (kcal/mol)") + ax_energy.grid(axis="y", color="#e5e8ee", linewidth=0.8) + ax_energy.legend(frameon=False, loc="best", fontsize=8) + ax_energy.spines["top"].set_visible(False) + ax_energy.spines["right"].set_visible(False) + + ax_error.plot(frames, plotted_errors, color="#d62728", linewidth=1.6) + ax_error.fill_between(frames, plotted_errors, color="#d62728", alpha=0.13) + if tolerance is not None: + ax_error.axhline(tolerance, color="#777777", linestyle=":", linewidth=1.0, label=tolerance_label) + ax_error.legend(frameon=False, loc="best", fontsize=8) + ax_error.set_xlabel("MD step") + ax_error.set_ylabel(error_ylabel) + ax_error.grid(axis="y", color="#e5e8ee", linewidth=0.8) + ax_error.spines["top"].set_visible(False) + ax_error.spines["right"].set_visible(False) + + ax_panel.set_facecolor("#eef5fd") + for spine in ax_panel.spines.values(): + spine.set_color("#8ab9ef") + ax_panel.set_xticks([]) + ax_panel.set_yticks([]) + if args.error_mode == "relative": + ax_panel.text(0.5, 0.84, "Consistency", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970") + ax_panel.text(0.5, 0.64, f"{max_rel_error:.3f}%", ha="center", va="center", 
fontsize=24, weight="bold", color="#003c7f") + ax_panel.text(0.5, 0.50, "max rel. error", ha="center", va="center", fontsize=10, color="#0b3970") + ax_panel.axhline(0.36, xmin=0.15, xmax=0.85, color="#8ab9ef", linewidth=0.8) + ax_panel.text(0.5, 0.25, f"mean {mean_rel_error:.3f}%", ha="center", va="center", fontsize=11, weight="bold", color="#0b3970") + ax_panel.text(0.5, 0.13, f"abs RMSE {rmse:.2e}", ha="center", va="center", fontsize=9, color="#0b3970") + else: + ax_panel.text(0.5, 0.82, "Agreement", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970") + ax_panel.text(0.5, 0.62, f"{max_abs_error:.2e}", ha="center", va="center", fontsize=22, weight="bold", color="#003c7f") + ax_panel.text(0.5, 0.48, "max |delta E|", ha="center", va="center", fontsize=10, color="#0b3970") + ax_panel.axhline(0.34, xmin=0.15, xmax=0.85, color="#8ab9ef", linewidth=0.8) + ax_panel.text(0.5, 0.23, f"RMSE {rmse:.2e}", ha="center", va="center", fontsize=11, weight="bold", color="#0b3970") + ax_panel.text(0.5, 0.12, f"mean {mean_abs_error:.2e}", ha="center", va="center", fontsize=10, color="#0b3970") + + fig.tight_layout() + fig.savefig(out_path, dpi=220) + plt.close(fig) + print(f"Plot written to: {out_path}") + return 0 + + +def positive_int(value): + parsed = int(value) + if parsed < 1: + raise argparse.ArgumentTypeError("must be >= 1") + return parsed + + +def nonnegative_float(value): + parsed = float(value) + if parsed < 0: + raise argparse.ArgumentTypeError("must be >= 0") + return parsed + + +def parse_args(): + parser = argparse.ArgumentParser(description="Collect and plot Fortran vs QGPU energy correctness.") + subparsers = parser.add_subparsers(dest="command", required=True) + + collect_parser = subparsers.add_parser("collect", help="Run a correctness benchmark and write CSV data.") + collect_parser.add_argument("--test", required=True, help="runTEST.py test name.") + collect_parser.add_argument("--steps", type=positive_int, required=True, help="MD steps.") + 
collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.") + collect_parser.add_argument("--shake", action="store_true", help="Enable shake.") + collect_parser.add_argument("--out", help="Output directory.") + collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.") + collect_parser.add_argument( + "--reference-dir", + help="Existing correctness result directory containing fortran_reference/ and qgpu_prepare/ to reuse.", + ) + collect_parser.add_argument( + "--prep-fortran-bin", + default=None, + help="Path to Fortran binary used to generate reference data. Defaults to qdynp with MPI, otherwise qdyn_test.", + ) + collect_parser.add_argument( + "--prep-fortran-mpi-procs", + type=positive_int, + default=None, + help="Run the Fortran reference preparation through mpirun with this many MPI ranks.", + ) + collect_parser.add_argument( + "--mpirun-bin", + default="mpirun", + help="MPI launcher to use with --prep-fortran-mpi-procs. Defaults to mpirun.", + ) + collect_parser.add_argument( + "--mpirun-args", + default=None, + help='Extra MPI launcher arguments, quoted as one string, e.g. 
"--bind-to core".', + ) + collect_parser.add_argument( + "--tolerance", + type=nonnegative_float, + default=1e-3, + help="Absolute energy tolerance in kcal/mol for pass/fail summary.", + ) + + plot_parser = subparsers.add_parser("plot", help="Plot correctness from correctness_terms.csv.") + plot_parser.add_argument("csv", help="correctness_terms.csv from collect.") + plot_parser.add_argument("--out", required=True, help="Output PNG path.") + plot_parser.add_argument("--term", default="total-Utot", help="Energy term to plot.") + plot_parser.add_argument( + "--title", + default="Long-Run Energy Consistency", + help="Plot title.", + ) + plot_parser.add_argument( + "--error-mode", + choices=["absolute", "relative"], + default="absolute", + help="Plot absolute kcal/mol error or relative percent error.", + ) + plot_parser.add_argument( + "--tolerance", + type=nonnegative_float, + default=None, + help="Optional horizontal tolerance line on the error panel. Units follow --error-mode.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + if args.command == "collect": + return collect(args) + if args.command == "plot": + return plot(args) + raise SystemExit(f"Unknown command: {args.command}") + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (FileNotFoundError, RuntimeError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/benchmark-qgpu/benchmark_nsday.py b/benchmark-qgpu/benchmark_nsday.py new file mode 100644 index 00000000..30ffbebc --- /dev/null +++ b/benchmark-qgpu/benchmark_nsday.py @@ -0,0 +1,533 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import os +import shutil +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path +from statistics import median + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as 
plt + +from benchmark_test import ( + ROOT, + TIME_STEP_NS, + command_text, + prepare_qgpu_input, + prepare_restart_with_qdyn_test, + resolve_fortran_bin, + resolve_qgpu_bin, + resolve_test_data, + write_md_input, +) + + +RESTART_INIT_STEPS = 1 + + +def read_steps_from_md_csv(data_dir): + md_path = Path(data_dir) / "md.csv" + if not md_path.exists(): + raise FileNotFoundError(f"md.csv not found: {md_path}") + with open(md_path, encoding="utf-8") as md_f: + for line in md_f: + if line.startswith("steps;"): + return int(line.strip().split(";", 1)[1]) + raise RuntimeError(f"Could not find steps in {md_path}") + + +def default_collect_out(label): + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_label = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in label) + return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{safe_label}_nsday" + + +def prepare_from_test(args, out_dir): + init_data = resolve_test_data(args.test, RESTART_INIT_STEPS, args.lambda_name, args.shake) + benchmark_data = resolve_test_data(args.test, args.steps, args.lambda_name, args.shake) + fortran_dir = out_dir / "prepare" / args.test / "fortran" + prep_dir = out_dir / "prepare" / args.test / "qgpu_prepare" + fortran_dir.mkdir(parents=True, exist_ok=True) + + print(f"Preparing QGPU restart for {args.test} with {RESTART_INIT_STEPS} MD step(s) in {out_dir}") + write_md_input(init_data, fortran_dir) + prepare_restart_with_qdyn_test(init_data, resolve_fortran_bin(args.prep_fortran_bin), fortran_dir) + + print(f"Writing QGPU benchmark input for {args.test} with {args.steps} MD step(s)") + write_md_input(benchmark_data, fortran_dir) + prepared_data_dir = prepare_qgpu_input(benchmark_data, fortran_dir, prep_dir) + prepared_steps = read_steps_from_md_csv(prepared_data_dir) + if prepared_steps != args.steps: + raise RuntimeError( + f"Prepared QGPU input has {prepared_steps} steps, expected {args.steps}: {prepared_data_dir}" + ) + return prepared_data_dir + + +def 
resolve_collect_data_dir(args, out_dir): + if args.data_dir: + data_dir = Path(args.data_dir).expanduser().resolve() + if not data_dir.is_dir(): + raise FileNotFoundError(f"data dir not found: {data_dir}") + steps = args.steps if args.steps is not None else read_steps_from_md_csv(data_dir) + return data_dir, steps + + if not args.test: + raise SystemExit("collect requires --test or --data-dir.") + if args.steps is None: + raise SystemExit("collect with --test requires --steps.") + data_dir = prepare_from_test(args, out_dir) + return data_dir, args.steps + + +def cleanup_successful_task_data(processes): + removed = 0 + for item in processes: + if item["return_code"] != 0: + continue + data_dir = item["data_dir"] + if data_dir.exists(): + shutil.rmtree(data_dir) + removed += 1 + return removed + + +def run_concurrency_batch( + qgpu_bin, + prepared_data_dir, + run_dir, + concurrency, + steps, + label, + repeat, + cleanup_run_data=False, +): + if run_dir.exists(): + shutil.rmtree(run_dir) + run_dir.mkdir(parents=True) + + command_template = None + launch_specs = [] + for index in range(1, concurrency + 1): + proc_dir = run_dir / f"proc_{index:03d}" + data_dir = proc_dir / prepared_data_dir.name + proc_dir.mkdir(parents=True) + shutil.copytree(prepared_data_dir, data_dir) + + stdout_path = proc_dir / "qgpu.log" + stderr_path = proc_dir / "qgpu.err" + args = [str(qgpu_bin), "--gpu", str(data_dir)] + command_template = command_text([str(qgpu_bin), "--gpu", ""]) + launch_specs.append( + { + "index": index, + "args": args, + "stdout": stdout_path, + "stderr": stderr_path, + "data_dir": data_dir, + "command": command_text(args), + } + ) + + processes = [] + process_rows = [] + batch_start = time.perf_counter() + for spec in launch_specs: + stdout_f = open(spec["stdout"], "w", encoding="utf-8") + stderr_f = open(spec["stderr"], "w", encoding="utf-8") + proc_start = time.perf_counter() + process = subprocess.Popen(spec["args"], cwd=ROOT, stdout=stdout_f, stderr=stderr_f) + 
processes.append( + { + "index": spec["index"], + "process": process, + "stdout_file": stdout_f, + "stderr_file": stderr_f, + "stdout": spec["stdout"], + "stderr": spec["stderr"], + "data_dir": spec["data_dir"], + "start": proc_start, + "command": spec["command"], + } + ) + + remaining = set(range(len(processes))) + while remaining: + for item_index in list(remaining): + item = processes[item_index] + return_code = item["process"].poll() + if return_code is None: + continue + item["return_code"] = return_code + item["end"] = time.perf_counter() + item["stdout_file"].close() + item["stderr_file"].close() + remaining.remove(item_index) + if remaining: + time.sleep(0.01) + + for item in processes: + wall_seconds = item["end"] - item["start"] + process_rows.append( + { + "label": label, + "concurrency": concurrency, + "repeat": repeat, + "process_index": item["index"], + "return_code": item["return_code"], + "process_wall_seconds": wall_seconds, + "process_ns_per_day": steps * TIME_STEP_NS * 86400 / wall_seconds if wall_seconds > 0 else "", + "stdout": str(item["stdout"]), + "stderr": str(item["stderr"]), + "command": item["command"], + } + ) + + batch_wall_seconds = time.perf_counter() - batch_start + failed = sum(1 for row in process_rows if row["return_code"] != 0) + + if cleanup_run_data: + removed_task_data = cleanup_successful_task_data(processes) + if removed_task_data: + print(f"Removed copied QGPU data for {removed_task_data} successful task(s) under {run_dir}") + + total_ns_per_day = concurrency * steps * TIME_STEP_NS * 86400 / batch_wall_seconds + mean_process_ns_per_day = ( + sum(float(row["process_ns_per_day"]) for row in process_rows if row["process_ns_per_day"] != "") + / len(process_rows) + ) + return { + "label": label, + "concurrency": concurrency, + "repeat": repeat, + "steps": steps, + "batch_wall_seconds": batch_wall_seconds, + "total_ns_per_day": total_ns_per_day, + "mean_process_ns_per_day": mean_process_ns_per_day, + "failed_processes": failed, 
+ "command": command_template, + }, process_rows + + +def write_collect_outputs(batch_rows, process_rows, out_dir, meta): + summary_csv = out_dir / "nsday_summary.csv" + process_csv = out_dir / "nsday_processes.csv" + meta_json = out_dir / "nsday_meta.json" + + with open(summary_csv, "w", newline="", encoding="utf-8") as csv_f: + fieldnames = [ + "label", + "concurrency", + "repeat", + "steps", + "batch_wall_seconds", + "total_ns_per_day", + "mean_process_ns_per_day", + "failed_processes", + "command", + ] + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(batch_rows) + + with open(process_csv, "w", newline="", encoding="utf-8") as csv_f: + fieldnames = [ + "label", + "concurrency", + "repeat", + "process_index", + "return_code", + "process_wall_seconds", + "process_ns_per_day", + "stdout", + "stderr", + "command", + ] + writer = csv.DictWriter(csv_f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(process_rows) + + with open(meta_json, "w", encoding="utf-8") as json_f: + json.dump(meta, json_f, indent=2) + + return summary_csv, process_csv, meta_json + + +def collect(args): + label = args.label or args.test or Path(args.data_dir).name + out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out(label) + out_dir.mkdir(parents=True, exist_ok=True) + qgpu_bin = resolve_qgpu_bin(args.qgpu_bin) + prepared_data_dir, steps = resolve_collect_data_dir(args, out_dir) + + batch_rows = [] + process_rows = [] + for concurrency in args.concurrency: + for repeat in range(1, args.repeat + 1): + run_dir = out_dir / "runs" / f"c{concurrency:03d}" / f"repeat_{repeat:03d}" + print(f"Running {label}: concurrency={concurrency}, repeat={repeat}") + batch_row, rows = run_concurrency_batch( + qgpu_bin=qgpu_bin, + prepared_data_dir=prepared_data_dir, + run_dir=run_dir, + concurrency=concurrency, + steps=steps, + label=label, + repeat=repeat, + cleanup_run_data=not args.keep_run_data, + ) + 
batch_rows.append(batch_row) + process_rows.extend(rows) + if batch_row["failed_processes"]: + summary_csv, process_csv, meta_json = write_collect_outputs( + batch_rows, + process_rows, + out_dir, + { + "created_at": datetime.now().isoformat(timespec="seconds"), + "label": label, + "qgpu_bin": str(qgpu_bin), + "prepared_data_dir": str(prepared_data_dir), + "steps": steps, + "keep_run_data": args.keep_run_data, + }, + ) + raise RuntimeError( + f"{batch_row['failed_processes']} process(es) failed at concurrency " + f"{concurrency}, repeat {repeat}. Summary: {summary_csv}; processes: {process_csv}; meta: {meta_json}" + ) + if args.pause_seconds > 0: + time.sleep(args.pause_seconds) + + summary_csv, process_csv, meta_json = write_collect_outputs( + batch_rows, + process_rows, + out_dir, + { + "created_at": datetime.now().isoformat(timespec="seconds"), + "label": label, + "test": args.test, + "data_dir": str(prepared_data_dir), + "qgpu_bin": str(qgpu_bin), + "steps": steps, + "concurrency": args.concurrency, + "repeat": args.repeat, + "keep_run_data": args.keep_run_data, + }, + ) + print(f"Summary CSV: {summary_csv}") + print(f"Process CSV: {process_csv}") + print(f"Metadata JSON: {meta_json}") + return 0 + + +def load_plot_series(csv_paths, metric): + series = {} + for csv_path in csv_paths: + with open(csv_path, newline="", encoding="utf-8") as csv_f: + reader = csv.DictReader(csv_f) + for row in reader: + if int(row.get("failed_processes") or 0) != 0: + continue + label = row["label"] + concurrency = int(row["concurrency"]) + value = float(row[metric]) + series.setdefault(label, {}).setdefault(concurrency, []).append(value) + + plotted = [] + for label, by_concurrency in sorted(series.items()): + xs = sorted(by_concurrency) + ys = [median(by_concurrency[x]) for x in xs] + plotted.append({"label": label, "xs": xs, "ys": ys}) + if not plotted: + raise RuntimeError("No successful rows found in input CSV file(s).") + return plotted + + +def plot(args): + metric = 
args.metric + metric_labels = { + "total_ns_per_day": "Total Throughput (ns/day)", + "mean_process_ns_per_day": "Mean Per-Process Throughput (ns/day)", + } + series = load_plot_series([Path(path).expanduser().resolve() for path in args.csv], metric) + out_path = Path(args.out).expanduser().resolve() + out_path.parent.mkdir(parents=True, exist_ok=True) + + fig, (ax, panel) = plt.subplots( + 1, + 2, + figsize=(9.5, 3.2), + gridspec_kw={"width_ratios": [4.4, 1.55]}, + ) + palette = ["#1f77b4", "#43a047", "#f57c00", "#7b1fa2", "#00838f"] + all_points = [] + all_xs = sorted({x for item in series for x in item["xs"]}) + for index, item in enumerate(series): + color = palette[index % len(palette)] + ax.plot(item["xs"], item["ys"], marker="o", linewidth=1.8, markersize=4.5, color=color, label=item["label"]) + for x, y in zip(item["xs"], item["ys"]): + all_points.append((y, item["label"], x)) + if len(item["xs"]) == 1: + x_offset = 0 + ha = "center" + elif x == item["xs"][0]: + x_offset = 6 + ha = "left" + elif x == item["xs"][-1]: + x_offset = -6 + ha = "right" + else: + x_offset = 0 + ha = "center" + ax.annotate( + f"{y:.1f}", + xy=(x, y), + xytext=(x_offset, 6), + textcoords="offset points", + ha=ha, + va="bottom", + fontsize=8, + weight="bold", + color="#253142", + ) + + y_values = [point[0] for point in all_points] + y_min = min(y_values) + y_max = max(y_values) + y_span = y_max - y_min + y_pad = max(y_span * 0.22, y_max * 0.035, 0.5) + ax.set_ylim(max(0, y_min - y_pad * 0.35), y_max + y_pad) + ax.set_xticks(all_xs) + if len(all_xs) == 1: + ax.set_xlim(all_xs[0] - 0.5, all_xs[0] + 0.5) + else: + x_pad = max((all_xs[-1] - all_xs[0]) * 0.06, 0.25) + ax.set_xlim(all_xs[0] - x_pad, all_xs[-1] + x_pad) + + ax.text(0.0, 1.14, args.title, transform=ax.transAxes, fontsize=13, weight="bold", color="#0f5f18") + ax.text(0.0, 1.07, args.subtitle, transform=ax.transAxes, fontsize=9, style="italic", color="#253142") + ax.set_xlabel("Concurrent Simulations") + 
# NOTE(review): tail of benchmark_nsday.py's plot(); the enclosing `def plot(...)`
# begins before this chunk, so only the visible remainder appears here.
    # Axis cosmetics: light horizontal grid behind the bars, no top/right spines.
    ax.set_ylabel(metric_labels[metric])
    ax.grid(axis="y", color="#e3e7ed", linewidth=0.8)
    ax.set_axisbelow(True)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    if len(series) > 1:
        ax.legend(frameon=False, loc="upper left", bbox_to_anchor=(0.0, 1.01), ncols=2, fontsize=8)

    # Best point overall, plus the best point from a *different* series label.
    best_points = sorted(all_points, reverse=True)
    best = best_points[0]
    second = None
    seen_labels = {best[1]}
    for point in best_points[1:]:
        if point[1] not in seen_labels:
            second = point
            break

    # Green "Up to N ns/day" call-out panel beside the chart.
    panel.set_facecolor("#edf7eb")
    for spine in panel.spines.values():
        spine.set_color("#a3d39b")
    panel.set_xticks([])
    panel.set_yticks([])
    panel.text(0.5, 0.80, "Up to", ha="center", va="center", fontsize=11, weight="bold", color="#14751c")
    panel.text(0.5, 0.55, f"{best[0]:.1f}", ha="center", va="center", fontsize=30, weight="bold", color="#14751c")
    panel.text(0.5, 0.35, "ns/day", ha="center", va="center", fontsize=13, weight="bold", color="#14751c")
    panel.text(0.5, 0.20, f"{best[1]}", ha="center", va="center", fontsize=9, color="#253142")
    if second is not None:
        # Runner-up (a different series) shown below a thin divider line.
        panel.axhline(0.12, xmin=0.12, xmax=0.88, color="#7fbf79", linewidth=0.8)
        panel.text(0.5, 0.05, f"{second[0]:.1f} ns/day", ha="center", va="bottom", fontsize=10, weight="bold", color="#14751c")

    fig.tight_layout(rect=(0, 0, 1, 0.9))
    fig.savefig(out_path, dpi=220)
    plt.close(fig)
    print(f"Plot written to: {out_path}")
    return 0


def positive_int(value):
    """argparse type: parse *value* as an int and require it to be >= 1."""
    parsed = int(value)
    if parsed < 1:
        raise argparse.ArgumentTypeError("must be >= 1")
    return parsed


def parse_args():
    """Build and parse the `collect` / `plot` sub-command CLI."""
    parser = argparse.ArgumentParser(description="Collect and plot QGPU concurrency throughput in ns/day.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    collect_parser = subparsers.add_parser("collect", help="Run QGPU concurrency benchmark and write CSV data.")
    collect_parser.add_argument("--test", help="runTEST.py test name to prepare and benchmark.")
    collect_parser.add_argument("--data-dir", help="Existing prepared QGPU input directory containing md.csv.")
    collect_parser.add_argument("--steps", type=positive_int, help="MD steps. Required with --test; optional with --data-dir.")
    collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
    collect_parser.add_argument("--shake", action="store_true", help="Enable shake when preparing from --test.")
    collect_parser.add_argument(
        "--concurrency",
        type=positive_int,
        nargs="+",
        default=[1, 2, 4, 8],
        help="Concurrent QGPU simulations to run.",
    )
    collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per concurrency level.")
    collect_parser.add_argument("--label", help="Series label written into the CSV, e.g. 'A100 (thrombin)'.")
    collect_parser.add_argument("--out", help="Output directory.")
    collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")
    collect_parser.add_argument(
        "--prep-fortran-bin",
        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
        help="Path to qdyn_test used only when preparing from --test.",
    )
    collect_parser.add_argument("--pause-seconds", type=float, default=0.0, help="Pause between batches.")
    collect_parser.add_argument(
        "--keep-run-data",
        action="store_true",
        help=(
            "Keep per-task copied QGPU input/output directories. By default successful task data is "
            "deleted after logs and timing are recorded."
        ),
    )

    plot_parser = subparsers.add_parser("plot", help="Plot ns/day vs concurrency from one or more CSV files.")
    plot_parser.add_argument("csv", nargs="+", help="One or more nsday_summary.csv files from collect.")
    plot_parser.add_argument("--out", required=True, help="Output PNG path.")
    plot_parser.add_argument(
        "--metric",
        choices=["total_ns_per_day", "mean_process_ns_per_day"],
        default="total_ns_per_day",
        help="Y-axis metric.",
    )
    plot_parser.add_argument("--title", default="Multi-System Concurrency (MPS)", help="Plot title.")
    plot_parser.add_argument(
        "--subtitle",
        default="Total simulation throughput at different concurrency levels",
        help="Plot subtitle.",
    )
    return parser.parse_args()


def main():
    """Dispatch to the selected sub-command and return its exit status."""
    args = parse_args()
    if args.command == "collect":
        return collect(args)
    if args.command == "plot":
        return plot(args)
    raise SystemExit(f"Unknown command: {args.command}")


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except (FileNotFoundError, RuntimeError, ValueError) as exc:
        # Expected operational failures get a short message instead of a traceback.
        print(f"ERROR: {exc}", file=sys.stderr)
        raise SystemExit(1)
diff --git a/benchmark-qgpu/benchmark_report.html.j2 b/benchmark-qgpu/benchmark_report.html.j2
index 61c3a634..6cae93c7 100644
--- a/benchmark-qgpu/benchmark_report.html.j2
+++ b/benchmark-qgpu/benchmark_report.html.j2
@@ -23,9 +23,9 @@
 Logs root: {{ logs_root }}
-

Simulation performance (ns/day)

+

Simulation throughput (total ns/day)

-
Simulated nanoseconds per wall-clock day.
+
Total simulated nanoseconds per wall-clock day across all concurrent processes.
@@ -209,8 +209,8 @@ function lineChart(containerId, cfg) { }); lineChart("nsday-chart", { - xLabel: "Processes", yLabel: "ns/day", xs: payload.procs, - series: [{ label: "Performance (ns/day)", ys: payload.ns_per_day }] + xLabel: "Processes", yLabel: "total ns/day", xs: payload.procs, + series: [{ label: "Total throughput (ns/day)", ys: payload.ns_per_day }] }); @@ -233,7 +233,7 @@ function lineChart(containerId, cfg) { GPU util mean (%)GPU util peak (%) VRAM util mean (%)VRAM util peak (%) Speedup (×) - ns/day + Total ns/day `; table.appendChild(thead); diff --git a/benchmark-qgpu/benchmark_report.py b/benchmark-qgpu/benchmark_report.py index 2d393147..c5a85fb4 100644 --- a/benchmark-qgpu/benchmark_report.py +++ b/benchmark-qgpu/benchmark_report.py @@ -85,6 +85,7 @@ def fget(k): mem_means = [v["mem_util_mean"] for v in vals if math.isfinite(v["mem_util_mean"])] mem_peaks = [v["mem_util_peak"] for v in vals if math.isfinite(v["mem_util_peak"])] tmp_ns_per_day = [v["ns_per_day"] for v in vals if math.isfinite(v["ns_per_day"])] + total_ns_per_day = statistics.mean(tmp_ns_per_day) * p if tmp_ns_per_day else float("nan") rc_bad = sum(1 for v in vals if v["rc"] != 0) @@ -99,7 +100,7 @@ def fget(k): gpu_util_peak.append(statistics.mean(util_peak) if util_peak else float("nan")) util_mem_mean.append(statistics.mean(mem_means) if mem_means else float("nan")) util_mem_peak.append(statistics.mean(mem_peaks) if mem_peaks else float("nan")) - ns_per_day.append(statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan")) + ns_per_day.append(total_ns_per_day) Tn = max(walls) if walls else float("nan") @@ -126,7 +127,7 @@ def fget(k): "vram_util_peak": statistics.mean(mem_peaks) if mem_peaks else float("nan"), "Tn": Tn, "speedup": speedup, - "ns_per_day": statistics.mean(tmp_ns_per_day) if tmp_ns_per_day else float("nan"), + "ns_per_day": total_ns_per_day, }) @@ -156,4 +157,3 @@ def fget(k): with open(out_html, "w", encoding="utf-8") as f: f.write(html_out) print(f"Report 
written to: {out_html}") - diff --git a/benchmark-qgpu/benchmark_run.py b/benchmark-qgpu/benchmark_run.py index b119423b..7e0c396d 100644 --- a/benchmark-qgpu/benchmark_run.py +++ b/benchmark-qgpu/benchmark_run.py @@ -350,7 +350,12 @@ def _get(d, dotted, default=None): def run(args): data_dir = os.path.expanduser(args.data_dir) # e.g., TEST/water bin_path = os.path.expanduser(args.bin) # e.g., /path/to/qdyn - max_procs = int(args.max_processes) + if getattr(args, "concurrency", None): + concurrency = sorted(dict.fromkeys(int(value) for value in args.concurrency)) + elif args.max_processes is not None: + concurrency = list(range(1, int(args.max_processes) + 1)) + else: + raise ValueError("Pass --concurrency or --max_processes.") if not os.path.isdir(data_dir): raise FileNotFoundError(f"data_dir not found: {data_dir}") @@ -386,7 +391,7 @@ def run(args): os.makedirs(logs_dir, exist_ok=True) work(1, logs_dir, f'"{bin_path}" "{data_dir}"', steps) - for process_num in range(1, max_procs + 1): + for process_num in concurrency: print(f"Will run {process_num} processes in parallel:") logs_dir = os.path.join(current_dir, f"benchmark_logs/{process_num:02d}_procs") os.makedirs(logs_dir, exist_ok=True) @@ -405,4 +410,3 @@ def run(args): # generate report out_html = os.path.join(current_dir, "benchmark_report.html") make_html_report(logs_root, out_html) - diff --git a/benchmark-qgpu/benchmark_system_scaling.py b/benchmark-qgpu/benchmark_system_scaling.py new file mode 100644 index 00000000..650c3d4c --- /dev/null +++ b/benchmark-qgpu/benchmark_system_scaling.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import math +import os +import shutil +import sys +from datetime import datetime +from pathlib import Path +from statistics import median + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + +from benchmark_test import ( + ROOT, + 
# NOTE(review): de-mangled payload of the new file
# benchmark-qgpu/benchmark_system_scaling.py; this continues the
# `from benchmark_test import (` list opened on the previous line.
    ns_per_day,
    prepare_qgpu_input,
    prepare_restart_with_qdyn_test,
    resolve_fortran_bin,
    resolve_qgpu_bin,
    resolve_test_data,
    run_fortran_repeats,
    run_qgpu_repeats,
    write_md_input,
)
from benchmark_nsday import run_concurrency_batch


# MD steps used only to generate a restart in --gpu-only mode.
RESTART_INIT_STEPS = 1


def default_collect_out():
    """Return a timestamped default output directory for `collect`."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_system_scaling"


def count_atoms(prepared_data_dir):
    """Read the atom count from the first line of the prepared coords.csv."""
    coords_path = Path(prepared_data_dir) / "coords.csv"
    if not coords_path.exists():
        raise FileNotFoundError(f"coords.csv not found: {coords_path}")
    with open(coords_path, encoding="utf-8") as coords_f:
        return int(coords_f.readline().strip())


def successful_times(records):
    """Wall-clock seconds of the records whose return code is 0."""
    return [float(record["wall_seconds"]) for record in records if int(record["return_code"]) == 0]


def parse_optional_float(value):
    """Parse a CSV cell as float; empty/missing cells become NaN."""
    if value in (None, ""):
        return float("nan")
    return float(value)


def write_raw_records(records, out_dir):
    """Write per-run raw timing records to system_scaling_raw.csv; return the path."""
    path = out_dir / "system_scaling_raw.csv"
    fieldnames = [
        "test",
        "runner",
        "repeat",
        "command",
        "return_code",
        "wall_seconds",
        "steps",
        "ns_per_day",
        "stdout",
        "stderr",
    ]
    with open(path, "w", newline="", encoding="utf-8") as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    return path


def write_qgpu_concurrency_records(records, out_dir):
    """Write per-batch concurrency records to system_scaling_qgpu_concurrency.csv; return the path."""
    path = out_dir / "system_scaling_qgpu_concurrency.csv"
    fieldnames = [
        "test",
        "label",
        "concurrency",
        "repeat",
        "steps",
        "batch_wall_seconds",
        "total_ns_per_day",
        "mean_process_ns_per_day",
        "failed_processes",
        "command",
    ]
    with open(path, "w", newline="", encoding="utf-8") as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    return path


def write_summary(rows, out_dir, metadata):
    """Write the per-system summary CSV and the metadata JSON; return both paths."""
    summary_csv = out_dir / "system_scaling.csv"
    meta_json = out_dir / "system_scaling_meta.json"

    fieldnames = [
        "test",
        "atoms",
        "steps",
        "fortran_wall_median_s",
        "qgpu_wall_median_s",
        "fortran_ns_per_day",
        "qgpu_ns_per_day",
        "qgpu_best_concurrency",
        "speedup_x",
        "fortran_repeats",
        "qgpu_repeats",
    ]
    with open(summary_csv, "w", newline="", encoding="utf-8") as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    with open(meta_json, "w", encoding="utf-8") as json_f:
        json.dump(metadata, json_f, indent=2)

    return summary_csv, meta_json


def cleanup_test_artifacts(out_dir, test_name):
    """Delete the per-test intermediate run directory, if it exists."""
    test_dir = Path(out_dir) / test_name
    if test_dir.exists():
        shutil.rmtree(test_dir)
        print(f"Removed intermediate run data: {test_dir}")


def run_qgpu_concurrency_sweep(args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir):
    """Run QGPU batches for every requested concurrency level and repeat.

    Returns (batch_rows, process_rows); raises RuntimeError as soon as any
    process in a batch fails.
    """
    batch_rows = []
    process_rows = []
    for concurrency in args.concurrency:
        for repeat in range(1, args.repeat + 1):
            run_dir = qgpu_runs_dir / f"c{concurrency:03d}" / f"repeat_{repeat:03d}"
            print(f"Running QGPU for {test_name}: concurrency={concurrency}, repeat={repeat}")
            batch_row, rows = run_concurrency_batch(
                qgpu_bin=qgpu_bin,
                prepared_data_dir=prepared_data_dir,
                run_dir=run_dir,
                concurrency=concurrency,
                steps=args.steps,
                label=test_name,
                repeat=repeat,
            )
            batch_row["test"] = test_name
            batch_rows.append(batch_row)
            process_rows.extend(rows)
            if batch_row["failed_processes"]:
                # Keep run_dir for post-mortem when something failed.
                raise RuntimeError(
                    f"{batch_row['failed_processes']} QGPU process(es) failed for {test_name} "
                    f"at concurrency {concurrency}, repeat {repeat}. Logs are under {run_dir}"
                )
            if not args.keep_run_data:
                shutil.rmtree(run_dir)
    return batch_rows, process_rows


def collect_one_test(args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin):
    """Benchmark one test system with Fortran (unless --gpu-only) and QGPU.

    Returns (summary_row_or_None, raw_records, qgpu_concurrency_rows); the
    first element is None when any runner produced no successful timing.
    """
    test_dir = out_dir / test_name
    fortran_dir = test_dir / "fortran"
    prep_dir = test_dir / "qgpu_prepare"
    qgpu_runs_dir = test_dir / "qgpu_runs"
    fortran_dir.mkdir(parents=True, exist_ok=True)

    data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake)
    fortran_records = []
    fortran_times = []

    if args.gpu_only:
        # Short qdyn_test run just to produce restart data, then rewrite the
        # MD input for the full benchmark step count.
        init_data = resolve_test_data(test_name, RESTART_INIT_STEPS, args.lambda_name, args.shake)
        print(f"Preparing QGPU restart for {test_name} with {RESTART_INIT_STEPS} MD step(s)")
        write_md_input(init_data, fortran_dir)
        prepare_restart_with_qdyn_test(init_data, prep_fortran_bin, fortran_dir)

        print(f"Writing QGPU benchmark input for {test_name} with {args.steps} MD step(s)")
        write_md_input(data, fortran_dir)
    else:
        print(f"Preparing {test_name}")
        write_md_input(data, fortran_dir)

        print(f"Running Fortran qdyn for {test_name} ({args.repeat} repeat(s))")
        fortran_records, fortran_ok = run_fortran_repeats(data, fortran_bin, fortran_dir, args.repeat, args.steps)
        if not fortran_ok:
            return None, fortran_records, []
        fortran_times = successful_times(fortran_records)

        print(f"Preparing QGPU input for {test_name}")
        prepare_restart_with_qdyn_test(data, prep_fortran_bin, fortran_dir)

    prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)
    atoms = count_atoms(prepared_data_dir)

    qgpu_concurrency_rows = []
    if args.concurrency:
        qgpu_records = []
        qgpu_concurrency_rows, _ = run_qgpu_concurrency_sweep(
            args, test_name, qgpu_bin, prepared_data_dir, qgpu_runs_dir
        )
    else:
        print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
        qgpu_records = run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)

    qgpu_times = successful_times(qgpu_records)
    if args.concurrency:
        # Summary uses the batch with the highest total throughput.
        successful_batches = [row for row in qgpu_concurrency_rows if int(row["failed_processes"]) == 0]
        if not successful_batches:
            return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
        best_qgpu = max(successful_batches, key=lambda row: float(row["total_ns_per_day"]))
        qgpu_wall = float(best_qgpu["batch_wall_seconds"])
        qgpu_ns_day = float(best_qgpu["total_ns_per_day"])
        qgpu_best_concurrency = int(best_qgpu["concurrency"])
        qgpu_repeat_count = len(qgpu_concurrency_rows)
    else:
        if not qgpu_times:
            return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows
        qgpu_wall = median(qgpu_times)
        qgpu_ns_day = ns_per_day(args.steps, qgpu_wall)
        qgpu_best_concurrency = 1
        qgpu_repeat_count = len(qgpu_records)

    if not args.gpu_only and not fortran_times:
        return None, [*fortran_records, *qgpu_records], qgpu_concurrency_rows

    # Fortran columns stay empty strings in --gpu-only mode.
    fortran_wall = median(fortran_times) if fortran_times else None
    fortran_ns_day = ns_per_day(args.steps, fortran_wall) if fortran_wall is not None else None
    row = {
        "test": test_name,
        "atoms": atoms,
        "steps": args.steps,
        "fortran_wall_median_s": fortran_wall if fortran_wall is not None else "",
        "qgpu_wall_median_s": qgpu_wall,
        "fortran_ns_per_day": fortran_ns_day if fortran_ns_day is not None else "",
        "qgpu_ns_per_day": qgpu_ns_day,
        "qgpu_best_concurrency": qgpu_best_concurrency,
        "speedup_x": qgpu_ns_day / fortran_ns_day if fortran_ns_day is not None and fortran_ns_day > 0 else "",
        "fortran_repeats": len(fortran_records),
        "qgpu_repeats": qgpu_repeat_count,
    }
    return row, [*fortran_records, *qgpu_records], qgpu_concurrency_rows


def collect(args):
    """Run the benchmark for every --test and persist CSV/JSON outputs.

    Partial results are flushed after each test and again in the `finally`
    block so data survives a crash mid-sweep.
    """
    out_dir = Path(args.out).expanduser().resolve() if args.out else default_collect_out()
    out_dir.mkdir(parents=True, exist_ok=True)
    fortran_bin = None if args.gpu_only else resolve_fortran_bin(args.fortran_bin)
    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)

    rows = []
    raw_records = []
    qgpu_concurrency_records = []
    try:
        for test_name in args.test:
            row, records, concurrency_records = collect_one_test(
                args, test_name, out_dir, fortran_bin, prep_fortran_bin, qgpu_bin
            )
            raw_records.extend(records)
            qgpu_concurrency_records.extend(concurrency_records)
            # Progressive flush after every test.
            write_raw_records(raw_records, out_dir)
            if args.concurrency:
                write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir)
            if row is not None:
                rows.append(row)
                write_summary(
                    rows,
                    out_dir,
                    {
                        "created_at": datetime.now().isoformat(timespec="seconds"),
                        "tests": args.test,
                        "steps": args.steps,
                        "repeat": args.repeat,
                        "gpu_only": args.gpu_only,
                        "concurrency": args.concurrency,
                        "keep_run_data": args.keep_run_data,
                        "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
                        "prep_fortran_bin": str(prep_fortran_bin),
                        "qgpu_bin": str(qgpu_bin),
                    },
                )
            if not args.keep_run_data:
                cleanup_test_artifacts(out_dir, test_name)
    finally:
        # Always leave the raw data on disk, even on failure.
        raw_path = write_raw_records(raw_records, out_dir)
        concurrency_path = (
            write_qgpu_concurrency_records(qgpu_concurrency_records, out_dir) if args.concurrency else None
        )

    failures = [record for record in raw_records if int(record["return_code"]) != 0]
    if failures:
        first = failures[0]
        raise RuntimeError(
            f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. "
            f"Logs: stdout={first['stdout']} stderr={first['stderr']}; raw CSV: {raw_path}"
        )

    # NOTE(review): this metadata dict duplicates the one written inside the
    # loop above — consider hoisting it into a helper.
    summary_csv, meta_json = write_summary(
        rows,
        out_dir,
        {
            "created_at": datetime.now().isoformat(timespec="seconds"),
            "tests": args.test,
            "steps": args.steps,
            "repeat": args.repeat,
            "gpu_only": args.gpu_only,
            "concurrency": args.concurrency,
            "keep_run_data": args.keep_run_data,
            "fortran_bin": str(fortran_bin) if fortran_bin is not None else None,
            "prep_fortran_bin": str(prep_fortran_bin),
            "qgpu_bin": str(qgpu_bin),
        },
    )
    print(f"Summary CSV: {summary_csv}")
    print(f"Raw CSV: {raw_path}")
    if concurrency_path is not None:
        print(f"QGPU concurrency CSV: {concurrency_path}")
    print(f"Metadata JSON: {meta_json}")
    return 0


def load_rows(csv_path):
    """Load system_scaling.csv rows, coercing numeric columns (NaN when empty)."""
    rows = []
    with open(csv_path, newline="", encoding="utf-8") as csv_f:
        reader = csv.DictReader(csv_f)
        for row in reader:
            parsed = dict(row)
            for key in [
                "atoms",
                "steps",
                "fortran_wall_median_s",
                "qgpu_wall_median_s",
                "fortran_ns_per_day",
                "qgpu_ns_per_day",
                "qgpu_best_concurrency",
                "speedup_x",
            ]:
                parsed[key] = parse_optional_float(parsed.get(key))
            rows.append(parsed)
    if not rows:
        raise RuntimeError(f"No rows found in {csv_path}")
    return rows


def fmt_atoms(atoms):
    """Human-readable atom count, e.g. 12.3k atoms."""
    atoms = int(atoms)
    if atoms >= 1000:
        return f"{atoms / 1000:.1f}k atoms"
    return f"{atoms} atoms"


def annotate_bars(ax, bars, formatter):
    """Write `formatter(height)` centered above each bar."""
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            height,
            formatter(height),
            ha="center",
            va="bottom",
            fontsize=8,
            weight="bold",
        )


def plot_speedup(rows, out_path, title):
    """Bar chart of QGPU speedup vs Fortran plus a 'Best' call-out panel."""
    if not any(math.isfinite(row["speedup_x"]) for row in rows):
        raise RuntimeError("speedup plot requires Fortran data. Use --metric nsday for --gpu-only results.")
    labels = [row["test"] for row in rows]
    speedups = [row["speedup_x"] for row in rows]
    atoms = [row["atoms"] for row in rows]

    fig, (ax, panel) = plt.subplots(
        1,
        2,
        figsize=(9.2, 3.3),
        gridspec_kw={"width_ratios": [4.3, 1.55]},
    )
    x = range(len(rows))
    bars = ax.bar(x, speedups, color="#0b71c8", width=0.62)
    annotate_bars(ax, bars, lambda value: f"{value:.1f}x")
    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
    ax.set_ylabel("Speedup vs Fortran (x)")
    ax.set_xticks(list(x))
    ax.set_xticklabels(labels)
    ax.grid(axis="y", color="#e5e8ee", linewidth=0.8)
    ax.set_axisbelow(True)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # Atom counts under the x tick labels.
    for xpos, atom_count in zip(x, atoms):
        ax.text(xpos, -0.08, fmt_atoms(atom_count), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8)

    best = max(rows, key=lambda row: row["speedup_x"])
    panel.set_facecolor("#eef5fd")
    for spine in panel.spines.values():
        spine.set_color("#8ab9ef")
    panel.set_xticks([])
    panel.set_yticks([])
    panel.text(0.5, 0.80, "Best", ha="center", va="center", fontsize=12, weight="bold", color="#0b3970")
    panel.text(0.5, 0.55, f"{best['speedup_x']:.1f}x", ha="center", va="center", fontsize=30, weight="bold", color="#003c7f")
    panel.text(0.5, 0.35, "speedup", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
    panel.text(0.5, 0.18, best["test"], ha="center", va="center", fontsize=10, color="#0b3970")
    panel.text(0.5, 0.08, fmt_atoms(best["atoms"]), ha="center", va="center", fontsize=9, color="#0b3970")

    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)


def plot_nsday(rows, out_path, title):
    """Grouped bars of Fortran vs QGPU ns/day (QGPU-only when no CPU data)."""
    labels = [row["test"] for row in rows]
    x = list(range(len(rows)))
    width = 0.34

    fig, ax = plt.subplots(figsize=(8.6, 3.5))
    fortran = [row["fortran_ns_per_day"] for row in rows]
    qgpu = [row["qgpu_ns_per_day"] for row in rows]
    has_fortran = any(math.isfinite(value) for value in fortran)
    has_concurrency = any(math.isfinite(row["qgpu_best_concurrency"]) and row["qgpu_best_concurrency"] > 1 for row in rows)
    qgpu_label = "QGPU best total" if has_concurrency else "QGPU"
    if has_fortran:
        bars_cpu = ax.bar([i - width / 2 for i in x], fortran, width, label="Fortran CPU", color="#9b9b9b")
        bars_gpu = ax.bar([i + width / 2 for i in x], qgpu, width, label=qgpu_label, color="#0b71c8")
        # NOTE(review): only the CPU bars are value-annotated in this branch;
        # confirm whether the GPU bars should be annotated too.
        annotate_bars(ax, bars_cpu, lambda value: f"{value:.1f}")
    else:
        bars_gpu = ax.bar(x, qgpu, width * 1.55, label=qgpu_label, color="#0b71c8")
        annotate_bars(ax, bars_gpu, lambda value: f"{value:.1f}")
    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
    ax.set_ylabel("Best total ns/day" if has_concurrency else "ns/day")
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.grid(axis="y", color="#e5e8ee", linewidth=0.8)
    ax.set_axisbelow(True)
    ax.legend(frameon=False, loc="best")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    for xpos, row in zip(x, rows):
        ax.text(xpos, -0.08, fmt_atoms(row["atoms"]), transform=ax.get_xaxis_transform(), ha="center", va="top", fontsize=8)

    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)


def plot_atoms(rows, out_path, title):
    """Line plot of speedup (or QGPU ns/day) as a function of system size."""
    fig, ax = plt.subplots(figsize=(6.5, 3.8))
    xs = [row["atoms"] for row in rows]
    has_speedup = any(math.isfinite(row["speedup_x"]) for row in rows)
    value_key = "speedup_x" if has_speedup else "qgpu_ns_per_day"
    ys = [row[value_key] for row in rows]
    ax.plot(xs, ys, color="#0b71c8", marker="o", linewidth=1.8)
    for row in rows:
        suffix = f"{row['speedup_x']:.1f}x" if has_speedup else f"{row['qgpu_ns_per_day']:.1f} ns/day"
        ax.text(row["atoms"], row[value_key], f" {row['test']} ({suffix})", va="center", fontsize=8)
    ax.set_title(title, loc="left", fontsize=13, weight="bold", color="#113b5f")
    ax.set_xlabel("Atoms")
    ax.set_ylabel("Speedup vs Fortran (x)" if has_speedup else "QGPU ns/day")
    ax.grid(True, color="#e5e8ee", linewidth=0.8)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    fig.tight_layout()
    fig.savefig(out_path, dpi=220)
    plt.close(fig)


def plot(args):
    """Render the requested plot style from a system_scaling.csv file."""
    rows = load_rows(Path(args.csv).expanduser().resolve())
    rows.sort(key=lambda row: row["atoms"] if args.sort == "atoms" else row["test"])
    out_path = Path(args.out).expanduser().resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    if args.metric == "speedup":
        plot_speedup(rows, out_path, args.title)
    elif args.metric == "nsday":
        plot_nsday(rows, out_path, args.title)
    elif args.metric == "atoms":
        plot_atoms(rows, out_path, args.title)
    else:
        # Unreachable via the CLI (argparse restricts choices); defensive only.
        raise SystemExit(f"Unknown metric: {args.metric}")

    print(f"Plot written to: {out_path}")
    return 0


def positive_int(value):
    """argparse type: parse *value* as an int and require it to be >= 1."""
    parsed = int(value)
    if parsed < 1:
        raise argparse.ArgumentTypeError("must be >= 1")
    return parsed


def parse_args():
    """Build and parse the `collect` / `plot` sub-command CLI."""
    parser = argparse.ArgumentParser(description="Collect and plot QGPU scaling across molecular systems.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    collect_parser = subparsers.add_parser("collect", help="Run Fortran/QGPU benchmark for multiple tests.")
    collect_parser.add_argument("--test", nargs="+", required=True, help="runTEST.py test names.")
    collect_parser.add_argument("--steps", type=positive_int, required=True, help="MD steps.")
    collect_parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
    collect_parser.add_argument("--shake", action="store_true", help="Enable shake.")
    collect_parser.add_argument("--repeat", type=positive_int, default=1, help="Repeats per runner per system.")
    collect_parser.add_argument(
        "--concurrency",
        type=positive_int,
        nargs="+",
        help="Concurrent QGPU instance counts to sweep; summary uses the maximum total ns/day.",
    )
    collect_parser.add_argument(
        "--gpu-only",
        action="store_true",
        help="Skip timed Fortran qdyn runs and collect only QGPU performance.",
    )
    collect_parser.add_argument(
        "--keep-run-data",
        action="store_true",
        help="Keep per-test run directories and logs. By default successful intermediate data is deleted.",
    )
    collect_parser.add_argument("--out", help="Output directory.")
    collect_parser.add_argument(
        "--fortran-bin",
        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"),
        help="Path to production Fortran qdyn binary.",
    )
    collect_parser.add_argument(
        "--prep-fortran-bin",
        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
        help="Path to qdyn_test used only to prepare QGPU restart CSVs.",
    )
    collect_parser.add_argument("--qgpu-bin", help="Path to QGPU qdyn binary.")

    plot_parser = subparsers.add_parser("plot", help="Plot system scaling from system_scaling.csv.")
    plot_parser.add_argument("csv", help="system_scaling.csv from collect.")
    plot_parser.add_argument("--out", required=True, help="Output PNG path.")
    plot_parser.add_argument(
        "--metric",
        choices=["speedup", "nsday", "atoms"],
        default="speedup",
        help="Plot style.",
    )
    plot_parser.add_argument("--sort", choices=["atoms", "test"], default="atoms", help="System order.")
    plot_parser.add_argument("--title", default="Performance Across Molecular Systems", help="Plot title.")
    return parser.parse_args()


def main():
    """Dispatch to the selected sub-command and return its exit status."""
    args = parse_args()
    if args.command == "collect":
        return collect(args)
    if args.command == "plot":
        return plot(args)
    raise
SystemExit(f"Unknown command: {args.command}") + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (FileNotFoundError, RuntimeError, ValueError) as exc: + print(f"ERROR: {exc}", file=sys.stderr) + raise SystemExit(1) diff --git a/benchmark-qgpu/benchmark_test.py b/benchmark-qgpu/benchmark_test.py new file mode 100644 index 00000000..fcad0788 --- /dev/null +++ b/benchmark-qgpu/benchmark_test.py @@ -0,0 +1,699 @@ +#!/usr/bin/env python3 + +import argparse +import csv +import json +import os +import shlex +import shutil +import subprocess +import sys +import time +from contextlib import contextmanager +from datetime import datetime +from pathlib import Path +from statistics import median + +os.environ.setdefault("MPLCONFIGDIR", "/tmp/qgpu-benchmark-matplotlib") + +import matplotlib + +matplotlib.use("Agg") +from matplotlib import pyplot as plt + + +ROOT = Path(__file__).resolve().parents[1] +TIME_STEP_NS = 2e-6 + +sys.path.insert(0, str(ROOT / "test")) +sys.path.insert(0, str(ROOT / "src" / "qligfep-newbin-unfinished")) + +import runTEST # noqa: E402 +import qdyn as qdyn_prepare # noqa: E402 + + +@contextmanager +def pushd(path): + previous = Path.cwd() + os.chdir(path) + try: + yield + finally: + os.chdir(previous) + + +def abs_path(path): + if path is None: + return None + return str(Path(path).expanduser().resolve()) + + +def command_text(args): + return " ".join(shlex.quote(str(arg)) for arg in args) + + +def split_mpirun_args(args): + if args is None: + return [] + if isinstance(args, str): + return shlex.split(args) + return [str(arg) for arg in args] + + +def build_fortran_command(fortran_bin, input_file, mpi_procs=None, mpirun_bin="mpirun", mpirun_args=None): + command = [str(fortran_bin), input_file] + if mpi_procs is None: + return command + return [str(mpirun_bin), "-np", str(mpi_procs), *split_mpirun_args(mpirun_args), *command] + + +def resolve_qgpu_bin(path): + if path: + candidate = Path(path).expanduser() + if not 
candidate.is_absolute(): + candidate = (Path.cwd() / candidate).resolve() + if not candidate.exists(): + raise FileNotFoundError(f"QGPU binary not found: {candidate}") + return candidate + + for candidate in (ROOT / "bin" / "qdyn", ROOT / "src" / "core" / "qdyn"): + if candidate.exists(): + return candidate + raise FileNotFoundError( + "QGPU binary not found. Expected bin/qdyn or src/core/qdyn, " + "or pass --qgpu-bin." + ) + + +def resolve_fortran_bin(path): + candidate = Path(path).expanduser() + if not candidate.is_absolute(): + candidate = (Path.cwd() / candidate).resolve() + if not candidate.exists(): + raise FileNotFoundError(f"Fortran binary not found: {candidate}") + return candidate + + +def resolve_test_data(test_name, steps, lambda_name, shake): + testinfo = runTEST.get_default_testinfo() + if test_name not in testinfo: + available = ", ".join(sorted(testinfo)) + raise ValueError(f"Unknown test '{test_name}'. Available tests: {available}") + + topdir = ROOT / "test" / "data" / "topology" + inputdir = ROOT / "test" / "data" / "inputs" + info = testinfo[test_name] + topfile = info[0] + if len(info) >= 3 and lambda_name is not None: + stem, suffix = topfile.rsplit(".", 1) + topfile = f"{stem}_{lambda_name}.{suffix}" + + data = { + "avg": False, + "curtest": None, + "fep_path": None, + "inputdir": str(inputdir), + "lambda": lambda_name, + "plot": False, + "restraints_path": None, + "shake": shake, + "test": test_name, + "testinfo": testinfo, + "timestep": str(steps), + "topdir": str(topdir), + "topfile": topfile, + "topology_path": str(topdir / topfile), + "verbose": False, + } + if len(info) >= 3: + data["fep_path"] = str(inputdir / info[2]) + if len(info) >= 4: + data["restraints_path"] = str(inputdir / info[3]) + + required = [Path(data["topology_path"])] + if data["fep_path"] is not None: + required.append(Path(data["fep_path"])) + if data["restraints_path"] is not None: + required.append(Path(data["restraints_path"])) + missing = [str(path) for path in 
# NOTE(review): the chunk boundary cut the preceding loader helper mid-body; its
# trailing fragment is preserved verbatim below as a comment rather than guessed at.
#   required if not path.exists()]
#   if missing:
#       raise FileNotFoundError("Required input file(s) not found: " + ", ".join(missing))
#   return data


def run_timed(args, cwd, stdout_path, stderr_path):
    """Run *args* in *cwd*, redirecting stdout/stderr to files.

    Returns a ``(return_code, wall_seconds)`` tuple measured with a
    monotonic clock around the whole subprocess.
    """
    start = time.perf_counter()
    with open(stdout_path, "w", encoding="utf-8") as stdout_f, open(
        stderr_path, "w", encoding="utf-8"
    ) as stderr_f:
        completed = subprocess.run(args, cwd=cwd, stdout=stdout_f, stderr=stderr_f)
    wall_seconds = time.perf_counter() - start
    return completed.returncode, wall_seconds


def ns_per_day(steps, wall_seconds):
    """Convert a timed MD run into simulated nanoseconds per day.

    Returns None for non-positive wall times so failed runs never divide
    by zero.  TIME_STEP_NS is a module-level constant defined outside this
    chunk — presumably the MD timestep in ns; TODO confirm.
    """
    if wall_seconds <= 0:
        return None
    return steps * TIME_STEP_NS * 86400 / wall_seconds


def write_md_input(data, fortran_dir):
    """Generate eq1.inp for *data* inside *fortran_dir* via runTEST.create_MD_input."""
    data["curtest"] = str(fortran_dir)
    # create_MD_input writes relative to the CWD, hence the pushd.
    with pushd(fortran_dir):
        runTEST.create_MD_input(data)


def run_fortran_repeats(
    data,
    fortran_bin,
    fortran_dir,
    repeat,
    steps,
    mpi_procs=None,
    mpirun_bin="mpirun",
    mpirun_args=None,
):
    """Run the Fortran binary *repeat* times and collect one timing record per run.

    Returns ``(records, saw_success)`` where *saw_success* is True if at
    least one repeat exited cleanly.  Failed repeats are still recorded so
    the summary CSV shows them.
    """
    records = []
    saw_success = False

    for index in range(1, repeat + 1):
        # Single runs keep the short log names; repeats get numbered ones.
        suffix = "" if repeat == 1 else f"_{index}"
        stdout_path = fortran_dir / f"fortran{suffix}.log"
        stderr_path = fortran_dir / f"fortran{suffix}.err"
        args = build_fortran_command(
            fortran_bin,
            "eq1.inp",
            mpi_procs=mpi_procs,
            mpirun_bin=mpirun_bin,
            mpirun_args=mpirun_args,
        )
        return_code, wall_seconds = run_timed(args, fortran_dir, stdout_path, stderr_path)
        if return_code == 0:
            saw_success = True
        records.append(
            {
                "test": data["test"],
                "runner": "fortran",
                "repeat": index,
                "command": command_text(args),
                "return_code": return_code,
                "wall_seconds": wall_seconds,
                "steps": steps,
                "ns_per_day": ns_per_day(steps, wall_seconds),
                "stdout": str(stdout_path),
                "stderr": str(stderr_path),
            }
        )

    return records, saw_success


def prepare_restart_with_qdyn_test(
    data,
    prep_fortran_bin,
    fortran_dir,
    prep_steps=None,
    mpi_procs=None,
    mpirun_bin="mpirun",
    mpirun_args=None,
):
    """Run a short qdyn_test job so restart CSVs exist for the QGPU runner.

    When *prep_steps* is given, eq1.inp is temporarily rewritten with that
    step count and always restored afterwards (the finally block), so the
    timed runs see the original input.
    """
    input_path = fortran_dir / "eq1.inp"
    original_input = input_path.read_text(encoding="utf-8")
    parse_data = data
    if prep_steps is not None:
        prep_data = dict(data)
        prep_data["timestep"] = str(prep_steps)
        write_md_input(prep_data, fortran_dir)
        parse_data = prep_data

    stdout_path = fortran_dir / "restart_prep_qdyn_test.log"
    stderr_path = fortran_dir / "restart_prep_qdyn_test.err"
    args = build_fortran_command(
        prep_fortran_bin,
        "eq1.inp",
        mpi_procs=mpi_procs,
        mpirun_bin=mpirun_bin,
        mpirun_args=mpirun_args,
    )
    try:
        return_code, _ = run_timed(args, fortran_dir, stdout_path, stderr_path)
        if return_code != 0:
            raise RuntimeError(
                "QGPU restart preparation failed. "
                f"Command: {command_text(args)} Logs: stdout={stdout_path} stderr={stderr_path}"
            )

        # Parse_Q6_data expects the log under the conventional name eq1.log.
        shutil.copyfile(stdout_path, fortran_dir / "eq1.log")
        with pushd(fortran_dir):
            runTEST.Parse_Q6_data(parse_data)
    finally:
        if prep_steps is not None:
            input_path.write_text(original_input, encoding="utf-8")


def prepare_qgpu_input(data, fortran_dir, prep_dir):
    """Build the QGPU CSV input tree from the Fortran run's restart files.

    Returns the prepared data directory (``prep_dir/TEST/<top-stem>``) and
    raises if the preparation did not produce md.csv.
    """
    prep_dir.mkdir(parents=True, exist_ok=True)
    restart_dir = prep_dir / "restart"
    restart_dir.mkdir(exist_ok=True)
    for name in ("coords.csv", "velocities.csv"):
        shutil.copyfile(fortran_dir / name, restart_dir / name)

    top_stem = Path(data["topfile"]).stem
    wd_rel = f"TEST/{top_stem}"
    # qdyn_prepare resolves wd relative to the CWD, hence the pushd.
    with pushd(prep_dir):
        qdyn_prepare.Create_Environment(top=data["topology_path"], wd=wd_rel)
        qdyn_prepare.Prepare_Topology(top=data["topology_path"], wd=wd_rel)
        qdyn_prepare.Prepare_MD(top=data["topology_path"], md=str(fortran_dir / "eq1.inp"), wd=wd_rel)
        qdyn_prepare.Prepare_FEP(
            fepfile=data["fep_path"],
            wd=wd_rel,
            top=data["topology_path"],
        )
        qdyn_prepare.Read_Restart(restart=str(restart_dir), wd=wd_rel, top=data["topology_path"])

    prepared_data_dir = prep_dir / wd_rel
    if not (prepared_data_dir / "md.csv").exists():
        raise RuntimeError(f"Prepared QGPU data is missing md.csv: {prepared_data_dir}")
    return prepared_data_dir


def run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, repeat, steps):
    """Run the QGPU binary *repeat* times, each on a fresh copy of the input.

    Every repeat gets its own run directory with a private copy of the
    prepared data so runs cannot contaminate each other's output.
    """
    qgpu_runs_dir.mkdir(parents=True, exist_ok=True)
    records = []

    for index in range(1, repeat + 1):
        run_dir = qgpu_runs_dir / f"repeat_{index:03d}"
        data_dir = run_dir / prepared_data_dir.name
        if run_dir.exists():
            shutil.rmtree(run_dir)
        run_dir.mkdir(parents=True)
        shutil.copytree(prepared_data_dir, data_dir)

        stdout_path = run_dir / "qgpu.log"
        stderr_path = run_dir / "qgpu.err"
        args = [str(qgpu_bin), "--gpu", str(data_dir)]
        return_code, wall_seconds = run_timed(args, ROOT, stdout_path, stderr_path)
        records.append(
            {
                "test": data["test"],
                "runner": "qgpu",
                "repeat": index,
                "command": command_text(args),
                "return_code": return_code,
                "wall_seconds": wall_seconds,
                "steps": steps,
                "ns_per_day": ns_per_day(steps, wall_seconds),
                "stdout": str(stdout_path),
                "stderr": str(stderr_path),
            }
        )

    return records


def write_summary_csv(records, out_dir):
    """Write all timing records to ``out_dir/summary.csv`` and return its path."""
    csv_path = out_dir / "summary.csv"
    fieldnames = [
        "test",
        "runner",
        "repeat",
        "command",
        "return_code",
        "wall_seconds",
        "steps",
        "ns_per_day",
        "stdout",
        "stderr",
    ]
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)
    return csv_path


def read_summary_csv(csv_path):
    """Load summary.csv rows back into typed dicts.

    Numeric columns are converted; an empty ns_per_day cell becomes None.
    Raises RuntimeError when the file contains no data rows.
    """
    records = []
    with open(csv_path, newline="", encoding="utf-8") as csv_f:
        for row in csv.DictReader(csv_f):
            parsed = dict(row)
            parsed["repeat"] = int(parsed["repeat"])
            parsed["return_code"] = int(parsed["return_code"])
            parsed["wall_seconds"] = float(parsed["wall_seconds"])
            parsed["steps"] = int(parsed["steps"])
            raw_ns = parsed.get("ns_per_day")
            parsed["ns_per_day"] = float(raw_ns) if raw_ns else None
            records.append(parsed)
    if not records:
        raise RuntimeError(f"No records found in {csv_path}")
    return records
# NOTE(review): a dangling continuation fragment of read_summary_csv from the
# mangled chunk boundary was removed here; the function is defined in full
# earlier in the file.


def summarize(records, args, qgpu_bin, fortran_bin, prep_fortran_bin):
    """Aggregate raw timing records into a per-test median comparison.

    Tests missing a successful run for either runner are dropped.  The
    returned dict also embeds the CLI arguments and binary paths used, so
    summary.json is self-describing.
    """
    by_test = {}
    for record in records:
        by_test.setdefault(record["test"], {}).setdefault(record["runner"], []).append(record)

    tests = []
    for test_name in sorted(by_test):
        fortran_records = by_test[test_name].get("fortran", [])
        qgpu_records = by_test[test_name].get("qgpu", [])
        fortran_ok = [float(r["wall_seconds"]) for r in fortran_records if int(r["return_code"]) == 0]
        qgpu_ok = [float(r["wall_seconds"]) for r in qgpu_records if int(r["return_code"]) == 0]
        if not fortran_ok or not qgpu_ok:
            # No fair comparison possible without at least one clean run per side.
            continue
        fortran_median = median(fortran_ok)
        qgpu_median = median(qgpu_ok)
        speedup = fortran_median / qgpu_median if qgpu_median > 0 else None
        tests.append(
            {
                "test": test_name,
                "fortran_median_seconds": fortran_median,
                "qgpu_median_seconds": qgpu_median,
                "speedup_x": speedup,
                "improvement_pct": (speedup - 1) * 100 if speedup is not None else None,
                "fortran_repeats": len(fortran_records),
                "qgpu_repeats": len(qgpu_records),
            }
        )

    return {
        "created_at": datetime.now().isoformat(timespec="seconds"),
        "args": {
            "test": args.test,
            "steps": args.steps,
            "lambda": args.lambda_name,
            "shake": args.shake,
            "repeat": args.repeat,
            # getattr: these attributes are absent in plot-only Namespaces.
            "restart_prep_steps": getattr(args, "restart_prep_steps", None),
            "fortran_mpi_procs": getattr(args, "fortran_mpi_procs", None),
            "mpirun_bin": getattr(args, "mpirun_bin", None),
            "mpirun_args": getattr(args, "mpirun_args", None),
        },
        "binaries": {
            "fortran": str(fortran_bin),
            "restart_prep_fortran": str(prep_fortran_bin),
            "qgpu": str(qgpu_bin),
        },
        "tests": tests,
    }


def summarize_for_plot(records):
    """Build a summary from CSV-loaded records with a synthetic args Namespace."""
    args = argparse.Namespace(
        test=sorted({record["test"] for record in records}),
        steps=sorted({int(record["steps"]) for record in records}),
        lambda_name=None,
        shake=None,
        repeat=None,
    )
    return summarize(records, args, qgpu_bin="", fortran_bin="", prep_fortran_bin="")


def write_summary_json(summary, out_dir):
    """Dump *summary* to ``out_dir/summary.json`` and return its path."""
    json_path = out_dir / "summary.json"
    with open(json_path, "w", encoding="utf-8") as json_f:
        json.dump(summary, json_f, indent=2)
    return json_path


def plot_speedup(summary, out_dir):
    """Render the Fortran-vs-QGPU bar chart plus a "speedup" call-out panel.

    Returns the written PNG path, or None when the summary holds no
    comparable tests (callers must handle the None).
    """
    tests = summary["tests"]
    if not tests:
        return None

    fig_width = max(8.0, 2.0 + len(tests) * 1.2)
    fig, (ax, panel) = plt.subplots(
        1,
        2,
        figsize=(fig_width, 3.0),
        gridspec_kw={"width_ratios": [3.6, 1.8]},
    )

    x_positions = list(range(len(tests)))
    width = 0.34
    fortran_times = [item["fortran_median_seconds"] for item in tests]
    qgpu_times = [item["qgpu_median_seconds"] for item in tests]
    labels = [item["test"] for item in tests]

    ax.bar([x - width / 2 for x in x_positions], fortran_times, width, label="Fortran", color="#9b9b9b")
    ax.bar([x + width / 2 for x in x_positions], qgpu_times, width, label="QGPU", color="#0b71c8")
    ax.set_title("Execution Time (s)", fontsize=11, weight="bold")
    ax.set_ylabel("Time (s)")
    ax.set_xticks(x_positions)
    ax.set_xticklabels(labels, rotation=0 if len(labels) <= 3 else 30, ha="center")
    ax.legend(frameon=False, loc="upper right")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(axis="y", color="#e7e7e7", linewidth=0.8)
    ax.set_axisbelow(True)

    # Value labels above each bar.
    for x, value in zip([x - width / 2 for x in x_positions], fortran_times):
        ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold")
    for x, value in zip([x + width / 2 for x in x_positions], qgpu_times):
        ax.text(x, value, f"{value:.1f}", ha="center", va="bottom", fontsize=8, weight="bold")

    if len(tests) == 1:
        # Single-test charts get a dashed arrow from the Fortran bar to the QGPU bar.
        ymax = max(fortran_times[0], qgpu_times[0])
        ax.annotate(
            "",
            xy=(x_positions[0] + width / 2, qgpu_times[0] + ymax * 0.15),
            xytext=(x_positions[0] - width / 2, fortran_times[0] * 0.85),
            arrowprops={"arrowstyle": "->", "linestyle": "--", "color": "#0b71c8", "lw": 1.2},
        )

    best = max(tests, key=lambda item: item["speedup_x"] or 0)
    panel.set_facecolor("#eef5fd")
    for spine in panel.spines.values():
        spine.set_color("#8ab9ef")
    panel.set_xticks([])
    panel.set_yticks([])
    panel.text(0.5, 0.82, "Up to", ha="center", va="center", fontsize=13, weight="bold", color="#0b3970")
    panel.text(
        0.5,
        0.52,
        f"{best['speedup_x']:.1f}x",
        ha="center",
        va="center",
        fontsize=32,
        weight="bold",
        color="#003c7f",
    )
    panel.text(0.5, 0.28, "speedup", ha="center", va="center", fontsize=14, weight="bold", color="#0b3970")
    panel.text(0.5, 0.12, "(vs. Fortran)", ha="center", va="center", fontsize=10, color="#0b3970")

    fig.tight_layout()
    png_path = out_dir / "speedup.png"
    fig.savefig(png_path, dpi=200)
    plt.close(fig)
    return png_path


def plot_summary_csv(args):
    """Re-plot a speedup chart from an existing summary.csv (the `plot` subcommand).

    Bug fix: plot_speedup returns None when no Fortran/QGPU pair succeeded;
    the old code unlinked the target and then called ``None.rename(...)``,
    raising AttributeError instead of the intended RuntimeError.  The None
    check now happens before any file is touched.
    """
    csv_path = Path(args.csv).expanduser().resolve()
    records = read_summary_csv(csv_path)
    summary = summarize_for_plot(records)
    if args.out:
        out_path = Path(args.out).expanduser().resolve()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        png_path = plot_speedup(summary, out_path.parent)
        if png_path is not None and png_path != out_path:
            if out_path.exists():
                out_path.unlink()
            png_path.rename(out_path)
            png_path = out_path
    else:
        png_path = plot_speedup(summary, csv_path.parent)
    if png_path is None:
        raise RuntimeError("No successful Fortran/QGPU pairs found to plot.")
    print(f"Speedup plot: {png_path}")
    return 0


def default_out_dir(test_names):
    """Timestamped results directory; single-test runs are labelled by the test name."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    label = test_names[0] if len(test_names) == 1 else "multi"
    return ROOT / "benchmark-qgpu" / "results" / f"{stamp}_{label}"


def parse_args():
    """Parse CLI arguments; a leading ``plot`` token selects the plot-only parser."""
    if len(sys.argv) > 1 and sys.argv[1] == "plot":
        parser = argparse.ArgumentParser(description="Plot benchmark_test.py speedup from an existing summary.csv.")
        parser.add_argument("command", choices=["plot"])
        parser.add_argument("csv", help="summary.csv written by benchmark_test.py.")
        parser.add_argument("--out", help="Output PNG path. Defaults to speedup.png next to the CSV.")
        return parser.parse_args()

    parser = argparse.ArgumentParser(description="Benchmark Fortran vs QGPU for runTEST.py test cases.")
    parser.add_argument("--test", nargs="+", help="Test name(s) from test/runTEST.py.")
    parser.add_argument("--list-tests", action="store_true", help="List available tests and exit.")
    parser.add_argument("--steps", type=int, help="MD steps to write into eq1.inp.")
    parser.add_argument("--lambda", dest="lambda_name", default=None, help="Perturbation lambda suffix, e.g. eq5.")
    parser.add_argument("--shake", action="store_true", help="Enable shake in generated MD input.")
    parser.add_argument("--repeat", type=int, default=1, help="Number of repeats for each runner.")
    parser.add_argument("--out", default=None, help="Output directory.")
    parser.add_argument(
        "--restart-prep-steps",
        type=int,
        default=1,
        help="MD steps used only for qdyn_test restart preparation. Defaults to 1.",
    )
    parser.add_argument(
        "--fortran-bin",
        default=None,
        help="Path to production Fortran binary used for timed Fortran runs. Defaults to qdynp with MPI, otherwise qdyn.",
    )
    parser.add_argument(
        "--fortran-mpi-procs",
        type=int,
        default=None,
        help="Run the timed Fortran binary through mpirun with this many MPI ranks.",
    )
    parser.add_argument(
        "--mpirun-bin",
        default="mpirun",
        help="MPI launcher to use with --fortran-mpi-procs. Defaults to mpirun.",
    )
    parser.add_argument(
        "--mpirun-args",
        default=None,
        help='Extra MPI launcher arguments, quoted as one string, e.g. "--bind-to core".',
    )
    parser.add_argument(
        "--prep-fortran-bin",
        default=str(ROOT / "src" / "q6" / "bin" / "q6" / "qdyn_test"),
        help="Path to qdyn_test binary used only to prepare QGPU restart CSVs.",
    )
    parser.add_argument("--qgpu-bin", default=None, help="Path to QGPU qdyn binary.")
    return parser.parse_args()


def validate_args(args):
    """Reject invalid argument combinations with SystemExit; no-op for plot/list modes."""
    if getattr(args, "command", None) == "plot":
        return
    if args.list_tests:
        return
    if not args.test:
        raise SystemExit("--test is required unless --list-tests is used.")
    if args.steps is None:
        raise SystemExit("--steps is required unless --list-tests is used.")
    if args.steps < 1:
        raise SystemExit("--steps must be >= 1.")
    if args.repeat < 1:
        raise SystemExit("--repeat must be >= 1.")
    if args.restart_prep_steps < 1:
        raise SystemExit("--restart-prep-steps must be >= 1.")
    if args.fortran_mpi_procs is not None and args.fortran_mpi_procs < 1:
        raise SystemExit("--fortran-mpi-procs must be >= 1.")


def main():
    """Drive the full benchmark: Fortran runs, QGPU preparation, QGPU runs, reporting."""
    args = parse_args()
    validate_args(args)

    if getattr(args, "command", None) == "plot":
        return plot_summary_csv(args)

    testinfo = runTEST.get_default_testinfo()
    if args.list_tests:
        for test_name in sorted(testinfo):
            print(test_name)
        return 0

    qgpu_bin = resolve_qgpu_bin(args.qgpu_bin)
    default_fortran_bin = (
        ROOT / "src" / "q6" / "bin" / "q6" / "qdynp"
        if args.fortran_mpi_procs is not None
        else ROOT / "src" / "q6" / "bin" / "q6" / "qdyn"
    )
    fortran_bin = resolve_fortran_bin(args.fortran_bin or default_fortran_bin)
    prep_fortran_bin = resolve_fortran_bin(args.prep_fortran_bin)
    out_dir = Path(args.out).expanduser().resolve() if args.out else default_out_dir(args.test)
    out_dir.mkdir(parents=True, exist_ok=True)

    all_records = []
    csv_path = None
    try:
        for test_name in args.test:
            test_dir = out_dir / test_name
            fortran_dir = test_dir / "fortran"
            prep_dir = test_dir / "qgpu_prepare"
            qgpu_runs_dir = test_dir / "qgpu_runs"
            fortran_dir.mkdir(parents=True, exist_ok=True)

            data = resolve_test_data(test_name, args.steps, args.lambda_name, args.shake)
            print(f"Preparing Fortran input for {test_name} in {fortran_dir}")
            write_md_input(data, fortran_dir)

            if args.fortran_mpi_procs is None:
                print(f"Running Fortran for {test_name} ({args.repeat} repeat(s))")
            else:
                print(
                    f"Running Fortran for {test_name} with {args.fortran_mpi_procs} MPI rank(s) "
                    f"({args.repeat} repeat(s))"
                )
            fortran_records, fortran_ok = run_fortran_repeats(
                data,
                fortran_bin,
                fortran_dir,
                args.repeat,
                args.steps,
                mpi_procs=args.fortran_mpi_procs,
                mpirun_bin=args.mpirun_bin,
                mpirun_args=args.mpirun_args,
            )
            all_records.extend(fortran_records)
            if not fortran_ok:
                # No clean Fortran run — skip the QGPU half for this test.
                continue

            print(f"Preparing QGPU restart with qdyn_test for {test_name} ({args.restart_prep_steps} step(s))")
            prepare_restart_with_qdyn_test(
                data,
                prep_fortran_bin,
                fortran_dir,
                prep_steps=args.restart_prep_steps,
            )

            print(f"Preparing QGPU CSV input for {test_name}")
            prepared_data_dir = prepare_qgpu_input(data, fortran_dir, prep_dir)

            print(f"Running QGPU for {test_name} ({args.repeat} repeat(s))")
            all_records.extend(
                run_qgpu_repeats(data, qgpu_bin, prepared_data_dir, qgpu_runs_dir, args.repeat, args.steps)
            )
    finally:
        # Always persist whatever was collected, even when a run blew up.
        # (The path is reused below; the old code wrote the CSV a second time.)
        csv_path = write_summary_csv(all_records, out_dir)

    failures = [record for record in all_records if record["return_code"] != 0]
    if failures:
        first = failures[0]
        raise RuntimeError(
            f"{first['runner']} failed for {first['test']} repeat {first['repeat']}. "
            f"Logs: stdout={first['stdout']} stderr={first['stderr']}"
        )

    summary = summarize(all_records, args, qgpu_bin, fortran_bin, prep_fortran_bin)
    json_path = write_summary_json(summary, out_dir)
    png_path = plot_speedup(summary, out_dir)

    print(f"Summary CSV: {csv_path}")
    print(f"Summary JSON: {json_path}")
    if png_path is not None:
        print(f"Speedup plot: {png_path}")
    for item in summary["tests"]:
        print(
            f"{item['test']}: Fortran {item['fortran_median_seconds']:.3f}s, "
            f"QGPU {item['qgpu_median_seconds']:.3f}s, speedup {item['speedup_x']:.2f}x"
        )
    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except (FileNotFoundError, RuntimeError, ValueError) as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        raise SystemExit(1)

# ---- patch fragment below: start of the diff for benchmark-qgpu/main.py.  It runs
# ---- past the end of this chunk line and is preserved verbatim, not reconstructed.
# diff --git a/benchmark-qgpu/main.py b/benchmark-qgpu/main.py
# index f379f021..4d4b6f96 100644
# --- a/benchmark-qgpu/main.py
# +++ b/benchmark-qgpu/main.py
# @@ -1,5 +1,11 @@
#  import argparse
# -from benchmark_run import run
# +
# +
# +def positive_int(value):
# +    parsed = int(value)
# +    if parsed < 1:
# +        raise argparse.ArgumentTypeError("must be >= 1")
# +    return parsed
#
#  if __name__ == "__main__":
# @@ -7,8 +13,16 @@
#      parser.add_argument('--data_dir', type=str, help='Directory containing a single test case.')
#      parser.add_argument('--bin', type=str, help='Path to the Qdyn GPU executable.')
# -    parser.add_argument('--max_processes', type=int, help='Max number of parallel processes to run.')
# +    parser.add_argument('--max_processes', type=positive_int, help='Max number of parallel processes to run.')
# +    parser.add_argument(
# +        '--concurrency',
# +        type=positive_int,
# +        nargs='+',
# +        help='Specific parallel process counts to run, e.g.
# ---- patch fragment: tail of the benchmark-qgpu/main.py hunk begun on the previous
# ---- chunk line; preserved verbatim, not reconstructed.
#              --concurrency 1 2 3 4 5 10 15 20.',
# +    )
#      args = parser.parse_args()
# -    run(args)
# \ No newline at end of file
# +    from benchmark_run import run
# +
# +    run(args)
# ---- patch fragment: new file src/core/common/include/precision.h (C++ header,
# ---- preserved verbatim from the diff) ----
# diff --git a/src/core/common/include/precision.h b/src/core/common/include/precision.h
# new file mode 100644
# index 00000000..fc633f45
# +#pragma once
# +#ifdef QDYN_SPFP
# +using real_t = float;
# +using nonbond_work_t = float;
# +#else
# +using real_t = double;
# +using nonbond_work_t = double;
# +#endif
# +using energy_accum_t = double;
# +using force_accum_t = double;
# +using constraint_work_t = double;
# ---- patch fragment: header of the test/runTEST.py diff ----
# diff --git a/test/runTEST.py b/test/runTEST.py
# index ea8d5256..0b694155 100644
# @@ -17,6 +17,92 @@ lambdas = ['eq5', '0744_0256', '0998_0002']


def get_default_testinfo():
    """Return the built-in test registry.

    Maps test name -> [topology file, shell parameter, optional FEP file,
    optional restraints file].  Consumers index these lists positionally,
    so the order of entries within each list is part of the contract.
    (The second element is fed verbatim into the generated MD input —
    presumably a shell radius; TODO confirm.)
    """
    return {
        'p-p': ['benzene-vacuum.top', '20'],
        'q-p_benzene': ['Na-benzene-vacuum.top', '20', 'FEP_benzene.fep'],
        'q-p_Na': ['Na-benzene-vacuum.top', '20', 'FEP_Na.fep'],
        'q-p-w_benzene': ['Na-benzene-water.top', '20', 'FEP_benzene.fep'],
        'q-p-w_Na': ['Na-benzene-water.top', '20', 'FEP_Na.fep'],
        'q-q': ['benzene-vacuum.top', '20', 'FEP_benzene.fep'],
        'w-p': ['benzene-water.top', '20'],
        'w-q': ['benzene-water.top', '20', 'FEP_benzene.fep'],
        'w-w': ['water.top', '20'],
        'boundary': ['ala_wat.top', '14'],
        'polypeptide': ['ala_wat.top', '15'],
        'polypeptide25': ['ala_wat25.top', '25'],
        'q-q-large_vac': ['dualtop_vacuum.top', '22', 'dualtop.fep'],
        'cdk2': ['cdk2.top', '22', 'FEPm_cdk2.fep', 'restraints_cdk2.inp'],
        'thrombin': ['thrombin.top', '25', 'FEPm_thrombin.fep', 'restraints_thrombin.inp'],
    }


def resolve_path(path, base_dir=None):
    """Return *path* as an absolute path.

    None passes through unchanged; absolute paths are returned as-is;
    relative paths resolve against *base_dir* when given, otherwise
    against the current working directory.
    """
    if path is None:
        return None
    if os.path.isabs(path):
        return path
    if base_dir is not None:
        return os.path.abspath(os.path.join(base_dir, path))
    return os.path.abspath(path)
os.path.abspath(os.path.join(base_dir, path)) + return os.path.abspath(path) + class Create_Environment(object): """ Creates the workdirectory environment. @@ -44,7 +130,7 @@ def __init__(self,data): _inv_lambda = None # Check if a lambda has been specified - if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None: + if data.get('fep_path') is not None and data['lambda'] is not None: if not data['lambda'].startswith('eq'): str_lambda = data['lambda'].split("_")[0] str_inv_lambda = data['lambda'].split("_")[1] @@ -86,31 +172,32 @@ def __init__(self,data): non_bond 1 [files] -topology {}{} +topology {} final eq1.re """.format(data['timestep'], shake, shake, shake, data['testinfo'][data['test']][1], - data['topdir'], - data['topfile']) - if len(data['testinfo'][test]) >= 3: - filename = data['testinfo'][data['test']][2] + data['topology_path']) + if data.get('fep_path') is not None: + filename = data['fep_path'] + fep_name = os.path.basename(filename) fep_part = """fep {}{} [lambdas] -""".format(data['inputdir'], filename) +""".format("" if os.path.isabs(filename) else "", + filename) if _lambda is not None: fep_part += _lambda + " " + _inv_lambda + "\n" else: - if filename.startswith("FEPm"): + if fep_name.startswith("FEPm"): fep_part += "0.500 0.500\n" else: fep_part += "1.000 0.000\n" md_content = md_content + fep_part # Check if there are boundary conditions - if len(data['testinfo'][test]) >= 4: - filename = data['inputdir'] + '/' + data['testinfo'][data['test']][3] + if data.get('restraints_path') is not None: + filename = data['restraints_path'] with open(filename, 'r') as f: restraint_part = f.read() md_content = md_content + restraint_part @@ -192,9 +279,7 @@ def __init__(self,data): outfile.write('{}\n'.format(v)) # Parse the topology - Qtopology = '{}{}'.format(data['topdir'], - data['topfile']) - read_top = TOPOLOGY.Read_Topology(Qtopology) + read_top = TOPOLOGY.Read_Topology(data['topology_path']) top_data = read_top.Q() with 
open('coords.csv','w') as outfile: outfile.write('{}\n'.format(len(top_data['coords']))) @@ -210,17 +295,16 @@ def __init__(self,data): shutil.copy('coords.csv', 'tmp/coords.csv') args = [ ' {}src/bin/qdyn.py'.format(settings.ROOT), - '-t', '{}{}'.format(data['topdir'], - data['topfile']), + '-t', data['topology_path'], '-m', 'eq1.inp', '-d', 'TEST', '-r', 'tmp' ] # FEP file? - if len(data['testinfo'][data['test']]) >= 3: + if data.get('fep_path') is not None: args.append('-f') - args.append('{}{}'.format(data['inputdir'],data['testinfo'][data['test']][2])) + args.append(data['fep_path']) if data['verbose']: args.append('--verbose') @@ -330,8 +414,8 @@ def __init__(self, data): self.data = data self.data['curdir'] = os.getcwd() self.data['executable'] = sys.executable - self.data['topdir'] = '{}test/data/topology/'.format(settings.ROOT) - self.data['inputdir'] = '{}test/data/inputs/'.format(settings.ROOT) + self.data['topdir'] = os.path.join(settings.ROOT, 'test/data/topology') + self.data['inputdir'] = os.path.join(settings.ROOT, 'test/data/inputs') # Step = step + 1 self.data['timestep'] = '{}'.format(int(self.data['timestep'])+1) @@ -340,79 +424,20 @@ def __init__(self, data): if self.data['wd'][-1] != '/': self.data['wd'] = self.data['wd'] + '/' - self.data['testinfo'] = { - 'p-p' : [ - 'benzene-vacuum.top', - '20' - ], - 'q-p_benzene' : [ - 'Na-benzene-vacuum.top', - '20', - 'FEP_benzene.fep' - ], - 'q-p_Na' : [ - 'Na-benzene-vacuum.top', - '20', - 'FEP_Na.fep' - ], - 'q-p-w_benzene' : [ - 'Na-benzene-water.top', - '20', - 'FEP_benzene.fep' - ], - 'q-p-w_Na' : [ - 'Na-benzene-water.top', - '20', - 'FEP_Na.fep' - ], - 'q-q' : [ - 'benzene-vacuum.top', - '20', - 'FEP_benzene.fep' - ], - 'w-p' : [ - 'benzene-water.top', - '20' - ], - 'w-q' : [ - 'benzene-water.top', - '20', - 'FEP_benzene.fep' - ], - 'w-w' : [ - 'water.top', - '20' - ], - 'boundary' : [ - 'ala_wat.top', - '14' - ], - 'polypeptide' : [ - 'ala_wat.top', - '15' - ], - 'polypeptide25' : [ - 
'ala_wat25.top', - '25' - ], - 'q-q-large_vac' : [ - 'dualtop_vacuum.top', - '22', - 'dualtop.fep' - ], - 'cdk2' : [ - 'cdk2.top', - '22', - 'FEPm_cdk2.fep', - 'restraints_cdk2.inp' - ], - 'thrombin' : [ - 'thrombin.top', - '25', - 'FEPm_thrombin.fep', - 'restraints_thrombin.inp' - ], - } + self.data['testinfo'] = get_default_testinfo() + + if self.data['custom_top'] is not None: + custom_info = [ + os.path.basename(self.data['custom_top']), + self.data['custom_shell_radius'] + ] + if self.data['custom_fep'] is not None: + custom_info.append(os.path.basename(self.data['custom_fep'])) + if self.data['custom_restraints'] is not None: + while len(custom_info) < 3: + custom_info.append(None) + custom_info.append(os.path.basename(self.data['custom_restraints'])) + self.data['testinfo'][self.data['custom_name']] = custom_info tests = data['testinfo'].keys() if self.data['run'] is not None: @@ -422,9 +447,21 @@ def __init__(self, data): self.data['test'] = test self.data['curtest'] = self.data['wd'] + test _topfile = data['testinfo'][data['test']][0] - if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None: + if len(data['testinfo'][test]) >= 3 and data['lambda'] is not None and test != self.data['custom_name']: _topfile = _topfile.split(".")[0] + "_" + data['lambda'] + "." 
+ _topfile.split(".")[1] self.data['topfile'] = _topfile + if test == self.data['custom_name'] and self.data['custom_top'] is not None: + self.data['topology_path'] = self.data['custom_top'] + self.data['fep_path'] = self.data['custom_fep'] + self.data['restraints_path'] = self.data['custom_restraints'] + else: + self.data['topology_path'] = os.path.join(self.data['topdir'], self.data['topfile']) + self.data['fep_path'] = None + self.data['restraints_path'] = None + if len(data['testinfo'][test]) >= 3: + self.data['fep_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][2]) + if len(data['testinfo'][test]) >= 4: + self.data['restraints_path'] = os.path.join(self.data['inputdir'], data['testinfo'][test][3]) # INIT Create_Environment(self.data) @@ -515,6 +552,36 @@ def __init__(self, data): required = False, help = "Specify a particular phase of the perturbation") + parser.add_argument('--custom-top', + dest = "custom_top", + default = None, + required = False, + help = "Path to a custom topology file to add as a test") + + parser.add_argument('--custom-shell-radius', + dest = "custom_shell_radius", + default = '25', + required = False, + help = "Shell radius to use with --custom-top") + + parser.add_argument('--custom-fep', + dest = "custom_fep", + default = None, + required = False, + help = "Optional FEP file for --custom-top") + + parser.add_argument('--custom-restraints', + dest = "custom_restraints", + default = None, + required = False, + help = "Optional restraints file for --custom-top") + + parser.add_argument('--custom-name', + dest = "custom_name", + default = 'custom', + required = False, + help = "Test name to use with --custom-top") + parser.add_argument('--tolerance', dest = "tolerance", type = float, @@ -523,5 +590,9 @@ def __init__(self, data): help = "Energy comparison tolerance (default: 0.0 = exact match)") args = parser.parse_args() - - START = Init(vars(args)) + data = vars(args) + data['custom_top'] = 
resolve_path(data['custom_top']) + data['custom_fep'] = resolve_path(data['custom_fep']) + data['custom_restraints'] = resolve_path(data['custom_restraints']) + + START = Init(data)