From 871f60a22054f31f206f687fbb61a0d6be233cef Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Fri, 10 Apr 2026 08:35:50 -0700 Subject: [PATCH 1/6] primal crush for papilo --- .../diversity/diversity_manager.cu | 37 +++++++-- .../presolve/third_party_presolve.cpp | 83 +++++++++++++++++++ .../presolve/third_party_presolve.hpp | 3 + cpp/src/mip_heuristics/solve.cu | 27 +++++- cpp/tests/mip/incumbent_callback_test.cu | 57 +++++++++++++ 5 files changed, 201 insertions(+), 6 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index b8dc3d33bf..6226d9ec0f 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -9,6 +9,7 @@ #include "diversity_manager.cuh" #include +#include #include #include @@ -182,9 +183,32 @@ void diversity_manager_t::add_user_given_solutions( std::vector>& initial_sol_vector) { raft::common::nvtx::range fun_scope("add_user_given_solutions"); - for (const auto& init_sol : context.settings.initial_solutions) { + const bool has_papilo = problem_ptr->has_papilo_presolve_data(); + const i_t papilo_orig_n = problem_ptr->get_papilo_original_num_variables(); + for (size_t sol_idx = 0; sol_idx < context.settings.initial_solutions.size(); ++sol_idx) { + const auto& init_sol = context.settings.initial_solutions[sol_idx]; solution_t sol(*problem_ptr); rmm::device_uvector init_sol_assignment(*init_sol, sol.handle_ptr->get_stream()); + + if (has_papilo && (i_t)init_sol_assignment.size() == papilo_orig_n) { + std::vector h_original(init_sol_assignment.size()); + raft::copy(h_original.data(), + init_sol_assignment.data(), + h_original.size(), + sol.handle_ptr->get_stream()); + std::vector h_crushed; + problem_ptr->presolve_data.papilo_presolve_ptr->crush_primal_solution(h_original, h_crushed); + init_sol_assignment.resize(h_crushed.size(), sol.handle_ptr->get_stream()); + 
raft::copy(init_sol_assignment.data(), + h_crushed.data(), + h_crushed.size(), + sol.handle_ptr->get_stream()); + CUOPT_LOG_DEBUG("Crushed initial solution %d through Papilo (%d -> %d vars)", + sol_idx, + papilo_orig_n, + h_crushed.size()); + } + if (problem_ptr->pre_process_assignment(init_sol_assignment)) { relaxed_lp_settings_t lp_settings; lp_settings.time_limit = std::min(60., timer.remaining_time() / 2); @@ -202,10 +226,10 @@ void diversity_manager_t::add_user_given_solutions( sol.handle_ptr->get_stream()); bool is_feasible = sol.compute_feasibility(); cuopt_func_call(sol.test_variable_bounds(true)); - CUOPT_LOG_INFO("Adding initial solution success! feas %d objective %f excess %f", - is_feasible, - sol.get_user_objective(), - sol.get_total_excess()); + CUOPT_LOG_DEBUG("Adding initial solution success! feas %d objective %f excess %f", + is_feasible, + sol.get_user_objective(), + sol.get_total_excess()); population.run_solution_callbacks(sol); initial_sol_vector.emplace_back(std::move(sol)); } else { @@ -418,6 +442,9 @@ solution_t diversity_manager_t::run_solver() CUOPT_LOG_INFO("GPU heuristics disabled via CUOPT_DISABLE_GPU_HEURISTICS=1"); population.initialize_population(); population.allocate_solutions(); + if (check_b_b_preemption()) { return population.best_feasible(); } + add_user_given_solutions(initial_sol_vector); + population.add_solutions_from_vec(std::move(initial_sol_vector)); while (!check_b_b_preemption()) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index d94cf5aa67..3aad13382d 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -876,6 +876,89 @@ void third_party_presolve_t::uncrush_primal_solution( full_primal = std::move(full_sol.primal); } +/** + * Crush an original-space primal solution into the presolved 
(reduced) space. + * + * This is the forward version of Papilo's Postsolve::undo(). It replays + * each presolve reduction in forward order to transform variable values, + * then projects onto the surviving variables via origcol_mapping. + * + * The crushing process is remarkably straightforward since we only need to support primal crushing. + * We're essentially just projecting + */ +template +static void crush_primal_impl(const papilo::PostsolveStorage& storage, + const std::vector& original_primal, + std::vector& reduced_primal) +{ + const auto& types = storage.types; + const auto& indices = storage.indices; + const auto& values = storage.values; + const auto& start = storage.start; + + const int n_cols_original = (int)storage.nColsOriginal; + cuopt_assert((int)original_primal.size() == n_cols_original, + "original_primal size must match nColsOriginal"); + + std::vector assignment(original_primal.begin(), original_primal.end()); + + // Go through the postsolve reduction stack in forward order + for (int i = 0; i < (int)types.size(); ++i) { + int first = start[i]; + + switch (types[i]) { + // The one tricky reduction - Parallel column merging + case ReductionType::kParallelCol: { + // Storage layout: [orig_col1, flags1, orig_col2, flags2, -1] + // [col1lb, col1ub, col2lb, col2ub, col2scale] + int col1 = indices[first]; + int col2 = indices[first + 2]; + const f_t& scale = values[first + 4]; + assignment[col2] += scale * assignment[col1]; + break; + } + + // All other reductions are either no-ops for primal crushing + // or simply metadata + case ReductionType::kFixedCol: // Handled via projection + case ReductionType::kSubstitutedCol: // Col is dropped + case ReductionType::kSubstitutedColWithDual: // Col is dropped + case ReductionType::kFixedInfCol: // Col is dropped + case ReductionType::kVarBoundChange: // Noop + case ReductionType::kRedundantRow: // Noop + case ReductionType::kRowBoundChange: // Noop + case ReductionType::kRowBoundChangeForcedByRow: // Noop 
+ case ReductionType::kReasonForRowBoundChangeForcedByRow: // Noop + case ReductionType::kSaveRow: // Noop + case ReductionType::kReducedBoundsCost: // Noop + case ReductionType::kColumnDualValue: // Dual only + case ReductionType::kRowDualValue: // Dual only + case ReductionType::kCoefficientChange: + break; // Noop + // no default: case to let the compiler scream at us if a new reduction is later introduced + } + } + + // Project the surviving variables into the presolved-space via + // the mapping provided by Papilo + const auto& col_map = storage.origcol_mapping; + reduced_primal.resize(col_map.size()); + for (size_t k = 0; k < col_map.size(); ++k) { + reduced_primal[k] = assignment[col_map[k]]; + } +} + +template +void third_party_presolve_t::crush_primal_solution( + const std::vector& original_primal, std::vector& reduced_primal) const +{ + cuopt_expects(presolver_ == cuopt::linear_programming::presolver_t::Papilo, + error_type_t::RuntimeError, + "Primal crushing is only supported for PaPILO presolve"); + cuopt_assert(papilo_post_solve_storage_ != nullptr, "No postsolve storage available"); + crush_primal_impl(*papilo_post_solve_storage_, original_primal, reduced_primal); +} + template third_party_presolve_t::~third_party_presolve_t() { diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp index a067f604e7..6f0d51e2f0 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp @@ -77,6 +77,9 @@ class third_party_presolve_t { void uncrush_primal_solution(const std::vector& reduced_primal, std::vector& full_primal) const; + + void crush_primal_solution(const std::vector& original_primal, + std::vector& reduced_primal) const; const std::vector& get_reduced_to_original_map() const { return reduced_to_original_map_; } const std::vector& get_original_to_reduced_map() const { return original_to_reduced_map_; } diff --git 
a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index be01516657..07948747d4 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -315,7 +315,6 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, op_problem, settings.get_tolerances(), settings.determinism_mode == CUOPT_MODE_DETERMINISTIC); auto run_presolve = settings.presolver != presolver_t::None; - run_presolve = run_presolve && settings.initial_solutions.size() == 0; bool has_set_solution_callback = false; for (auto callback : settings.get_mip_callbacks()) { if (callback != nullptr && @@ -337,11 +336,23 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, std::unique_ptr> early_cpufj; std::unique_ptr> early_gpufj; + struct early_incumbent_entry_t { + f_t user_obj; + std::vector assignment; + }; + std::vector early_incumbent_pool; + // Track best incumbent found during presolve (shared across CPU and GPU FJ). // early_best_objective is in the original problem's solver-space (always minimization), // used for fast comparison in the callback. // early_best_user_obj is the corresponding user-space objective, // passed to run_mip for correct cross-space conversion. + // We attempt to crush early-heuristics solutions into the presolved space. + // This may fail if some dual reductions cut off parts of the polytope that the + // solutions lie in. We thus may hit scenarios where an excellent incumbent is found + // but is dropped due to these dual reductions, and we lose a good solution. + // This is why we still keep the solution around in OG-space + // and later extract it at the end of the solve. 
std::atomic early_best_objective{std::numeric_limits::infinity()}; f_t early_best_user_obj{std::numeric_limits::infinity()}; std::vector early_best_user_assignment; @@ -355,6 +366,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, auto early_fj_callback = [&early_best_objective, &early_best_user_obj, &early_best_user_assignment, + &early_incumbent_pool, &early_callback_mutex, &early_fj_start, mip_callbacks = settings.get_mip_callbacks(), @@ -367,6 +379,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, early_best_objective.store(solver_obj); early_best_user_obj = user_obj; early_best_user_assignment = assignment; + early_incumbent_pool.push_back({user_obj, assignment}); double elapsed = std::chrono::duration(std::chrono::steady_clock::now() - early_fj_start).count(); CUOPT_LOG_INFO("New solution from early primal heuristics (%s). Objective %+.6e. Time %.2f", @@ -461,6 +474,18 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, early_cpufj.reset(); } + // Add early-heuristic incumbents (original-space) to initial_solutions. + // PaPILO crushing + validation happens downstream in add_user_given_solutions(). 
+ if (!early_incumbent_pool.empty()) { + auto stream = op_problem.get_handle_ptr()->get_stream(); + for (const auto& inc : early_incumbent_pool) { + auto d = std::make_shared>(device_copy(inc.assignment, stream)); + settings.initial_solutions.emplace_back(std::move(d)); + } + CUOPT_LOG_DEBUG("Added %zu early-heuristic incumbents to initial solutions", + early_incumbent_pool.size()); + } + if (settings.user_problem_file != "") { CUOPT_LOG_INFO("Writing user problem to file: %s", settings.user_problem_file.c_str()); op_problem.write_to_mps(settings.user_problem_file); diff --git a/cpp/tests/mip/incumbent_callback_test.cu b/cpp/tests/mip/incumbent_callback_test.cu index 92ce2dd69c..6b36298ae7 100644 --- a/cpp/tests/mip/incumbent_callback_test.cu +++ b/cpp/tests/mip/incumbent_callback_test.cu @@ -194,4 +194,61 @@ TEST(mip_solve, early_heuristic_incumbent_fallback) if (!callback_solutions.empty()) { check_solutions(get_cb, mps_problem, settings); } } +// Verify that a user-provided MIP start in original space is correctly crushed +// through PaPILO presolve and accepted into the heuristic population. 
+TEST(mip_solve, initial_solution_survives_papilo_crush) +{ + setenv("CUOPT_DISABLE_GPU_HEURISTICS", "1", 1); + + const raft::handle_t handle_{}; + auto path = make_path_absolute("mip/pk1.mps"); + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + auto stream = op_problem.get_handle_ptr()->get_stream(); + + // Step 1: solve to get a reference feasible solution + auto settings1 = mip_solver_settings_t{}; + settings1.time_limit = 30.; + settings1.presolver = presolver_t::Papilo; + auto result1 = solve_mip(op_problem, settings1); + auto status1 = result1.get_termination_status(); + ASSERT_TRUE(status1 == mip_termination_status_t::FeasibleFound || + status1 == mip_termination_status_t::Optimal) + << "Reference solve must find a feasible solution"; + auto reference_obj = result1.get_objective_value(); + auto reference_solution = cuopt::host_copy(result1.get_solution(), stream); + ASSERT_EQ((int)reference_solution.size(), op_problem.get_n_variables()); + + // Step 2: feed the reference solution as a MIP start with presolve ON + // and GPU heuristics disabled. B&B runs with node_limit=0 so it exits + // immediately. The only way we get a good objective is if the MIP start + // was crushed through PaPILO and accepted by add_user_given_solutions. 
+ auto settings2 = mip_solver_settings_t{}; + settings2.time_limit = 10.; + settings2.presolver = presolver_t::Papilo; + settings2.node_limit = 0; + settings2.add_initial_solution(reference_solution.data(), reference_solution.size(), stream); + + int user_data = 0; + std::vector, double>> callback_solutions; + test_get_solution_callback_t get_cb(callback_solutions, op_problem.get_n_variables(), &user_data); + settings2.set_mip_callback(&get_cb, &user_data); + + auto result2 = solve_mip(op_problem, settings2); + + unsetenv("CUOPT_DISABLE_GPU_HEURISTICS"); + + auto status2 = result2.get_termination_status(); + EXPECT_TRUE(status2 == mip_termination_status_t::FeasibleFound || + status2 == mip_termination_status_t::Optimal) + << "Crushed MIP start should yield a feasible result, got " + << mip_solution_t::get_termination_status_string(status2); + EXPECT_TRUE(std::isfinite(result2.get_objective_value())); + EXPECT_NEAR(result2.get_objective_value(), reference_obj, 1e-4); + + if (!callback_solutions.empty()) { check_solutions(get_cb, mps_problem, settings2); } +} + } // namespace cuopt::linear_programming::test From b27c1f0534477dacdf7140626b39cdee440a5bc8 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Thu, 9 Apr 2026 01:25:07 -0700 Subject: [PATCH 2/6] tentative crash fix --- cpp/src/mip_heuristics/solve.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index 07948747d4..d1aed1fff4 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -8,6 +8,10 @@ #include #include +#include + +#include + #include #include #include @@ -251,6 +255,10 @@ template mip_solution_t solve_mip(optimization_problem_t& op_problem, mip_solver_settings_t const& settings_const) { + // Release the OMP thread pool on exit so that consecutive solves from different + // std::async threads (e.g. in tests) don't deadlock on stale OMP master affinity. 
+ auto omp_guard = cuopt::scope_guard([] { omp_pause_resource_all(omp_pause_hard); }); + try { mip_solver_settings_t settings(settings_const); if (settings.presolver == presolver_t::Default || settings.presolver == presolver_t::PSLP) { From bb9c91b4b1420c8b2b286da19c54054852cbafc1 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 14 Apr 2026 07:09:20 -0700 Subject: [PATCH 3/6] crush dual vectors and reduced costs --- .../presolve/third_party_presolve.cpp | 337 ++++++++++++++--- .../presolve/third_party_presolve.hpp | 10 + cpp/src/pdlp/solve.cu | 12 + .../unit_tests/presolve_test.cu | 353 ++++++++++++++++++ cpp/tests/mip/incumbent_callback_test.cu | 4 +- datasets/mip/download_miplib_test_dataset.sh | 33 ++ 6 files changed, 700 insertions(+), 49 deletions(-) diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index 3aad13382d..41b78136cb 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -42,6 +42,8 @@ #include #include +#include + #include namespace cuopt::linear_programming::detail { @@ -137,8 +139,10 @@ papilo::Problem build_papilo_problem(const optimization_problem_t } } - for (size_t i = 0; i < h_var_types.size(); ++i) { - builder.setColIntegral(i, h_var_types[i] == var_t::INTEGER); + if (category == problem_category_t::MIP) { + for (size_t i = 0; i < h_var_types.size(); ++i) { + builder.setColIntegral(i, h_var_types[i] == var_t::INTEGER); + } } if (!h_constr_lb.empty() && !h_constr_ub.empty()) { @@ -562,8 +566,6 @@ void set_presolve_methods(papilo::Presolve& presolver, presolver.addPresolveMethod(uptr(new papilo::SimpleProbing())); presolver.addPresolveMethod(uptr(new papilo::ParallelRowDetection())); presolver.addPresolveMethod(uptr(new papilo::ParallelColDetection())); - - presolver.addPresolveMethod(uptr(new papilo::SingletonStuffing())); presolver.addPresolveMethod(uptr(new 
papilo::DualFix())); presolver.addPresolveMethod(uptr(new papilo::SimplifyInequalities())); presolver.addPresolveMethod(uptr(new papilo::CliqueMerging())); @@ -574,6 +576,11 @@ void set_presolve_methods(papilo::Presolve& presolver, presolver.addPresolveMethod(uptr(new papilo::Probing())); if (!dual_postsolve) { + // SingletonStuffing causes dual crushing failures on: + // tr12-30, ns1208400, gmu-35-50, dws008-01, neos-1445765, + // neos-5107597-kakapo, rocI-4-11, traininstance2, traininstance6, + // radiationm18-12-05, rococoB10-011000, b1c1s1 + presolver.addPresolveMethod(uptr(new papilo::SingletonStuffing())); presolver.addPresolveMethod(uptr(new papilo::DualInfer())); presolver.addPresolveMethod(uptr(new papilo::SimpleSubstitution())); presolver.addPresolveMethod(uptr(new papilo::Sparsify())); @@ -789,6 +796,72 @@ void third_party_presolve_t::undo(rmm::device_uvector& primal_sol std::vector reduced_costs_vec_h(reduced_costs.size()); raft::copy(reduced_costs_vec_h.data(), reduced_costs.data(), reduced_costs.size(), stream_view); + const auto& storage = *papilo_post_solve_storage_; + CUOPT_LOG_DEBUG("Papilo undo: reduced solution sizes: primal=%zu dual=%zu reduced_costs=%zu", + primal_sol_vec_h.size(), + dual_sol_vec_h.size(), + reduced_costs_vec_h.size()); + CUOPT_LOG_DEBUG( + "Papilo undo: postsolve storage: nColsOriginal=%d nRowsOriginal=%d " + "origcol_mapping size=%zu origrow_mapping size=%zu " + "types size=%zu indices size=%zu values size=%zu start size=%zu", + (int)storage.nColsOriginal, + (int)storage.nRowsOriginal, + storage.origcol_mapping.size(), + storage.origrow_mapping.size(), + storage.types.size(), + storage.indices.size(), + storage.values.size(), + storage.start.size()); + CUOPT_LOG_DEBUG("Papilo undo: dual_postsolve=%d", (int)dual_postsolve); + + // Pre-flight validation: scan postsolve entries for out-of-bounds indices + { + int n_cols_orig = (int)storage.nColsOriginal; + int n_rows_orig = (int)storage.nRowsOriginal; + for (int i = 
(int)storage.types.size() - 1; i >= 0; --i) { + int first = storage.start[i]; + auto type = storage.types[i]; + + if (type == ReductionType::kFixedCol) { + int col = storage.indices[first]; + if (col < 0 || col >= n_cols_orig) { + CUOPT_LOG_ERROR( + "Papilo undo pre-flight: kFixedCol at step %d has col=%d " + "but nColsOriginal=%d (out of bounds!)", + i, + col, + n_cols_orig); + } + if (dual_postsolve) { + int col_length = storage.indices[first + 1]; + for (int k = 0; k < col_length; ++k) { + int row = storage.indices[first + 2 + k]; + if (row < 0 || row >= n_rows_orig) { + CUOPT_LOG_ERROR( + "Papilo undo pre-flight: kFixedCol at step %d references " + "row=%d but nRowsOriginal=%d (out of bounds!)", + i, + row, + n_rows_orig); + } + } + } + } else if (type == ReductionType::kColumnDualValue || type == ReductionType::kRowDualValue) { + int idx = storage.indices[first]; + int limit = (type == ReductionType::kColumnDualValue) ? n_cols_orig : n_rows_orig; + if (idx < 0 || idx >= limit) { + CUOPT_LOG_ERROR( + "Papilo undo pre-flight: %s at step %d has idx=%d but limit=%d", + type == ReductionType::kColumnDualValue ? 
"kColumnDualValue" : "kRowDualValue", + i, + idx, + limit); + } + } + } + } + papilo::Solution reduced_sol(primal_sol_vec_h); if (dual_postsolve) { reduced_sol.dual = dual_sol_vec_h; @@ -798,13 +871,22 @@ void third_party_presolve_t::undo(rmm::device_uvector& primal_sol papilo::Solution full_sol; papilo::Message Msg{}; - Msg.setVerbosityLevel(papilo::VerbosityLevel::kQuiet); + Msg.setVerbosityLevel(papilo::VerbosityLevel::kDetailed); papilo::Postsolve post_solver{Msg, papilo_post_solve_storage_->getNum()}; bool is_optimal = false; + CUOPT_LOG_DEBUG("Papilo undo: calling post_solver.undo with %zu reduction steps", + storage.types.size()); + auto status = post_solver.undo(reduced_sol, full_sol, *papilo_post_solve_storage_, is_optimal); check_postsolve_status(status); + if (dual_postsolve) { + std::vector row_survives(storage.nRowsOriginal, false); + for (size_t k = 0; k < storage.origrow_mapping.size(); ++k) + row_survives[storage.origrow_mapping[k]] = true; + } + primal_solution.resize(full_sol.primal.size(), stream_view); dual_solution.resize(full_sol.dual.size(), stream_view); reduced_costs.resize(full_sol.reducedCosts.size(), stream_view); @@ -876,87 +958,248 @@ void third_party_presolve_t::uncrush_primal_solution( full_primal = std::move(full_sol.primal); } +template +void third_party_presolve_t::crush_primal_solution( + const std::vector& original_primal, std::vector& reduced_primal) const +{ + cuopt_expects(presolver_ == cuopt::linear_programming::presolver_t::Papilo, + error_type_t::RuntimeError, + "Primal crushing is only supported for PaPILO presolve"); + cuopt_assert(papilo_post_solve_storage_ != nullptr, "No postsolve storage available"); + std::vector unused_y, unused_z; + std::vector empty_vals; + std::vector empty_indices, empty_offsets; + crush_primal_dual_solution(original_primal, + {}, + reduced_primal, + unused_y, + {}, + unused_z, + empty_vals, + empty_indices, + empty_offsets); +} + /** - * Crush an original-space primal solution into the 
presolved (reduced) space. + * Crush an original-space primal+dual solution into the presolved (reduced) space. * - * This is the forward version of Papilo's Postsolve::undo(). It replays - * each presolve reduction in forward order to transform variable values, - * then projects onto the surviving variables via origcol_mapping. + * This is the forward counterpart of Papilo's Postsolve::undo(). It replays + * each presolve reduction in forward order to transform variable/dual values, + * then projects onto the surviving columns/rows via origcol/origrow_mapping. * - * The crushing process is remarkably straightforward since we only need to support primal crushing. - * We're essentially just projecting + * Only two reductions actually transform survivor coordinates: + * kParallelCol — merges x[col1] into x[col2] + * kRowBoundChangeForcedByRow — conditionally transfers y[deleted_row] → y[kept_row] + * Everything else is either projection-only or metadata. */ -template -static void crush_primal_impl(const papilo::PostsolveStorage& storage, - const std::vector& original_primal, - std::vector& reduced_primal) +template +void third_party_presolve_t::crush_primal_dual_solution( + const std::vector& x_original, + const std::vector& y_original, + std::vector& x_reduced, + std::vector& y_reduced, + const std::vector& z_original, + std::vector& z_reduced, + const std::vector& A_values, + const std::vector& A_indices, + const std::vector& A_offsets) const { + cuopt_expects(presolver_ == cuopt::linear_programming::presolver_t::Papilo, + error_type_t::RuntimeError, + "Crushing is only supported for PaPILO presolve"); + cuopt_assert(papilo_post_solve_storage_ != nullptr, "No postsolve storage available"); + + const auto& storage = *papilo_post_solve_storage_; const auto& types = storage.types; const auto& indices = storage.indices; const auto& values = storage.values; const auto& start = storage.start; + const auto& num = storage.num; - const int n_cols_original = 
(int)storage.nColsOriginal; - cuopt_assert((int)original_primal.size() == n_cols_original, - "original_primal size must match nColsOriginal"); + cuopt_assert((int)x_original.size() == (int)storage.nColsOriginal, ""); - std::vector assignment(original_primal.begin(), original_primal.end()); + const bool crush_dual = !y_original.empty(); + if (crush_dual) { cuopt_assert((int)y_original.size() == (int)storage.nRowsOriginal, ""); } - // Go through the postsolve reduction stack in forward order + const bool crush_rc = !z_original.empty(); + if (crush_rc) { cuopt_assert((int)z_original.size() == (int)storage.nColsOriginal, ""); } + + std::vector x(x_original.begin(), x_original.end()); + std::vector y(y_original.begin(), y_original.end()); + std::vector z(z_original.begin(), z_original.end()); + + // Track current coefficient values for entries modified by kCoefficientChange, + // so repeated changes to the same (row, col) are handled correctly. + std::map, f_t> coeff_current; + + // Look up the current value of a_{row,col}: use coeff_current if it was + // modified by a prior kCoefficientChange, otherwise fall back to the + // original CSR. 
+ auto get_coeff = [&](int row, int col) -> f_t { + auto it = coeff_current.find({row, col}); + if (it != coeff_current.end()) return it->second; + for (i_t p = A_offsets[row]; p < A_offsets[row + 1]; ++p) { + if (A_indices[p] == col) return A_values[p]; + } + return 0; + }; + + std::map reduction_type_count; for (int i = 0; i < (int)types.size(); ++i) { int first = start[i]; + reduction_type_count[(int)types[i]]++; + switch (types[i]) { - // The one tricky reduction - Parallel column merging case ReductionType::kParallelCol: { // Storage layout: [orig_col1, flags1, orig_col2, flags2, -1] // [col1lb, col1ub, col2lb, col2ub, col2scale] int col1 = indices[first]; int col2 = indices[first + 2]; const f_t& scale = values[first + 4]; - assignment[col2] += scale * assignment[col1]; + x[col2] += scale * x[col1]; + break; + } + + case ReductionType::kRowBoundChangeForcedByRow: { + if (!crush_dual) break; + cuopt_assert(i >= 1 && types[i - 1] == ReductionType::kReasonForRowBoundChangeForcedByRow, + "kRowBoundChangeForcedByRow must be preceded by its reason record"); + + bool is_lhs = indices[first] == 1; + int row = (int)values[first]; + + int reason_first = start[i - 1]; + int deleted_row = indices[reason_first + 1]; + f_t factor = values[reason_first]; + cuopt_assert(factor != 0, "parallel row factor must be nonzero"); + + // Forward rule: if the deleted row carried dual signal that the + // reverse would have attributed to the kept row, transfer it back. + f_t candidate = y[deleted_row] / factor; + bool sign_ok = is_lhs ? 
num.isGT(candidate, (f_t)0) : num.isLT(candidate, (f_t)0); + + if (sign_ok) { + f_t y_old = y[row]; + y[row] = candidate; + // Maintain z = c - A^T y: propagate the y change into reduced costs + if (crush_rc) { + f_t delta_y = candidate - y_old; + for (i_t p = A_offsets[row]; p < A_offsets[row + 1]; ++p) { + f_t a = get_coeff(row, A_indices[p]); + z[A_indices[p]] -= delta_y * a; + } + } + } + break; + } + + case ReductionType::kCoefficientChange: { + if (!crush_rc) break; + int row = indices[first]; + int col = indices[first + 1]; + f_t a_new = values[first]; + f_t a_old = get_coeff(row, col); + coeff_current[{row, col}] = a_new; + z[col] += (a_old - a_new) * y[row]; + break; + } + + case ReductionType::kSubstitutedColWithDual: { + // Singleton substitution: column j is expressed via equality row k as + // x_j = (rhs_k - Σ_{l≠j} a_kl·x_l) / a_kj + // This changes the objective for every column l in row k: + // c_red[l] = c_orig[l] - (c_j / a_kj) · a_kl + // Adjust z accordingly: Δz[l] = -(a_kl / a_kj)·z[j] - a_kl·y[k] + if (!crush_rc) break; + int row_k = indices[first]; // equality row (original space) + int row_length = (int)values[first]; + // Row coefficients start at first+3 + int row_coef_start = first + 3; + // Substituted column index is after the row coefficients + int col_j = indices[row_coef_start + row_length]; + + // Find a_kj (coefficient of col j in row k) + f_t a_kj = 0; + for (int p = 0; p < row_length; ++p) { + if (indices[row_coef_start + p] == col_j) { + a_kj = values[row_coef_start + p]; + break; + } + } + if (a_kj == 0) break; // shouldn't happen + + f_t z_j = z[col_j]; + f_t y_k = y[row_k]; + + // Adjust z for each surviving column l in the equality row (l ≠ j) + for (int p = 0; p < row_length; ++p) { + int col_l = indices[row_coef_start + p]; + if (col_l == col_j) continue; + f_t a_kl = values[row_coef_start + p]; + z[col_l] -= (a_kl / a_kj) * z_j + a_kl * y_k; + } break; } - // All other reductions are either no-ops for primal crushing - // 
or simply metadata case ReductionType::kFixedCol: // Handled via projection case ReductionType::kSubstitutedCol: // Col is dropped - case ReductionType::kSubstitutedColWithDual: // Col is dropped case ReductionType::kFixedInfCol: // Col is dropped case ReductionType::kVarBoundChange: // Noop case ReductionType::kRedundantRow: // Noop case ReductionType::kRowBoundChange: // Noop - case ReductionType::kRowBoundChangeForcedByRow: // Noop - case ReductionType::kReasonForRowBoundChangeForcedByRow: // Noop - case ReductionType::kSaveRow: // Noop + case ReductionType::kReasonForRowBoundChangeForcedByRow: // Metadata for above + case ReductionType::kSaveRow: // Metadata case ReductionType::kReducedBoundsCost: // Noop - case ReductionType::kColumnDualValue: // Dual only - case ReductionType::kRowDualValue: // Dual only - case ReductionType::kCoefficientChange: - break; // Noop - // no default: case to let the compiler scream at us if a new reduction is later introduced + case ReductionType::kColumnDualValue: // Column reduced-cost only + case ReductionType::kRowDualValue: // Handled via projection + break; + // no default: case to let the compiler yell at us if a new reduction is later introduced } } - // Project the surviving variables into the presolved-space via - // the mapping provided by Papilo + for (const auto& [type, count] : reduction_type_count) { + printf("reduction type: %d count: %d\n", type, count); + } + const auto& col_map = storage.origcol_mapping; - reduced_primal.resize(col_map.size()); + const auto& row_map = storage.origrow_mapping; + + // Cancel contributions from removed rows. The original-space z was + // computed as z = c - A^T y over ALL rows. The reduced-space stationarity + // only involves surviving rows, so we must add back the terms from removed + // rows: z[j] += y[i] * a_{i,j} for every removed row i with y[i] != 0. 
+ if (crush_rc) { + std::vector row_survives((int)storage.nRowsOriginal, false); + for (size_t k = 0; k < row_map.size(); ++k) { + row_survives[row_map[k]] = true; + } + for (int i = 0; i < (int)storage.nRowsOriginal; ++i) { + if (row_survives[i] || y[i] == 0) continue; + for (i_t p = A_offsets[i]; p < A_offsets[i + 1]; ++p) { + z[A_indices[p]] += y[i] * get_coeff(i, A_indices[p]); + } + } + } + + x_reduced.resize(col_map.size()); for (size_t k = 0; k < col_map.size(); ++k) { - reduced_primal[k] = assignment[col_map[k]]; + x_reduced[k] = x[col_map[k]]; } -} -template -void third_party_presolve_t::crush_primal_solution( - const std::vector& original_primal, std::vector& reduced_primal) const -{ - cuopt_expects(presolver_ == cuopt::linear_programming::presolver_t::Papilo, - error_type_t::RuntimeError, - "Primal crushing is only supported for PaPILO presolve"); - cuopt_assert(papilo_post_solve_storage_ != nullptr, "No postsolve storage available"); - crush_primal_impl(*papilo_post_solve_storage_, original_primal, reduced_primal); + if (crush_dual) { + y_reduced.resize(row_map.size()); + for (size_t k = 0; k < row_map.size(); ++k) { + y_reduced[k] = y[row_map[k]]; + } + } + + if (crush_rc) { + z_reduced.resize(col_map.size()); + for (size_t k = 0; k < col_map.size(); ++k) { + z_reduced[k] = z[col_map[k]]; + } + } } template diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp index 6f0d51e2f0..d0f6d02757 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.hpp @@ -80,6 +80,16 @@ class third_party_presolve_t { void crush_primal_solution(const std::vector& original_primal, std::vector& reduced_primal) const; + + void crush_primal_dual_solution(const std::vector& x_original, + const std::vector& y_original, + std::vector& x_reduced, + std::vector& y_reduced, + const std::vector& z_original, + std::vector& z_reduced, + 
const std::vector& A_values, + const std::vector& A_indices, + const std::vector& A_offsets) const; const std::vector& get_reduced_to_original_map() const { return reduced_to_original_map_; } const std::vector& get_original_to_reduced_map() const { return original_to_reduced_map_; } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 29a7f32db6..b6a9c40f6c 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1551,6 +1551,18 @@ optimization_problem_solution_t solve_lp( cuopt::device_copy(solution.get_reduced_cost(), op_problem.get_handle_ptr()->get_stream()); bool status_to_skip = false; + CUOPT_LOG_DEBUG( + "Pre-undo: original problem n_vars=%d n_cons=%d, " + "reduced solution primal=%zu dual=%zu reduced_costs=%zu, " + "solver termination=%d dual_postsolve=%d", + op_problem.get_n_variables(), + op_problem.get_n_constraints(), + primal_solution.size(), + dual_solution.size(), + reduced_costs.size(), + (int)solution.get_termination_status(), + (int)settings.dual_postsolve); + presolver->undo(primal_solution, dual_solution, reduced_costs, diff --git a/cpp/tests/linear_programming/unit_tests/presolve_test.cu b/cpp/tests/linear_programming/unit_tests/presolve_test.cu index 22fe9a39e1..525d896e27 100644 --- a/cpp/tests/linear_programming/unit_tests/presolve_test.cu +++ b/cpp/tests/linear_programming/unit_tests/presolve_test.cu @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ #include +#include #include #include #include @@ -100,6 +102,82 @@ static void check_variable_bounds(const std::vector& solution, } } +// A^T * y from CSR matrix A (m x n) +static void compute_transpose_matvec(const std::vector& values, + const std::vector& indices, + const std::vector& offsets, + const std::vector& y, + int n_cols, + std::vector& result) +{ + size_t n_rows = offsets.size() - 1; + assert(y.size() == n_rows); + result.assign(n_cols, 0.0); + for (size_t i = 0; i < n_rows; ++i) { + for (int j = offsets[i]; j < 
offsets[i + 1]; ++j) { + result[indices[j]] += values[j] * y[i]; + } + } +} + +// KKT complementary slackness on variable bounds: +// reduced cost z_j = c_j - (A^T y)_j must be >= 0 at lower bound, +// <= 0 at upper bound, and ~0 for interior variables. +static void check_reduced_cost_consistency(const std::vector& objective, + const std::vector& ATy, + const std::vector& primal, + const std::vector& var_lb, + const std::vector& var_ub, + double bound_tol, + double cs_tol) +{ + constexpr double inf = std::numeric_limits::infinity(); + assert(objective.size() == primal.size()); + assert(ATy.size() == primal.size()); + for (size_t j = 0; j < primal.size(); ++j) { + double z_j = objective[j] - ATy[j]; + bool at_lb = (var_lb[j] > -inf && std::abs(primal[j] - var_lb[j]) <= bound_tol); + bool at_ub = (var_ub[j] < inf && std::abs(primal[j] - var_ub[j]) <= bound_tol); + if (at_lb && at_ub) continue; // fixed variable — reduced cost unconstrained + + if (at_lb) { + ASSERT_GE(z_j, -cs_tol) << "reduced cost violation at variable " << j << " (at lower bound)"; + } else if (at_ub) { + ASSERT_LE(z_j, cs_tol) << "reduced cost violation at variable " << j << " (at upper bound)"; + } else { + ASSERT_NEAR(z_j, 0.0, cs_tol) << "reduced cost should be ~0 for interior variable " << j; + } + } +} + +// KKT complementary slackness on constraint bounds: +// y_i >= 0 when constraint at lower bound, <= 0 at upper bound, ~0 when inactive. 
+static void check_dual_sign_consistency(const std::vector& Ax, + const std::vector& dual, + const std::vector& con_lb, + const std::vector& con_ub, + double bound_tol, + double cs_tol) +{ + constexpr double inf = std::numeric_limits::infinity(); + assert(Ax.size() == dual.size()); + for (size_t i = 0; i < dual.size(); ++i) { + bool at_lb = (con_lb[i] > -inf && std::abs(Ax[i] - con_lb[i]) <= bound_tol); + bool at_ub = (con_ub[i] < inf && std::abs(Ax[i] - con_ub[i]) <= bound_tol); + if (at_lb && at_ub) continue; // equality constraint — dual unconstrained + + if (at_lb) { + ASSERT_GE(dual[i], -cs_tol) << "dual sign violation at constraint " << i + << " (at lower bound)"; + } else if (at_ub) { + ASSERT_LE(dual[i], cs_tol) << "dual sign violation at constraint " << i + << " (at upper bound)"; + } else { + ASSERT_NEAR(dual[i], 0.0, cs_tol) << "dual should be ~0 for non-active constraint " << i; + } + } +} + // Test PSLP presolver postsolve accuracy using afiro problem TEST(pslp_presolve, postsolve_accuracy_afiro) { @@ -383,6 +461,281 @@ TEST(pslp_presolve, postsolve_multiple_problems) } } +// Crush an optimal original-space (x, y) into reduced space and verify that +// the result satisfies the KKT conditions of the reduced problem: primal +// feasibility, dual feasibility (reduced cost definition), and complementary +// slackness on both variable and constraint bounds. 
+struct crush_test_param { + std::string mps_path; + bool use_pdlp; +}; + +class dual_crush_round_trip : public ::testing::TestWithParam {}; + +TEST_P(dual_crush_round_trip, kkt_check) +{ + const raft::handle_t handle_{}; + auto stream = handle_.get_stream(); + + constexpr double bound_tol = 1e-5; + constexpr double kkt_tol = 9e-2; + + const auto& param = GetParam(); + auto path = make_path_absolute(param.mps_path); + auto mps = cuopt::mps_parser::parse_mps(path, false); + auto op_problem = + cuopt::linear_programming::mps_data_model_to_optimization_problem(&handle_, mps); + + // Step 1: Presolve with a single presolver instance (same one used for crush later) + detail::sort_csr(op_problem); + detail::third_party_presolve_t presolver; + auto result = presolver.apply(op_problem, + problem_category_t::LP, + presolver_t::Papilo, + /*dual_postsolve=*/true, + /*abs_tol=*/1e-6, + /*rel_tol=*/1e-9, + /*time_limit=*/60.0); + ASSERT_TRUE(result.status == detail::third_party_presolve_status_t::REDUCED || + result.status == detail::third_party_presolve_status_t::UNCHANGED); + + // Step 2: Solve the reduced problem (no presolve — we already did it) + auto settings = pdlp_solver_settings_t{}; + settings.presolver = presolver_t::None; + settings.dual_postsolve = true; + settings.time_limit = 60.0; + if (param.use_pdlp) { + settings.method = cuopt::linear_programming::method_t::PDLP; + settings.tolerances.absolute_dual_tolerance = 1e-7; + settings.tolerances.relative_dual_tolerance = 0; + settings.tolerances.absolute_primal_tolerance = 1e-7; + settings.tolerances.relative_primal_tolerance = 0; + } else { + settings.method = cuopt::linear_programming::method_t::DualSimplex; + } + + auto reduced_solution = solve_lp(result.reduced_problem, settings); + ASSERT_EQ(reduced_solution.get_termination_status(), pdlp_termination_status_t::Optimal); + + // Step 3: Postsolve to get original-space (x, y, z) using the same presolver. 
+ // For PDLP, derive z = c_red - A_red^T y_red instead of using get_reduced_cost(), + // since PDLP's approximate solution may have inconsistent reduced costs. + auto primal_sol = cuopt::device_copy(reduced_solution.get_primal_solution(), stream); + auto dual_sol = cuopt::device_copy(reduced_solution.get_dual_solution(), stream); + rmm::device_uvector rc_sol(0, stream); + if (param.use_pdlp) { + auto red_A_vals = result.reduced_problem.get_constraint_matrix_values_host(); + auto red_A_indices = result.reduced_problem.get_constraint_matrix_indices_host(); + auto red_A_offsets = result.reduced_problem.get_constraint_matrix_offsets_host(); + auto red_c = result.reduced_problem.get_objective_coefficients_host(); + auto h_y_red = host_copy(dual_sol, stream); + int red_n_vars = result.reduced_problem.get_n_variables(); + + std::vector ATy_red; + compute_transpose_matvec( + red_A_vals, red_A_indices, red_A_offsets, h_y_red, red_n_vars, ATy_red); + std::vector z_red(red_n_vars); + for (int j = 0; j < red_n_vars; ++j) { + z_red[j] = red_c[j] - ATy_red[j]; + } + rc_sol = cuopt::device_copy(z_red, stream); + } else { + rc_sol = cuopt::device_copy(reduced_solution.get_reduced_cost(), stream); + } + + presolver.undo(primal_sol, dual_sol, rc_sol, problem_category_t::LP, false, true, stream); + + auto x_orig = host_copy(primal_sol, stream); + auto y_orig = host_copy(dual_sol, stream); + auto rc_orig = host_copy(rc_sol, stream); + + ASSERT_EQ((int)x_orig.size(), op_problem.get_n_variables()); + ASSERT_EQ((int)y_orig.size(), op_problem.get_n_constraints()); + + // Step 4: Sanity-check the postsolve output in original space before crushing. 
+ auto orig_A_vals = op_problem.get_constraint_matrix_values_host(); + auto orig_A_indices = op_problem.get_constraint_matrix_indices_host(); + auto orig_A_offsets = op_problem.get_constraint_matrix_offsets_host(); + auto orig_c = op_problem.get_objective_coefficients_host(); + auto orig_var_lb = op_problem.get_variable_lower_bounds_host(); + auto orig_var_ub = op_problem.get_variable_upper_bounds_host(); + auto orig_con_lb = op_problem.get_constraint_lower_bounds_host(); + auto orig_con_ub = op_problem.get_constraint_upper_bounds_host(); + { + int orig_n_vars = op_problem.get_n_variables(); + + // Stationarity: z == c - A^T y + std::vector ATy_orig; + compute_transpose_matvec( + orig_A_vals, orig_A_indices, orig_A_offsets, y_orig, orig_n_vars, ATy_orig); + for (size_t j = 0; j < rc_orig.size(); ++j) { + double z_derived = orig_c[j] - ATy_orig[j]; + EXPECT_NEAR(rc_orig[j], z_derived, kkt_tol) + << "postsolve stationarity violation at original variable " << j; + } + + // Primal feasibility + check_variable_bounds(x_orig, orig_var_lb, orig_var_ub, bound_tol); + std::vector Ax_orig; + compute_constraint_residuals(orig_A_vals, orig_A_indices, orig_A_offsets, x_orig, Ax_orig); + check_constraint_satisfaction(Ax_orig, orig_con_lb, orig_con_ub, bound_tol); + + // Complementary slackness + check_reduced_cost_consistency( + orig_c, ATy_orig, x_orig, orig_var_lb, orig_var_ub, bound_tol, kkt_tol); + check_dual_sign_consistency(Ax_orig, y_orig, orig_con_lb, orig_con_ub, bound_tol, kkt_tol); + } + + // Step 5: Crush using the SAME presolver that produced the postsolve storage + std::vector x_crushed, y_crushed, rc_crushed; + presolver.crush_primal_dual_solution(x_orig, + y_orig, + x_crushed, + y_crushed, + rc_orig, + rc_crushed, + orig_A_vals, + orig_A_indices, + orig_A_offsets); + + int n_vars = result.reduced_problem.get_n_variables(); + int n_cons = result.reduced_problem.get_n_constraints(); + ASSERT_EQ((int)x_crushed.size(), n_vars); + ASSERT_EQ((int)y_crushed.size(), 
n_cons); + ASSERT_EQ((int)rc_crushed.size(), n_vars); + + // Extract reduced problem data on host + auto A_vals = result.reduced_problem.get_constraint_matrix_values_host(); + auto A_indices = result.reduced_problem.get_constraint_matrix_indices_host(); + auto A_offsets = result.reduced_problem.get_constraint_matrix_offsets_host(); + auto c_red = result.reduced_problem.get_objective_coefficients_host(); + auto var_lb = result.reduced_problem.get_variable_lower_bounds_host(); + auto var_ub = result.reduced_problem.get_variable_upper_bounds_host(); + auto con_lb = result.reduced_problem.get_constraint_lower_bounds_host(); + auto con_ub = result.reduced_problem.get_constraint_upper_bounds_host(); + + // Primal feasibility: variable bounds + check_variable_bounds(x_crushed, var_lb, var_ub, bound_tol); + + // Primal feasibility: constraint satisfaction (l_c <= Ax <= u_c) + std::vector Ax; + compute_constraint_residuals(A_vals, A_indices, A_offsets, x_crushed, Ax); + check_constraint_satisfaction(Ax, con_lb, con_ub, bound_tol); + + // Dual feasibility: compute implied reduced costs z = c - A^T y + std::vector ATy; + compute_transpose_matvec(A_vals, A_indices, A_offsets, y_crushed, n_vars, ATy); + + // Complementary slackness on variable bounds (reduced cost signs) + check_reduced_cost_consistency(c_red, ATy, x_crushed, var_lb, var_ub, bound_tol, kkt_tol); + + // Complementary slackness on constraint bounds (dual variable signs) + check_dual_sign_consistency(Ax, y_crushed, con_lb, con_ub, bound_tol, kkt_tol); + + // Crushed reduced costs: consistency with derived z = c - A^T y + for (size_t j = 0; j < rc_crushed.size(); ++j) { + double z_derived = c_red[j] - ATy[j]; + ASSERT_NEAR(rc_crushed[j], z_derived, kkt_tol) + << "crushed reduced cost vs derived mismatch at variable " << j; + } + + // Cross-check: crushed primal should match the solver's primal (unique at optimality) + auto x_ref = host_copy(reduced_solution.get_primal_solution(), stream); + 
ASSERT_EQ(x_crushed.size(), x_ref.size()); + for (size_t i = 0; i < x_crushed.size(); ++i) { + EXPECT_NEAR(x_crushed[i], x_ref[i], kkt_tol) << "primal mismatch at reduced variable " << i; + } + + // Cross-check: crushed objective should match the solver's objective + auto obj_ref = reduced_solution.get_additional_termination_information().primal_objective; + double obj_crushed = 0.0; + for (size_t i = 0; i < x_crushed.size(); ++i) { + obj_crushed += c_red[i] * x_crushed[i]; + } + obj_crushed += result.reduced_problem.get_objective_offset(); + EXPECT_NEAR(obj_crushed, obj_ref, kkt_tol * std::max(1.0, std::abs(obj_ref))) + << "crushed objective mismatch"; + + // Note: y_crushed and rc_crushed are NOT required to match the solver's y_ref/rc_ref. + // Dual degeneracy means multiple optimal duals exist. The crush may land on a different + // one due to Papilo's postsolve performing dual pivots (kVarBoundChange). Both are valid + // as long as KKT conditions hold, which is already verified above. 
+} + +// clang-format off +INSTANTIATE_TEST_SUITE_P( + papilo_presolve, + dual_crush_round_trip, + ::testing::Values( + crush_test_param{"linear_programming/graph40-40/graph40-40.mps", true}, + //crush_test_param{"linear_programming/ex10/ex10.mps", true}, + //crush_test_param{"linear_programming/datt256_lp/datt256_lp.mps", true}, + //crush_test_param{"linear_programming/woodlands09/woodlands09.mps", false}, + //crush_test_param{"linear_programming/savsched1/savsched1.mps", false}, + crush_test_param{"linear_programming/nug08-3rd/nug08-3rd.mps", true}, + crush_test_param{"linear_programming/qap15/qap15.mps", true}, + //crush_test_param{"linear_programming/scpm1/scpm1.mps", true}, + //crush_test_param{"linear_programming/neos3/neos3.mps", true}, + crush_test_param{"linear_programming/a2864/a2864.mps", true}, + crush_test_param{"linear_programming/afiro_original.mps", false}, + crush_test_param{"mip/enlight_hard.mps", false}, + crush_test_param{"mip/enlight11.mps", false}, + crush_test_param{"mip/50v-10.mps", false}, + crush_test_param{"mip/fiball.mps", false}, + crush_test_param{"mip/gen-ip054.mps", false}, + crush_test_param{"mip/sct2.mps", false}, + crush_test_param{"mip/drayage-25-23.mps", false}, + crush_test_param{"mip/tr12-30.mps", false}, + crush_test_param{"mip/neos-3004026-krka.mps", false}, + crush_test_param{"mip/ns1208400.mps", false}, + crush_test_param{"mip/gmu-35-50.mps", true}, + crush_test_param{"mip/n2seq36q.mps", false}, + crush_test_param{"mip/seymour1.mps", false}, + //crush_test_param{"mip/thor50dday.mps", false}, + crush_test_param{"mip/neos8.mps", false}, + crush_test_param{"mip/app1-1.mps", false}, + crush_test_param{"mip/bnatt400.mps", false}, + crush_test_param{"mip/bnatt500.mps", false}, + //crush_test_param{"mip/brazil3.mps", false}, + crush_test_param{"mip/cbs-cta.mps", false}, + crush_test_param{"mip/CMS750_4.mps", false}, + crush_test_param{"mip/decomp2.mps", false}, + crush_test_param{"mip/dws008-01.mps", false}, + 
crush_test_param{"mip/germanrr.mps", false}, + crush_test_param{"mip/graph20-20-1rand.mps", false}, + crush_test_param{"mip/milo-v12-6-r2-40-1.mps", false}, + crush_test_param{"mip/neos-1445765.mps", false}, + crush_test_param{"mip/neos-1582420.mps", false}, + crush_test_param{"mip/neos-3083819-nubu.mps", false}, + crush_test_param{"mip/neos-5107597-kakapo.mps", false}, + crush_test_param{"mip/neos-5188808-nattai.mps", false}, + crush_test_param{"mip/net12.mps", false}, + crush_test_param{"mip/rocI-4-11.mps", false}, + crush_test_param{"mip/traininstance2.mps", false}, + crush_test_param{"mip/traininstance6.mps", false}, + crush_test_param{"mip/neos-787933.mps", false}, + crush_test_param{"mip/radiationm18-12-05.mps", false}, + crush_test_param{"mip/momentum1.mps", false}, + crush_test_param{"mip/rococoB10-011000.mps", false}, + crush_test_param{"mip/b1c1s1.mps", false}, + crush_test_param{"mip/nu25-pr12.mps", false}, + crush_test_param{"mip/air05.mps", false}, + crush_test_param{"mip/seymour.mps", false}, + crush_test_param{"mip/swath3.mps", false}, + crush_test_param{"mip/neos-950242.mps", false}, + crush_test_param{"mip/fastxgemm-n2r6s0t2.mps", false} + ), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param.mps_path; + std::replace(name.begin(), name.end(), '/', '_'); + std::replace(name.begin(), name.end(), '.', '_'); + std::replace(name.begin(), name.end(), '-', '_'); + if (info.param.use_pdlp) name += "_pdlp"; + return name; + } +); +// clang-format on + } // namespace cuopt::linear_programming::test CUOPT_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/mip/incumbent_callback_test.cu b/cpp/tests/mip/incumbent_callback_test.cu index 6b36298ae7..24005f395d 100644 --- a/cpp/tests/mip/incumbent_callback_test.cu +++ b/cpp/tests/mip/incumbent_callback_test.cu @@ -210,7 +210,7 @@ TEST(mip_solve, initial_solution_survives_papilo_crush) // Step 1: solve to get a reference feasible solution auto settings1 = mip_solver_settings_t{}; - 
settings1.time_limit = 30.; + settings1.time_limit = 5.; settings1.presolver = presolver_t::Papilo; auto result1 = solve_mip(op_problem, settings1); auto status1 = result1.get_termination_status(); @@ -226,7 +226,7 @@ TEST(mip_solve, initial_solution_survives_papilo_crush) // immediately. The only way we get a good objective is if the MIP start // was crushed through PaPILO and accepted by add_user_given_solutions. auto settings2 = mip_solver_settings_t{}; - settings2.time_limit = 10.; + settings2.time_limit = 5.; settings2.presolver = presolver_t::Papilo; settings2.node_limit = 0; settings2.add_initial_solution(reference_solution.data(), reference_solution.size(), stream); diff --git a/datasets/mip/download_miplib_test_dataset.sh b/datasets/mip/download_miplib_test_dataset.sh index d9cefbc32d..c4b97250dd 100755 --- a/datasets/mip/download_miplib_test_dataset.sh +++ b/datasets/mip/download_miplib_test_dataset.sh @@ -26,6 +26,39 @@ INSTANCES=( "enlight11" "supportcase22" "pk1" + # Instances with significant presolve reductions (dual crush round-trip tests) + "app1-1" + "bnatt400" + "bnatt500" + "brazil3" + "cbs-cta" + "CMS750_4" + "decomp2" + "dws008-01" + "germanrr" + "graph20-20-1rand" + "milo-v12-6-r2-40-1" + "neos-1445765" + "neos-1582420" + "neos-3083819-nubu" + "neos-5107597-kakapo" + "neos-5188808-nattai" + "net12" + "rocI-4-11" + "traininstance2" + "traininstance6" + # Medium-big instances with significant presolve reductions + "neos-787933" + "radiationm18-12-05" + "momentum1" + "rococoB10-011000" + "b1c1s1" + "nu25-pr12" + "air05" + "seymour" + "swath3" + "neos-950242" + "fastxgemm-n2r6s0t2" ) BASE_URL="https://miplib.zib.de/WebData/instances" From e7091301e82684f1c4bb5a6800a7d86fb98b44ca Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Tue, 14 Apr 2026 10:55:01 -0700 Subject: [PATCH 4/6] more tests, clenaup --- .../diversity/diversity_manager.cu | 1 + .../presolve/third_party_presolve.cpp | 86 +---------- cpp/src/pdlp/solve.cu | 12 -- 
.../unit_tests/presolve_test.cu | 139 +++++++++++++++++- 4 files changed, 140 insertions(+), 98 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index 6226d9ec0f..884ffe046e 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -196,6 +196,7 @@ void diversity_manager_t::add_user_given_solutions( init_sol_assignment.data(), h_original.size(), sol.handle_ptr->get_stream()); + sol.handle_ptr->sync_stream(); std::vector h_crushed; problem_ptr->presolve_data.papilo_presolve_ptr->crush_primal_solution(h_original, h_crushed); init_sol_assignment.resize(h_crushed.size(), sol.handle_ptr->get_stream()); diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index 41b78136cb..16caf93533 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -42,8 +42,6 @@ #include #include -#include - #include namespace cuopt::linear_programming::detail { @@ -796,72 +794,6 @@ void third_party_presolve_t::undo(rmm::device_uvector& primal_sol std::vector reduced_costs_vec_h(reduced_costs.size()); raft::copy(reduced_costs_vec_h.data(), reduced_costs.data(), reduced_costs.size(), stream_view); - const auto& storage = *papilo_post_solve_storage_; - CUOPT_LOG_DEBUG("Papilo undo: reduced solution sizes: primal=%zu dual=%zu reduced_costs=%zu", - primal_sol_vec_h.size(), - dual_sol_vec_h.size(), - reduced_costs_vec_h.size()); - CUOPT_LOG_DEBUG( - "Papilo undo: postsolve storage: nColsOriginal=%d nRowsOriginal=%d " - "origcol_mapping size=%zu origrow_mapping size=%zu " - "types size=%zu indices size=%zu values size=%zu start size=%zu", - (int)storage.nColsOriginal, - (int)storage.nRowsOriginal, - storage.origcol_mapping.size(), - storage.origrow_mapping.size(), - storage.types.size(), - 
storage.indices.size(), - storage.values.size(), - storage.start.size()); - CUOPT_LOG_DEBUG("Papilo undo: dual_postsolve=%d", (int)dual_postsolve); - - // Pre-flight validation: scan postsolve entries for out-of-bounds indices - { - int n_cols_orig = (int)storage.nColsOriginal; - int n_rows_orig = (int)storage.nRowsOriginal; - for (int i = (int)storage.types.size() - 1; i >= 0; --i) { - int first = storage.start[i]; - auto type = storage.types[i]; - - if (type == ReductionType::kFixedCol) { - int col = storage.indices[first]; - if (col < 0 || col >= n_cols_orig) { - CUOPT_LOG_ERROR( - "Papilo undo pre-flight: kFixedCol at step %d has col=%d " - "but nColsOriginal=%d (out of bounds!)", - i, - col, - n_cols_orig); - } - if (dual_postsolve) { - int col_length = storage.indices[first + 1]; - for (int k = 0; k < col_length; ++k) { - int row = storage.indices[first + 2 + k]; - if (row < 0 || row >= n_rows_orig) { - CUOPT_LOG_ERROR( - "Papilo undo pre-flight: kFixedCol at step %d references " - "row=%d but nRowsOriginal=%d (out of bounds!)", - i, - row, - n_rows_orig); - } - } - } - } else if (type == ReductionType::kColumnDualValue || type == ReductionType::kRowDualValue) { - int idx = storage.indices[first]; - int limit = (type == ReductionType::kColumnDualValue) ? n_cols_orig : n_rows_orig; - if (idx < 0 || idx >= limit) { - CUOPT_LOG_ERROR( - "Papilo undo pre-flight: %s at step %d has idx=%d but limit=%d", - type == ReductionType::kColumnDualValue ? 
"kColumnDualValue" : "kRowDualValue", - i, - idx, - limit); - } - } - } - } - papilo::Solution reduced_sol(primal_sol_vec_h); if (dual_postsolve) { reduced_sol.dual = dual_sol_vec_h; @@ -871,22 +803,13 @@ void third_party_presolve_t::undo(rmm::device_uvector& primal_sol papilo::Solution full_sol; papilo::Message Msg{}; - Msg.setVerbosityLevel(papilo::VerbosityLevel::kDetailed); + Msg.setVerbosityLevel(papilo::VerbosityLevel::kQuiet); papilo::Postsolve post_solver{Msg, papilo_post_solve_storage_->getNum()}; bool is_optimal = false; - CUOPT_LOG_DEBUG("Papilo undo: calling post_solver.undo with %zu reduction steps", - storage.types.size()); - auto status = post_solver.undo(reduced_sol, full_sol, *papilo_post_solve_storage_, is_optimal); check_postsolve_status(status); - if (dual_postsolve) { - std::vector row_survives(storage.nRowsOriginal, false); - for (size_t k = 0; k < storage.origrow_mapping.size(); ++k) - row_survives[storage.origrow_mapping[k]] = true; - } - primal_solution.resize(full_sol.primal.size(), stream_view); dual_solution.resize(full_sol.dual.size(), stream_view); reduced_costs.resize(full_sol.reducedCosts.size(), stream_view); @@ -1044,12 +967,9 @@ void third_party_presolve_t::crush_primal_dual_solution( return 0; }; - std::map reduction_type_count; for (int i = 0; i < (int)types.size(); ++i) { int first = start[i]; - reduction_type_count[(int)types[i]]++; - switch (types[i]) { case ReductionType::kParallelCol: { // Storage layout: [orig_col1, flags1, orig_col2, flags2, -1] @@ -1158,10 +1078,6 @@ void third_party_presolve_t::crush_primal_dual_solution( } } - for (const auto& [type, count] : reduction_type_count) { - printf("reduction type: %d count: %d\n", type, count); - } - const auto& col_map = storage.origcol_mapping; const auto& row_map = storage.origrow_mapping; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index b6a9c40f6c..29a7f32db6 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1551,18 +1551,6 @@ 
optimization_problem_solution_t solve_lp( cuopt::device_copy(solution.get_reduced_cost(), op_problem.get_handle_ptr()->get_stream()); bool status_to_skip = false; - CUOPT_LOG_DEBUG( - "Pre-undo: original problem n_vars=%d n_cons=%d, " - "reduced solution primal=%zu dual=%zu reduced_costs=%zu, " - "solver termination=%d dual_postsolve=%d", - op_problem.get_n_variables(), - op_problem.get_n_constraints(), - primal_solution.size(), - dual_solution.size(), - reduced_costs.size(), - (int)solution.get_termination_status(), - (int)settings.dual_postsolve); - presolver->undo(primal_solution, dual_solution, reduced_costs, diff --git a/cpp/tests/linear_programming/unit_tests/presolve_test.cu b/cpp/tests/linear_programming/unit_tests/presolve_test.cu index 525d896e27..a010d970fc 100644 --- a/cpp/tests/linear_programming/unit_tests/presolve_test.cu +++ b/cpp/tests/linear_programming/unit_tests/presolve_test.cu @@ -676,7 +676,7 @@ INSTANTIATE_TEST_SUITE_P( crush_test_param{"linear_programming/qap15/qap15.mps", true}, //crush_test_param{"linear_programming/scpm1/scpm1.mps", true}, //crush_test_param{"linear_programming/neos3/neos3.mps", true}, - crush_test_param{"linear_programming/a2864/a2864.mps", true}, + //crush_test_param{"linear_programming/a2864/a2864.mps", true}, crush_test_param{"linear_programming/afiro_original.mps", false}, crush_test_param{"mip/enlight_hard.mps", false}, crush_test_param{"mip/enlight11.mps", false}, @@ -736,6 +736,143 @@ INSTANTIATE_TEST_SUITE_P( ); // clang-format on +// Test that crushed solutions work as warmstarts in the full solving pipeline. +// Presolve → cold PDLP solve → postsolve → crush → warmstarted PDLP solve. +// The warmstarted solve should converge in fewer iterations than the cold start. 
+class crush_warmstart : public ::testing::TestWithParam {}; + +TEST_P(crush_warmstart, round_trip) +{ + const raft::handle_t handle_{}; + auto stream = handle_.get_stream(); + + auto path = make_path_absolute(GetParam()); + auto mps = cuopt::mps_parser::parse_mps(path, false); + auto op_problem = + cuopt::linear_programming::mps_data_model_to_optimization_problem(&handle_, mps); + + // Step 1: Presolve + detail::sort_csr(op_problem); + detail::third_party_presolve_t presolver; + auto result = presolver.apply(op_problem, + problem_category_t::LP, + presolver_t::Papilo, + /*dual_postsolve=*/true, + /*abs_tol=*/1e-6, + /*rel_tol=*/1e-9, + /*time_limit=*/60.0); + ASSERT_TRUE(result.status == detail::third_party_presolve_status_t::REDUCED || + result.status == detail::third_party_presolve_status_t::UNCHANGED); + + int n_red_vars = result.reduced_problem.get_n_variables(); + int n_red_cons = result.reduced_problem.get_n_constraints(); + + // Step 2: Cold PDLP solve of the reduced problem + auto settings = pdlp_solver_settings_t{}; + settings.presolver = presolver_t::None; + settings.dual_postsolve = true; + settings.method = cuopt::linear_programming::method_t::PDLP; + settings.time_limit = 60.0; + + auto cold_solution = solve_lp(result.reduced_problem, settings); + ASSERT_EQ(cold_solution.get_termination_status(), pdlp_termination_status_t::Optimal); + + auto cold_iters = cold_solution.get_additional_termination_information().number_of_steps_taken; + double cold_obj = cold_solution.get_additional_termination_information().primal_objective; + + // Step 3: Postsolve to original space (only primal + dual needed for warmstart) + auto primal_sol = cuopt::device_copy(cold_solution.get_primal_solution(), stream); + auto dual_sol = cuopt::device_copy(cold_solution.get_dual_solution(), stream); + rmm::device_uvector rc_sol(n_red_vars, stream); + thrust::fill(rmm::exec_policy(stream), rc_sol.begin(), rc_sol.end(), 0.0); + + presolver.undo(primal_sol, dual_sol, rc_sol, 
problem_category_t::LP, false, true, stream); + + auto x_orig = host_copy(primal_sol, stream); + auto y_orig = host_copy(dual_sol, stream); + + // Step 4: Crush back to reduced space (no rc needed for warmstart) + auto orig_A_vals = op_problem.get_constraint_matrix_values_host(); + auto orig_A_indices = op_problem.get_constraint_matrix_indices_host(); + auto orig_A_offsets = op_problem.get_constraint_matrix_offsets_host(); + + std::vector x_crushed, y_crushed, rc_unused; + presolver.crush_primal_dual_solution(x_orig, + y_orig, + x_crushed, + y_crushed, + {}, + rc_unused, + orig_A_vals, + orig_A_indices, + orig_A_offsets); + + ASSERT_EQ((int)x_crushed.size(), n_red_vars); + ASSERT_EQ((int)y_crushed.size(), n_red_cons); + + // Step 5: Warmstarted PDLP solve of the reduced problem + auto warm_settings = settings; + warm_settings.set_initial_primal_solution(x_crushed.data(), n_red_vars, stream); + warm_settings.set_initial_dual_solution(y_crushed.data(), n_red_cons, stream); + + auto warm_solution = solve_lp(result.reduced_problem, warm_settings); + ASSERT_EQ(warm_solution.get_termination_status(), pdlp_termination_status_t::Optimal); + + double warm_obj = warm_solution.get_additional_termination_information().primal_objective; + auto warm_iters = warm_solution.get_additional_termination_information().number_of_steps_taken; + + // Verify: same objective + EXPECT_NEAR(warm_obj, cold_obj, 1e-3 * std::max(1.0, std::abs(cold_obj))) + << "warmstarted objective should match cold solve"; + + // Verify: warmstart should take fewer iterations than cold start + EXPECT_LE(warm_iters, cold_iters) + << "warmstarted solve should not take more iterations than cold solve" + << " (cold=" << cold_iters << ", warm=" << warm_iters << ")"; +} + +// clang-format off +INSTANTIATE_TEST_SUITE_P( + papilo_presolve, + crush_warmstart, + ::testing::Values( + "mip/fiball.mps", + "mip/50v-10.mps", + "mip/drayage-25-23.mps", + "mip/neos-3004026-krka.mps", + "mip/app1-1.mps", + "mip/bnatt400.mps", + 
"mip/decomp2.mps", + "mip/graph20-20-1rand.mps", + "mip/milo-v12-6-r2-40-1.mps", + "mip/neos-1582420.mps", + "mip/neos-5188808-nattai.mps", + "mip/net12.mps", + "mip/n2seq36q.mps", + "mip/seymour1.mps", + "mip/neos8.mps", + "mip/CMS750_4.mps", + "mip/cbs-cta.mps", + "mip/swath3.mps", + "mip/air05.mps", + "mip/fastxgemm-n2r6s0t2.mps", + "mip/dws008-01.mps", + "mip/germanrr.mps", + "mip/neos-1445765.mps", + "mip/neos-3083819-nubu.mps", + "mip/neos-5107597-kakapo.mps", + "mip/rocI-4-11.mps" + ), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param; + std::replace(name.begin(), name.end(), '/', '_'); + std::replace(name.begin(), name.end(), '.', '_'); + std::replace(name.begin(), name.end(), '-', '_'); + return name; + } +); +// clang-format on + } // namespace cuopt::linear_programming::test CUOPT_TEST_PROGRAM_MAIN() From d69e8d0fa274dd34567104f2c7e6bf6139efebc8 Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 15 Apr 2026 01:21:55 -0700 Subject: [PATCH 5/6] cleanup the KKT check --- .../unit_tests/presolve_test.cu | 117 +++++++++++------- 1 file changed, 73 insertions(+), 44 deletions(-) diff --git a/cpp/tests/linear_programming/unit_tests/presolve_test.cu b/cpp/tests/linear_programming/unit_tests/presolve_test.cu index a010d970fc..b4ccfc733e 100644 --- a/cpp/tests/linear_programming/unit_tests/presolve_test.cu +++ b/cpp/tests/linear_programming/unit_tests/presolve_test.cu @@ -120,60 +120,89 @@ static void compute_transpose_matvec(const std::vector& values, } } -// KKT complementary slackness on variable bounds: -// reduced cost z_j = c_j - (A^T y)_j must be >= 0 at lower bound, -// <= 0 at upper bound, and ~0 for interior variables. -static void check_reduced_cost_consistency(const std::vector& objective, - const std::vector& ATy, +// General-form variable-bound KKT for exported solver solutions. +// +// We validate the scalar reduced cost z_j directly, with the user-space convention +// z = c - A^T y. 
+// This is the same convention exposed by both PDLP and Dual Simplex after any +// internal transformations are undone. +// +// Lower-bound multipliers correspond to positive reduced costs, upper-bound +// multipliers correspond to negative reduced costs. If a side is absent, the +// corresponding multiplier must be zero. +static void check_reduced_cost_consistency(const std::vector& reduced_cost, const std::vector& primal, const std::vector& var_lb, const std::vector& var_ub, double bound_tol, - double cs_tol) + double dual_tol) { constexpr double inf = std::numeric_limits::infinity(); - assert(objective.size() == primal.size()); - assert(ATy.size() == primal.size()); + assert(reduced_cost.size() == primal.size()); + assert(var_lb.size() == primal.size()); + assert(var_ub.size() == primal.size()); for (size_t j = 0; j < primal.size(); ++j) { - double z_j = objective[j] - ATy[j]; - bool at_lb = (var_lb[j] > -inf && std::abs(primal[j] - var_lb[j]) <= bound_tol); - bool at_ub = (var_ub[j] < inf && std::abs(primal[j] - var_ub[j]) <= bound_tol); - if (at_lb && at_ub) continue; // fixed variable — reduced cost unconstrained - - if (at_lb) { - ASSERT_GE(z_j, -cs_tol) << "reduced cost violation at variable " << j << " (at lower bound)"; - } else if (at_ub) { - ASSERT_LE(z_j, cs_tol) << "reduced cost violation at variable " << j << " (at upper bound)"; - } else { - ASSERT_NEAR(z_j, 0.0, cs_tol) << "reduced cost should be ~0 for interior variable " << j; + const bool has_lb = var_lb[j] > -inf; + const bool has_ub = var_ub[j] < inf; + const double z_j = reduced_cost[j]; + + // If a side is missing, its multiplier cannot exist. + if (!has_lb) { + ASSERT_LE(z_j, dual_tol) << "positive reduced cost requires a finite lower bound at variable " + << j; + } + if (!has_ub) { + ASSERT_GE(z_j, -dual_tol) + << "negative reduced cost requires a finite upper bound at variable " << j; + } + + // If we are strictly away from a bound, the multiplier for that side must vanish. 
+ if (has_lb && primal[j] - var_lb[j] > bound_tol) { + ASSERT_LE(z_j, dual_tol) + << "positive reduced cost requires an active lower bound at variable " << j; + } + if (has_ub && var_ub[j] - primal[j] > bound_tol) { + ASSERT_GE(z_j, -dual_tol) + << "negative reduced cost requires an active upper bound at variable " << j; } } } -// KKT complementary slackness on constraint bounds: -// y_i >= 0 when constraint at lower bound, <= 0 at upper bound, ~0 when inactive. +// General-form row-bound KKT for exported solver duals. +// +// Positive y_i corresponds to the lower row bound, negative y_i to the upper +// row bound. If a side is absent, the corresponding multiplier must be zero. static void check_dual_sign_consistency(const std::vector& Ax, const std::vector& dual, const std::vector& con_lb, const std::vector& con_ub, double bound_tol, - double cs_tol) + double dual_tol) { constexpr double inf = std::numeric_limits::infinity(); assert(Ax.size() == dual.size()); + assert(con_lb.size() == dual.size()); + assert(con_ub.size() == dual.size()); for (size_t i = 0; i < dual.size(); ++i) { - bool at_lb = (con_lb[i] > -inf && std::abs(Ax[i] - con_lb[i]) <= bound_tol); - bool at_ub = (con_ub[i] < inf && std::abs(Ax[i] - con_ub[i]) <= bound_tol); - if (at_lb && at_ub) continue; // equality constraint — dual unconstrained - - if (at_lb) { - ASSERT_GE(dual[i], -cs_tol) << "dual sign violation at constraint " << i - << " (at lower bound)"; - } else if (at_ub) { - ASSERT_LE(dual[i], cs_tol) << "dual sign violation at constraint " << i - << " (at upper bound)"; - } else { - ASSERT_NEAR(dual[i], 0.0, cs_tol) << "dual should be ~0 for non-active constraint " << i; + const bool has_lb = con_lb[i] > -inf; + const bool has_ub = con_ub[i] < inf; + + if (!has_lb) { + ASSERT_LE(dual[i], dual_tol) + << "positive row dual requires a finite lower bound at constraint " << i; + } + if (!has_ub) { + ASSERT_GE(dual[i], -dual_tol) + << "negative row dual requires a finite upper bound at constraint 
" << i; + } + + if (has_lb && Ax[i] - con_lb[i] > bound_tol) { + ASSERT_LE(dual[i], dual_tol) + << "positive row dual requires an active lower bound at constraint " << i; + } + if (has_ub && con_ub[i] - Ax[i] > bound_tol) { + ASSERT_GE(dual[i], -dual_tol) + << "negative row dual requires an active upper bound at constraint " << i; } } } @@ -461,10 +490,11 @@ TEST(pslp_presolve, postsolve_multiple_problems) } } -// Crush an optimal original-space (x, y) into reduced space and verify that -// the result satisfies the KKT conditions of the reduced problem: primal -// feasibility, dual feasibility (reduced cost definition), and complementary -// slackness on both variable and constraint bounds. +// Crush an optimal original-space (x, y, z) into reduced space and verify the +// user-space general-form KKT conditions on both the original and reduced LPs. +// This intentionally checks the exported solution convention, so the same +// conditions apply to PDLP and Dual Simplex even though Dual Simplex uses a +// different internal form. 
struct crush_test_param { std::string mps_path; bool use_pdlp; @@ -580,9 +610,8 @@ TEST_P(dual_crush_round_trip, kkt_check) compute_constraint_residuals(orig_A_vals, orig_A_indices, orig_A_offsets, x_orig, Ax_orig); check_constraint_satisfaction(Ax_orig, orig_con_lb, orig_con_ub, bound_tol); - // Complementary slackness - check_reduced_cost_consistency( - orig_c, ATy_orig, x_orig, orig_var_lb, orig_var_ub, bound_tol, kkt_tol); + // Variable- and row-bound KKT in original space + check_reduced_cost_consistency(rc_orig, x_orig, orig_var_lb, orig_var_ub, bound_tol, kkt_tol); check_dual_sign_consistency(Ax_orig, y_orig, orig_con_lb, orig_con_ub, bound_tol, kkt_tol); } @@ -626,10 +655,10 @@ TEST_P(dual_crush_round_trip, kkt_check) std::vector ATy; compute_transpose_matvec(A_vals, A_indices, A_offsets, y_crushed, n_vars, ATy); - // Complementary slackness on variable bounds (reduced cost signs) - check_reduced_cost_consistency(c_red, ATy, x_crushed, var_lb, var_ub, bound_tol, kkt_tol); + // Variable-bound KKT in reduced space + check_reduced_cost_consistency(rc_crushed, x_crushed, var_lb, var_ub, bound_tol, kkt_tol); - // Complementary slackness on constraint bounds (dual variable signs) + // Row-bound KKT in reduced space check_dual_sign_consistency(Ax, y_crushed, con_lb, con_ub, bound_tol, kkt_tol); // Crushed reduced costs: consistency with derived z = c - A^T y From d47d709eefa96060aa3c33add01a71ee45e948bb Mon Sep 17 00:00:00 2001 From: Alice Boucher Date: Wed, 15 Apr 2026 03:11:14 -0700 Subject: [PATCH 6/6] cleanup --- .../diversity/diversity_manager.cu | 17 ++--- .../presolve/third_party_presolve.cpp | 3 +- cpp/src/mip_heuristics/solve.cu | 2 +- .../unit_tests/presolve_test.cu | 75 +++++++++---------- cpp/tests/mip/incumbent_callback_test.cu | 27 +++++-- datasets/mip/download_miplib_test_dataset.sh | 2 - 6 files changed, 64 insertions(+), 62 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu 
b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index 884ffe046e..7efb19afdd 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -190,20 +190,13 @@ void diversity_manager_t::add_user_given_solutions( solution_t sol(*problem_ptr); rmm::device_uvector init_sol_assignment(*init_sol, sol.handle_ptr->get_stream()); - if (has_papilo && (i_t)init_sol_assignment.size() == papilo_orig_n) { - std::vector h_original(init_sol_assignment.size()); - raft::copy(h_original.data(), - init_sol_assignment.data(), - h_original.size(), - sol.handle_ptr->get_stream()); - sol.handle_ptr->sync_stream(); + if (has_papilo) { + cuopt_assert((i_t)init_sol_assignment.size() == papilo_orig_n, + "Initial solution size mismatch"); + std::vector h_original = host_copy(init_sol_assignment, sol.handle_ptr->get_stream()); std::vector h_crushed; problem_ptr->presolve_data.papilo_presolve_ptr->crush_primal_solution(h_original, h_crushed); - init_sol_assignment.resize(h_crushed.size(), sol.handle_ptr->get_stream()); - raft::copy(init_sol_assignment.data(), - h_crushed.data(), - h_crushed.size(), - sol.handle_ptr->get_stream()); + expand_device_copy(init_sol_assignment, h_crushed, sol.handle_ptr->get_stream()); CUOPT_LOG_DEBUG("Crushed initial solution %d through Papilo (%d -> %d vars)", sol_idx, papilo_orig_n, diff --git a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp index 16caf93533..c02a4410bb 100644 --- a/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp +++ b/cpp/src/mip_heuristics/presolve/third_party_presolve.cpp @@ -913,7 +913,6 @@ void third_party_presolve_t::crush_primal_solution( * Only two reductions actually transform survivor coordinates: * kParallelCol — merges x[col1] into x[col2] * kRowBoundChangeForcedByRow — conditionally transfers y[deleted_row] → y[kept_row] - * Everything else is either projection-only or metadata. 
*/ template void third_party_presolve_t::crush_primal_dual_solution( @@ -944,7 +943,7 @@ void third_party_presolve_t::crush_primal_dual_solution( const bool crush_dual = !y_original.empty(); if (crush_dual) { cuopt_assert((int)y_original.size() == (int)storage.nRowsOriginal, ""); } - const bool crush_rc = !z_original.empty(); + const bool crush_rc = !z_original.empty() && crush_dual; if (crush_rc) { cuopt_assert((int)z_original.size() == (int)storage.nColsOriginal, ""); } std::vector x(x_original.begin(), x_original.end()); diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index d1aed1fff4..5691f985ed 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -359,7 +359,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, // This may fail if some dual reductions cut off parts of the polytope that the // solutions lie in. We thus may hit scenarios where an excellent incumbent is found // but is dropped due to these dual reductions, and we lose a good solution. - // This is why we still keep the solution around in OG-space + // This is why we still keep the solution around in original-space // and later extract it at the end of the solve. 
std::atomic early_best_objective{std::numeric_limits::infinity()}; f_t early_best_user_obj{std::numeric_limits::infinity()}; diff --git a/cpp/tests/linear_programming/unit_tests/presolve_test.cu b/cpp/tests/linear_programming/unit_tests/presolve_test.cu index b4ccfc733e..a9477655e1 100644 --- a/cpp/tests/linear_programming/unit_tests/presolve_test.cu +++ b/cpp/tests/linear_programming/unit_tests/presolve_test.cu @@ -102,7 +102,6 @@ static void check_variable_bounds(const std::vector& solution, } } -// A^T * y from CSR matrix A (m x n) static void compute_transpose_matvec(const std::vector& values, const std::vector& indices, const std::vector& offsets, @@ -110,26 +109,28 @@ static void compute_transpose_matvec(const std::vector& values, int n_cols, std::vector& result) { + assert(!offsets.empty()); + assert(values.size() == indices.size()); + assert(n_cols >= 0); size_t n_rows = offsets.size() - 1; assert(y.size() == n_rows); + assert((size_t)offsets.back() == values.size()); result.assign(n_cols, 0.0); + std::vector kahan_compensation(n_cols, 0.0); for (size_t i = 0; i < n_rows; ++i) { for (int j = offsets[i]; j < offsets[i + 1]; ++j) { - result[indices[j]] += values[j] * y[i]; + int col = indices[j]; + assert(col >= 0 && col < n_cols); + double delta = values[j] * y[i]; + double corrected = delta - kahan_compensation[col]; + double next = result[col] + corrected; + kahan_compensation[col] = (next - result[col]) - corrected; + result[col] = next; } } } -// General-form variable-bound KKT for exported solver solutions. -// -// We validate the scalar reduced cost z_j directly, with the user-space convention -// z = c - A^T y. -// This is the same convention exposed by both PDLP and Dual Simplex after any -// internal transformations are undone. -// -// Lower-bound multipliers correspond to positive reduced costs, upper-bound -// multipliers correspond to negative reduced costs. If a side is absent, the -// corresponding multiplier must be zero. 
+// Complementary slackness checks
 static void check_reduced_cost_consistency(const std::vector& reduced_cost,
 const std::vector& primal,
 const std::vector& var_lb,
@@ -489,14 +490,14 @@ TEST(pslp_presolve, postsolve_multiple_problems)
 EXPECT_LT(rel_error, 0.01) << "Problem " << name << " objective mismatch";
 }
 }
-
-// Crush an optimal original-space (x, y, z) into reduced space and verify the
-// user-space general-form KKT conditions on both the original and reduced LPs.
-// This intentionally checks the exported solution convention, so the same
-// conditions apply to PDLP and Dual Simplex even though Dual Simplex uses a
-// different internal form.
 struct crush_test_param {
 std::string mps_path;
 bool use_pdlp;
 };
-
 class dual_crush_round_trip : public ::testing::TestWithParam {};
+// Crush an optimal original-space (x, y, z) into reduced space and verify the
+// user-space general-form KKT conditions on both the original and reduced LPs.
 TEST_P(dual_crush_round_trip, kkt_check)
 {
 const raft::handle_t handle_{};
@@ -529,7 +525,7 @@ TEST_P(dual_crush_round_trip, kkt_check)
 ASSERT_TRUE(result.status == detail::third_party_presolve_status_t::REDUCED ||
 result.status == detail::third_party_presolve_status_t::UNCHANGED);
-  // Step 2: Solve the reduced problem (no presolve — we already did it)
+  // Step 2: Solve the reduced problem (no presolve, we already did it)
 auto settings = pdlp_solver_settings_t{};
 settings.presolver = presolver_t::None;
 settings.dual_postsolve = true;
@@ -615,7 +611,7 @@ TEST_P(dual_crush_round_trip, kkt_check)
 check_dual_sign_consistency(Ax_orig, y_orig, orig_con_lb, orig_con_ub, bound_tol, kkt_tol);
 }
-  // Step 5: Crush using the SAME presolver that produced the postsolve storage
+  // Step 5: Crush using the same presolver that produced the postsolve storage
 std::vector x_crushed, y_crushed, rc_crushed;
 presolver.crush_primal_dual_solution(x_orig,
 y_orig,
ASSERT_EQ((int)y_crushed.size(), n_cons); ASSERT_EQ((int)rc_crushed.size(), n_vars); - // Extract reduced problem data on host auto A_vals = result.reduced_problem.get_constraint_matrix_values_host(); auto A_indices = result.reduced_problem.get_constraint_matrix_indices_host(); auto A_offsets = result.reduced_problem.get_constraint_matrix_offsets_host(); @@ -668,7 +663,7 @@ TEST_P(dual_crush_round_trip, kkt_check) << "crushed reduced cost vs derived mismatch at variable " << j; } - // Cross-check: crushed primal should match the solver's primal (unique at optimality) + // Cross-check: crushed primal should match the solver's primal auto x_ref = host_copy(reduced_solution.get_primal_solution(), stream); ASSERT_EQ(x_crushed.size(), x_ref.size()); for (size_t i = 0; i < x_crushed.size(); ++i) { @@ -684,11 +679,6 @@ TEST_P(dual_crush_round_trip, kkt_check) obj_crushed += result.reduced_problem.get_objective_offset(); EXPECT_NEAR(obj_crushed, obj_ref, kkt_tol * std::max(1.0, std::abs(obj_ref))) << "crushed objective mismatch"; - - // Note: y_crushed and rc_crushed are NOT required to match the solver's y_ref/rc_ref. - // Dual degeneracy means multiple optimal duals exist. The crush may land on a different - // one due to Papilo's postsolve performing dual pivots (kVarBoundChange). Both are valid - // as long as KKT conditions hold, which is already verified above. } // clang-format off @@ -809,11 +799,24 @@ TEST_P(crush_warmstart, round_trip) auto cold_iters = cold_solution.get_additional_termination_information().number_of_steps_taken; double cold_obj = cold_solution.get_additional_termination_information().primal_objective; - // Step 3: Postsolve to original space (only primal + dual needed for warmstart) + // Step 3: Postsolve to original space. 
+ // Recompute z via kahan summation auto primal_sol = cuopt::device_copy(cold_solution.get_primal_solution(), stream); auto dual_sol = cuopt::device_copy(cold_solution.get_dual_solution(), stream); - rmm::device_uvector rc_sol(n_red_vars, stream); - thrust::fill(rmm::exec_policy(stream), rc_sol.begin(), rc_sol.end(), 0.0); + + auto red_A_vals = result.reduced_problem.get_constraint_matrix_values_host(); + auto red_A_indices = result.reduced_problem.get_constraint_matrix_indices_host(); + auto red_A_offsets = result.reduced_problem.get_constraint_matrix_offsets_host(); + auto red_c = result.reduced_problem.get_objective_coefficients_host(); + auto h_y_red = host_copy(dual_sol, stream); + + std::vector ATy_red; + compute_transpose_matvec(red_A_vals, red_A_indices, red_A_offsets, h_y_red, n_red_vars, ATy_red); + std::vector z_red(n_red_vars); + for (int j = 0; j < n_red_vars; ++j) { + z_red[j] = red_c[j] - ATy_red[j]; + } + auto rc_sol = cuopt::device_copy(z_red, stream); presolver.undo(primal_sol, dual_sol, rc_sol, problem_category_t::LP, false, true, stream); @@ -850,12 +853,10 @@ TEST_P(crush_warmstart, round_trip) double warm_obj = warm_solution.get_additional_termination_information().primal_objective; auto warm_iters = warm_solution.get_additional_termination_information().number_of_steps_taken; - // Verify: same objective - EXPECT_NEAR(warm_obj, cold_obj, 1e-3 * std::max(1.0, std::abs(cold_obj))) - << "warmstarted objective should match cold solve"; + double obj_tol = 1e-3 * (1.0 + std::abs(cold_obj)); + EXPECT_NEAR(warm_obj, cold_obj, obj_tol) << "warmstarted objective should match cold solve"; - // Verify: warmstart should take fewer iterations than cold start - EXPECT_LE(warm_iters, cold_iters) + EXPECT_LT(warm_iters, cold_iters) << "warmstarted solve should not take more iterations than cold solve" << " (cold=" << cold_iters << ", warm=" << warm_iters << ")"; } @@ -873,7 +874,6 @@ INSTANTIATE_TEST_SUITE_P( "mip/bnatt400.mps", "mip/decomp2.mps", 
"mip/graph20-20-1rand.mps", - "mip/milo-v12-6-r2-40-1.mps", "mip/neos-1582420.mps", "mip/neos-5188808-nattai.mps", "mip/net12.mps", @@ -886,7 +886,6 @@ INSTANTIATE_TEST_SUITE_P( "mip/air05.mps", "mip/fastxgemm-n2r6s0t2.mps", "mip/dws008-01.mps", - "mip/germanrr.mps", "mip/neos-1445765.mps", "mip/neos-3083819-nubu.mps", "mip/neos-5107597-kakapo.mps", diff --git a/cpp/tests/mip/incumbent_callback_test.cu b/cpp/tests/mip/incumbent_callback_test.cu index 24005f395d..272e7ce9ab 100644 --- a/cpp/tests/mip/incumbent_callback_test.cu +++ b/cpp/tests/mip/incumbent_callback_test.cu @@ -34,6 +34,22 @@ namespace cuopt::linear_programming::test { +class scoped_env_restore_t { + public: + scoped_env_restore_t(const char* env_name, const char* new_value) : name_(env_name) + { + if (const char* prev = std::getenv(env_name)) { prev_value_ = prev; } + ::setenv(env_name, new_value, 1); + } + ~scoped_env_restore_t() { ::setenv(name_, prev_value_.c_str(), 1); } + scoped_env_restore_t(const scoped_env_restore_t&) = delete; + scoped_env_restore_t& operator=(const scoped_env_restore_t&) = delete; + + private: + const char* name_; + std::string prev_value_; +}; + class test_set_solution_callback_t : public cuopt::internals::set_solution_callback_t { public: test_set_solution_callback_t(std::vector, double>>& solutions_, @@ -160,7 +176,7 @@ TEST(mip_solve, incumbent_get_set_callback_test) // population stays empty. The fallback in solver.cu must use the OG-space incumbent. 
TEST(mip_solve, early_heuristic_incumbent_fallback)
 {
-  setenv("CUOPT_DISABLE_GPU_HEURISTICS", "1", 1);
+  scoped_env_restore_t disable_gpu_heuristics_env("CUOPT_DISABLE_GPU_HEURISTICS", "1");
 const raft::handle_t handle_{};
 auto path = make_path_absolute("mip/pk1.mps");
@@ -181,8 +197,6 @@ TEST(mip_solve, early_heuristic_incumbent_fallback)
 auto solution = solve_mip(op_problem, settings);
-  unsetenv("CUOPT_DISABLE_GPU_HEURISTICS");
-
 EXPECT_GE(get_cb.n_calls, 1) << "Early heuristics should have emitted at least one incumbent";
 auto status = solution.get_termination_status();
 EXPECT_TRUE(status == mip_termination_status_t::FeasibleFound ||
@@ -198,7 +212,7 @@
 // through PaPILO presolve and accepted into the heuristic population.
 TEST(mip_solve, initial_solution_survives_papilo_crush)
 {
-  setenv("CUOPT_DISABLE_GPU_HEURISTICS", "1", 1);
+  scoped_env_restore_t disable_gpu_heuristics_env("CUOPT_DISABLE_GPU_HEURISTICS", "1");
 const raft::handle_t handle_{};
 auto path = make_path_absolute("mip/pk1.mps");
@@ -208,7 +222,7 @@ TEST(mip_solve, initial_solution_survives_papilo_crush)
 auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
 auto stream = op_problem.get_handle_ptr()->get_stream();
-  // Step 1: solve to get a reference feasible solution
+  // Step 1: solve to get a reference feasible solution. pk1 is easily solved to optimality
 auto settings1 = mip_solver_settings_t{};
 settings1.time_limit = 5.;
 settings1.presolver = presolver_t::Papilo;
@@ -225,6 +239,7 @@ TEST(mip_solve, initial_solution_survives_papilo_crush)
 // and GPU heuristics disabled. B&B runs with node_limit=0 so it exits
 // immediately. The only way we get a good objective is if the MIP start
 // was crushed through PaPILO and accepted by add_user_given_solutions.
+  // Early FJ is not strong enough to find the optimal objective of 11 in the given time frame.
auto settings2 = mip_solver_settings_t{}; settings2.time_limit = 5.; settings2.presolver = presolver_t::Papilo; @@ -238,8 +253,6 @@ TEST(mip_solve, initial_solution_survives_papilo_crush) auto result2 = solve_mip(op_problem, settings2); - unsetenv("CUOPT_DISABLE_GPU_HEURISTICS"); - auto status2 = result2.get_termination_status(); EXPECT_TRUE(status2 == mip_termination_status_t::FeasibleFound || status2 == mip_termination_status_t::Optimal) diff --git a/datasets/mip/download_miplib_test_dataset.sh b/datasets/mip/download_miplib_test_dataset.sh index c4b97250dd..0c54f1c847 100755 --- a/datasets/mip/download_miplib_test_dataset.sh +++ b/datasets/mip/download_miplib_test_dataset.sh @@ -26,7 +26,6 @@ INSTANCES=( "enlight11" "supportcase22" "pk1" - # Instances with significant presolve reductions (dual crush round-trip tests) "app1-1" "bnatt400" "bnatt500" @@ -47,7 +46,6 @@ INSTANCES=( "rocI-4-11" "traininstance2" "traininstance6" - # Medium-big instances with significant presolve reductions "neos-787933" "radiationm18-12-05" "momentum1"