From fc7aa0468c3e54c124c7932b2468e2e06024240d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 14:11:54 +0100 Subject: [PATCH 01/13] tmp --- .../linear_programming/cuopt/run_pdlp.cu | 2 +- compile.sh | 2 + .../initial_scaling.cu | 2 +- cpp/src/pdlp/pdhg.cu | 35 ++++- cpp/src/pdlp/pdlp.cu | 142 +++++++++++++++++- .../restart_strategy/pdlp_restart_strategy.cu | 6 +- cpp/src/pdlp/solve.cu | 3 + .../adaptive_step_size_strategy.cu | 108 +++++++++---- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 2 +- .../solver_settings/solver_settings.py | 38 +++++ .../linear_programming/data_definition.py | 5 + run_multiple.sh | 3 + test.py | 12 ++ 13 files changed, 318 insertions(+), 42 deletions(-) create mode 100755 compile.sh create mode 100755 run_multiple.sh create mode 100755 test.py diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 229c72a49b..c3d6ad42f4 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -107,7 +107,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + //settings.presolve = program.get("--presolve"); return settings; } diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000000..bedf3a7506 --- /dev/null +++ b/compile.sh @@ -0,0 +1,2 @@ +./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF +./build.sh cuopt cuopt_mps_parser \ No newline at end of file diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 031cd9c3b6..5a08c3bb53 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ 
b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for (const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 9a44bd31e3..c5efdcd722 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + 
cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,19 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +684,21 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + 
cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + + const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 67e001db29..e1ab866b5b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,10 @@ #include #include +#include #include +#include #include #include @@ -1406,10 +1408,27 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), "primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; + //printf("movement %lf\n", movement); + //printf("computed_interaction %lf\n", computed_interaction); + + 
cuopt_assert( + movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); #ifdef CUPDLP_DEBUG_MODE @@ -1790,6 +1809,68 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Validate reflected solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_reflected_primal().data() + + pdhg_solver_.get_reflected_primal().size(), + is_nan_or_inf{}), + "reflected_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_reflected_dual().data() + + pdhg_solver_.get_reflected_dual().size(), + is_nan_or_inf{}), + "reflected_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal/dual solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().data() + + pdhg_solver_.get_primal_solution().size(), + is_nan_or_inf{}), + "primal_solution contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().data() + + pdhg_solver_.get_dual_solution().size(), + is_nan_or_inf{}), + "dual_solution contains NaN or Inf in compute_fixed_error"); + + // Validate deltas have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + + 
pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + + pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal_weight and step_size have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + primal_weight_.data(), + primal_weight_.data() + primal_weight_.size(), + is_nan_or_inf{}), + "primal_weight_ contains NaN or Inf in compute_fixed_error"); + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + step_size_.data(), + step_size_.data() + step_size_.size(), + is_nan_or_inf{}), + "step_size_ contains NaN or Inf in compute_fixed_error"); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1894,49 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 + { + const auto bs = climber_strategies_.size(); + std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), + step_size_strategy_.get_norm_squared_delta_primal().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), + step_size_strategy_.get_norm_squared_delta_dual().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + 
RAFT_CUDA_TRY(cudaMemcpy( + h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY( + cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), + step_size_strategy_.get_interaction().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < bs; ++i) { + const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; + const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; + if (movement + comp_inter < f_t(0.0)) { + fprintf(stderr, + "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " + "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", + i, + (double)movement, + (double)comp_inter, + (double)(movement + comp_inter), + (double)h_nsq_dp[i], + (double)h_nsq_dd[i], + (double)h_pw[i], + (double)h_ss[i], + (double)h_inter[i]); + } + cuopt_assert(movement + comp_inter >= f_t(0.0), + "Host check: movement + computed_interaction must be >= 0"); + } + } + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1971,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + has_restarted[i] = false; } } } @@ -1869,6 +1999,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 
0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2076,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2764,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2816,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2829,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2838,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index bc12fb360f..a6304a8568 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -1995,14 +1995,14 @@ void 
pdlp_restart_strategy_t::solve_bound_constrained_trust_region( f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), - thrust::make_reverse_iterator(thrust::device_ptr(end)), - thrust::make_reverse_iterator(thrust::device_ptr(start)), + thrust::device_ptr(end), + thrust::device_ptr(start), -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::make_reverse_iterator(thrust::device_ptr(start))) { + if (highest_negInf_primal != thrust::device_ptr(start)) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a2766be98a..374c9ff513 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,6 +793,7 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { + std::cout << "Solving LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -841,6 +842,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + std::cout << "Solving batch PDLP" << std::endl; + for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 3c1b85aeac..47ba16a297 100644 --- 
a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -80,7 +82,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +92,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +102,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,7 +145,7 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), + <<>>(thrust::raw_pointer_cast(swap_pairs.data()), static_cast(swap_pairs.size()), make_span(interaction_), make_span(norm_squared_delta_primal_), @@ -159,9 +161,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - 
norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +278,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,7 +339,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), + <<<1, 1, 0, stream_view_.value()>>>(this->view(), primal_step_size.data(), dual_step_size.data(), pdhg_solver.get_d_total_pdhg_iterations().data()); @@ -345,7 +347,27 @@ void adaptive_step_size_strategy_t::compute_step_sizes( } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); +} + +template +__global__ void validate_interaction_and_movement_outputs( + raft::device_span norm_squared_delta_primal, + raft::device_span norm_squared_delta_dual, + raft::device_span interaction) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= 
norm_squared_delta_primal.size()) { return; } + cuopt_assert(!isnan(norm_squared_delta_primal[idx]), + "norm_squared_delta_primal is NaN after reduction"); + cuopt_assert(!isnan(norm_squared_delta_dual[idx]), + "norm_squared_delta_dual is NaN after reduction"); + cuopt_assert(!isnan(interaction[idx]), + "interaction is NaN after reduction"); + cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), + "norm_squared_delta_primal must be >= 0 after reduction"); + cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), + "norm_squared_delta_dual must be >= 0 after reduction"); } template @@ -382,7 +404,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // We need to make sure both dot products happens after previous operations (next_primal/dual) // Thus, we add another node in the main stream before starting the SpMVs - if (!batch_mode_) deltas_are_done_.record(stream_view_); + if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . 
(A @ x' - // A@x)) to save one SpMV @@ -406,7 +428,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +442,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -433,6 +455,31 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); + // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + tmp_primal.data(), + tmp_primal.data() + tmp_primal.size(), + is_nan_or_inf{}), + "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); + + // Validate delta_primal and delta_dual inputs have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal().data() + + current_saddle_point_state.get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_interaction_and_movement"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual().data() + + current_saddle_point_state.get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_interaction_and_movement"); + if (!batch_mode_) { // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( @@ -443,7 +490,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +500,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access + // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +510,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +520,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +533,7 @@ void 
adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +543,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +553,14 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); + + validate_interaction_and_movement_outputs + <<<1, climber_strategies_.size(), 0, stream_view_>>>( + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_), + make_span(interaction_)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..14180b5bfd 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -21,7 +21,7 @@ template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(true) { } diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 19db315349..4ec4a9aaf2 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,6 +207,44 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data + def 
set_mip_batch_pdlp_strong_branching(self, enable): + """ + Note: Only supported for MILP + + Toggle batch PDLP strong branching in the MIP solver. + + Parameters + ---------- + enable : bool + If True, enable batch PDLP strong branching (value 1). + If False, disable it (value 0). + + Examples + -------- + >>> settings.set_mip_batch_pdlp_strong_branching(True) + """ + self.set_parameter( + "mip_batch_pdlp_strong_branching", 1 if enable else 0 + ) + + def get_mip_batch_pdlp_strong_branching(self): + """ + Note: Only supported for MILP + + Get the current value of the batch PDLP strong branching setting. + + Returns + ------- + bool + True if batch PDLP strong branching is enabled, False otherwise. + + Examples + -------- + >>> settings.get_mip_batch_pdlp_strong_branching() + False + """ + return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) + def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 8412c745b5..59ea62089d 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -450,6 +450,11 @@ class SolverConfig(BaseModel): description="Set True to run heuristics only, False to run " "heuristics and branch and bound for MILP", ) + mip_batch_pdlp_strong_branching: Optional[int] = Field( + default=0, + description="Set 1 to enable batch PDLP strong branching " + "in the MIP solver, 0 to disable.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa diff --git a/run_multiple.sh b/run_multiple.sh new file mode 100755 index 0000000000..183b25b46e --- /dev/null +++ b/run_multiple.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + python test.py +done \ No newline at 
end of file diff --git a/test.py b/test.py new file mode 100755 index 0000000000..6cb236dae2 --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import cuopt_mps_parser +from cuopt.linear_programming import Solve, SolverSettings + +data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") + +settings = SolverSettings() +settings.set_mip_batch_pdlp_strong_branching(True) + +solution = Solve(data_model, settings) + +print(solution.get_termination_reason()) +print(solution.get_primal_objective()) From 1614bc14836050bbf30308c7c7edf140f352770e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 17:42:10 +0100 Subject: [PATCH 02/13] fix --- cpp/src/pdlp/pdlp.cu | 89 +++++++------------ .../restart_strategy/pdlp_restart_strategy.cu | 1 - .../convergence_information.cu | 66 ++++++++------ cpp/src/pdlp/utils.cuh | 20 +++-- 4 files changed, 82 insertions(+), 94 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e1ab866b5b..082299902d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1188,24 +1188,42 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { + auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto stream = vec.stream(); + auto n = static_cast(vec.size()); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; + cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1895,47 +1913,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& 
has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 - { - const auto bs = climber_strategies_.size(); - std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), - step_size_strategy_.get_norm_squared_delta_primal().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), - step_size_strategy_.get_norm_squared_delta_dual().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy( - h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), - step_size_strategy_.get_interaction().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < bs; ++i) { - const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; - const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; - if (movement + comp_inter < f_t(0.0)) { - fprintf(stderr, - "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " - "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", - i, - (double)movement, - (double)comp_inter, - (double)(movement + comp_inter), - (double)h_nsq_dp[i], - (double)h_nsq_dd[i], - (double)h_pw[i], - (double)h_ss[i], - (double)h_inter[i]); - } - cuopt_assert(movement + comp_inter >= f_t(0.0), - "Host check: movement + computed_interaction must be >= 0"); - } - } #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index a6304a8568..b2ed166a2d 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ 
b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 97e0e9c0e9..eec078f8d7 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); + 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index d48ae21c1a..9150ab8c51 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -604,15 +604,17 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, f_t* result, raft::handle_t const* handle_ptr) { - const f_t neutral = f_t(0.0); - 
thrust::device_ptr result_ptr(result); - - *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(), - input_vector.data(), - input_vector.data() + input_vector.size(), - abs_t{}, - neutral, - thrust::maximum()); + auto stream = handle_ptr->get_stream(); + auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); + auto n = static_cast(input_vector.size()); + + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max( + d_temp, temp_bytes, abs_iter, result, n, stream); + rmm::device_buffer temp_buf(temp_bytes, stream); + cub::DeviceReduce::Max( + temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From 4f3353191f6178ce02ffb3449daa9891ee8eb38e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:18:41 +0100 Subject: [PATCH 03/13] fix --- .../adaptive_step_size_strategy.cu | 25 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 123 ++++++++++++++++++ cpp/src/pdlp/utilities/ping_pong_graph.cuh | 87 ++----------- 3 files changed, 137 insertions(+), 98 deletions(-) create mode 100644 cpp/src/pdlp/utilities/ping_pong_graph.cu diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 47ba16a297..32e21cfbf6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -455,31 +455,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); - // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - tmp_primal.data(), - tmp_primal.data() + tmp_primal.size(), - is_nan_or_inf{}), - "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); - - // Validate delta_primal and delta_dual inputs have no NaN/Inf - 
cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_primal().data(), - current_saddle_point_state.get_delta_primal().data() + - current_saddle_point_state.get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_interaction_and_movement"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_dual().data(), - current_saddle_point_state.get_delta_dual().data() + - current_saddle_point_state.get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_interaction_and_movement"); - if (!batch_mode_) { // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..08045b47a1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,123 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +void ping_pong_graph_t::cancel_active_capture() +{ + CUOPT_LOG_ERROR( + "Canceling active capture in ping_pong_graph_t"); + if (capture_even_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + capture_even_active_ = false; + } + if (capture_odd_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + capture_odd_active_ = false; + } +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + // This should not happen, but in case a graph was capturing while destroying the object + if (capture_even_active_ || capture_odd_active_) { + cancel_active_capture(); + } + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef 
CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 14180b5bfd..9d6ead8cf7 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -17,83 +19,20 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // 
caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph -template + template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(true) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } - } -#endif - } - - void end_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); - even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); - odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - } - } -#endif - } + ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); + ~ping_pong_graph_t(); - void launch(i_t total_pdlp_iterations) 
- { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); - } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); - } - } -#endif - } - - bool is_initialized(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - return (total_pdlp_iterations % 2 == 0 && even_initialized) || - (total_pdlp_iterations % 2 == 1 && odd_initialized); - } -#endif - return false; - } + void start_capture(i_t total_pdlp_iterations); + void end_capture(i_t total_pdlp_iterations); + void launch(i_t total_pdlp_iterations); + bool is_initialized(i_t total_pdlp_iterations); private: + void cancel_active_capture(); + cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; @@ -101,7 +40,9 @@ class ping_pong_graph_t { rmm::cuda_stream_view stream_view_; bool even_initialized{false}; bool odd_initialized{false}; - // Temporary fix to disable cuda graph in legacy batch mode + bool capture_even_active_{false}; + bool capture_odd_active_{false}; bool is_legacy_batch_mode_{false}; }; + } // namespace cuopt::linear_programming::detail From e0a530ed449f7c904d651701aa066dd90a5edd54 Mon Sep 17 00:00:00 2001 From: Trevor McKay Date: Tue, 17 Feb 2026 13:49:33 -0500 Subject: [PATCH 04/13] workaround for thrust reverse iterator build error --- .../restart_strategy/pdlp_restart_strategy.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index b2ed166a2d..e42a05e1e6 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1990,18 +1991,18 @@ 
void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( threshold_.end(), std::numeric_limits::infinity()); // Easier / Cleaner than to do reverse iterator arithmetic - f_t* start = threshold_.data(); - f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; - auto highest_negInf_primal = - thrust::find(handle_ptr_->get_thrust_policy(), - thrust::device_ptr(end), - thrust::device_ptr(start), - -std::numeric_limits::infinity()); + f_t* start = threshold_.data(); + f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; + using rev_iter_t = thrust::reverse_iterator>; + auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), + rev_iter_t(thrust::device_ptr(end)), + rev_iter_t(thrust::device_ptr(start)), + -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::device_ptr(start)) { + if (highest_negInf_primal != rev_iter_t(thrust::device_ptr(start))) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); From e330718a2f40799de9a22615d1e99954bd821922 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:55:20 +0100 Subject: [PATCH 05/13] remove compile file --- compile.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 compile.sh diff --git a/compile.sh b/compile.sh deleted file mode 100755 index bedf3a7506..0000000000 --- a/compile.sh +++ /dev/null @@ -1,2 +0,0 @@ -./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF -./build.sh cuopt cuopt_mps_parser \ No newline at end of file From dce6d4fec3d5a2ae6d3313e4f1af2db4ce55ccea Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 15:56:09 +0100 Subject: [PATCH 06/13] fix --- .../linear_programming/cuopt/run_pdlp.cu | 20 ++++++--- cpp/src/branch_and_bound/pseudo_costs.cpp | 3 +- 
cpp/src/pdlp/CMakeLists.txt | 1 + cpp/src/pdlp/pdlp.cu | 3 -- .../adaptive_step_size_strategy.cu | 44 +------------------ .../adaptive_step_size_strategy.hpp | 9 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 16 +++---- .../solver_settings/solver_settings.py | 38 ---------------- 8 files changed, 26 insertions(+), 108 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index c3d6ad42f4..64897264c9 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. 
Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - //settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2ddc672750..e06f497bdc 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git 
a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 082299902d..eaafd1293e 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1440,9 +1440,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - //printf("movement %lf\n", movement); - //printf("computed_interaction %lf\n", computed_interaction); - cuopt_assert( movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 32e21cfbf6..d491106aaf 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,14 +28,10 @@ #include -#include - #include namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -47,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - 
dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -350,26 +342,6 @@ void adaptive_step_size_strategy_t::compute_step_sizes( RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } -template -__global__ void validate_interaction_and_movement_outputs( - raft::device_span norm_squared_delta_primal, - raft::device_span norm_squared_delta_dual, - raft::device_span interaction) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= norm_squared_delta_primal.size()) { return; } - cuopt_assert(!isnan(norm_squared_delta_primal[idx]), - "norm_squared_delta_primal is NaN after reduction"); - cuopt_assert(!isnan(norm_squared_delta_dual[idx]), - "norm_squared_delta_dual is NaN after reduction"); - cuopt_assert(!isnan(interaction[idx]), - "interaction is NaN after reduction"); - cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), - "norm_squared_delta_primal must be >= 0 after reduction"); - cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), - "norm_squared_delta_dual must be >= 0 after reduction"); -} - template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, @@ -393,7 +365,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . 
A @ (x' - x) @@ -401,11 +373,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . (A_t @ y' - A_t @ y) @@ -475,8 +442,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access - // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -529,13 +494,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( climber_strategies_.size(), dual_size_, stream_view_.value()); - - validate_interaction_and_movement_outputs - <<<1, climber_strategies_.size(), 0, stream_view_>>>( - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_), - make_span(interaction_)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - 
rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 08045b47a1..647672e535 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -58,11 +58,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,15 +76,15 @@ void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - 
RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); } @@ -98,9 +98,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 4ec4a9aaf2..19db315349 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,44 +207,6 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data - def set_mip_batch_pdlp_strong_branching(self, enable): - """ - Note: Only supported for MILP - - Toggle batch PDLP strong branching in the MIP solver. - - Parameters - ---------- - enable : bool - If True, enable batch PDLP strong branching (value 1). - If False, disable it (value 0). - - Examples - -------- - >>> settings.set_mip_batch_pdlp_strong_branching(True) - """ - self.set_parameter( - "mip_batch_pdlp_strong_branching", 1 if enable else 0 - ) - - def get_mip_batch_pdlp_strong_branching(self): - """ - Note: Only supported for MILP - - Get the current value of the batch PDLP strong branching setting. - - Returns - ------- - bool - True if batch PDLP strong branching is enabled, False otherwise. 
- - Examples - -------- - >>> settings.get_mip_batch_pdlp_strong_branching() - False - """ - return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) - def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP From 9c03faf0f2e5e5d44670507c8ac348ca3f142659 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:28:11 +0100 Subject: [PATCH 07/13] final cleanup --- cpp/src/pdlp/pdlp.cu | 60 ------------------------------------------- cpp/src/pdlp/solve.cu | 3 --- run_multiple.sh | 3 --- test.py | 12 --------- 4 files changed, 78 deletions(-) delete mode 100755 run_multiple.sh delete mode 100755 test.py diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eaafd1293e..4b4eed1f32 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1825,66 +1825,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Validate reflected solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_reflected_primal().data() + - pdhg_solver_.get_reflected_primal().size(), - is_nan_or_inf{}), - "reflected_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_reflected_dual().data() + - pdhg_solver_.get_reflected_dual().size(), - is_nan_or_inf{}), - "reflected_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal/dual solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data() + - pdhg_solver_.get_primal_solution().size(), - is_nan_or_inf{}), - "primal_solution contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - 
!thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_dual_solution().data(), - pdhg_solver_.get_dual_solution().data() + - pdhg_solver_.get_dual_solution().size(), - is_nan_or_inf{}), - "dual_solution contains NaN or Inf in compute_fixed_error"); - - // Validate deltas have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + - pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + - pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal_weight and step_size have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - primal_weight_.data(), - primal_weight_.data() + primal_weight_.size(), - is_nan_or_inf{}), - "primal_weight_ contains NaN or Inf in compute_fixed_error"); - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - step_size_.data(), - step_size_.data() + step_size_.size(), - is_nan_or_inf{}), - "step_size_ contains NaN or Inf in compute_fixed_error"); // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 374c9ff513..a2766be98a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,7 +793,6 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { - std::cout << "Solving 
LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -842,8 +841,6 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - std::cout << "Solving batch PDLP" << std::endl; - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/run_multiple.sh b/run_multiple.sh deleted file mode 100755 index 183b25b46e..0000000000 --- a/run_multiple.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - python test.py -done \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100755 index 6cb236dae2..0000000000 --- a/test.py +++ /dev/null @@ -1,12 +0,0 @@ -import cuopt_mps_parser -from cuopt.linear_programming import Solve, SolverSettings - -data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") - -settings = SolverSettings() -settings.set_mip_batch_pdlp_strong_branching(True) - -solution = Solve(data_model, settings) - -print(solution.get_termination_reason()) -print(solution.get_primal_objective()) From 6c2fe356a9ffc0994f0e7b177233504aa73c1c7d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:39:07 +0100 Subject: [PATCH 08/13] additional cleanup --- cpp/src/pdlp/pdhg.cu | 4 ++++ cpp/src/pdlp/utils.cuh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index c5efdcd722..286d6de5b5 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -652,6 +652,8 @@ struct primal_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); cuopt_assert(!isnan(primal_val), "primal_solution is NaN in
primal_reflected_projection"); cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); f_t reflected = primal_val - step_size * (obj_coef - aty_val); @@ -688,6 +690,8 @@ struct dual_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 9150ab8c51..0f2ed44c42 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -606,7 +606,7 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, { auto stream = handle_ptr->get_stream(); auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); - auto n = static_cast(input_vector.size()); + auto n = input_vector.size(); void* d_temp = nullptr; size_t temp_bytes = 0; From a43dc0c78b0d8bd833e8c0b05d8e6e46b0483703 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:43:06 +0100 Subject: [PATCH 09/13] address PR comments, add tests, update doc --- cpp/src/pdlp/pdlp.cu | 33 ++++++++------- .../convergence_information.cu | 8 ++-- cpp/src/pdlp/utilities/ping_pong_graph.cu | 41 +++++-------------- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 1 - docs/cuopt/source/lp-qp-milp-settings.rst | 10 +++++ .../linear_programming/test_python_API.py | 33 +++++++++++++++ 6 files changed, 75 insertions(+), 51 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu 
b/cpp/src/pdlp/pdlp.cu index 4b4eed1f32..eebfede7f9 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1192,8 +1192,10 @@ static void compute_stats(const rmm::device_uvector& vec, return x == 0 ? std::numeric_limits::max() : abs(x); }; + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + auto stream = vec.stream(); - auto n = static_cast(vec.size()); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1203,23 +1205,23 @@ static void compute_stats(const rmm::device_uvector& vec, auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); void* d_temp = nullptr; - size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; - cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, 
cuda::std::plus<>{}, f_t(0), stream); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); smallest = d_smallest.value(stream); largest = d_largest.value(stream); @@ -1444,7 +1446,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index eec078f8d7..269cb58f5d 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -404,19 +404,19 @@ void convergence_information_t::compute_convergence_information( relative_residual_t{settings.tolerances.relative_primal_tolerance}); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - stream_view_); + stream_view_)); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - cub::DeviceReduce::Max(temp_buf.data(), + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - 
stream_view_); + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 647672e535..5effbcdc48 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -20,32 +20,11 @@ ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, { } -template -void ping_pong_graph_t::cancel_active_capture() -{ - CUOPT_LOG_ERROR( - "Canceling active capture in ping_pong_graph_t"); - if (capture_even_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - capture_even_active_ = false; - } - if (capture_odd_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - capture_odd_active_ = false; - } -} - template ping_pong_graph_t::~ping_pong_graph_t() { #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { - // This should not happen, but in case a graph was capturing while destroying the object - if (capture_even_active_ || capture_odd_active_) { - cancel_active_capture(); - } if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } } @@ -58,11 +37,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,17 +55,17 @@ void 
ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); } } #endif @@ -98,9 +77,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 9d6ead8cf7..5113f804d6 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -31,7 +31,6 @@ class ping_pong_graph_t { 
bool is_initialized(i_t total_pdlp_iterations); private: - void cancel_active_capture(); cudaGraph_t even_graph; cudaGraph_t odd_graph; diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst index 51c6142c2b..bd1372f70e 100644 --- a/docs/cuopt/source/lp-qp-milp-settings.rst +++ b/docs/cuopt/source/lp-qp-milp-settings.rst @@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching. Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times. .. note:: The default value is ``-1`` (automatic). + +Batch PDLP Strong Branching +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root. +When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling. +Set this value to 0 to disable batched PDLP strong branching. +Set this value to 1 to enable batched PDLP strong branching. + +.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem. 
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py index dc470f3828..0eca50ba9b 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -30,6 +30,7 @@ CUOPT_ELIMINATE_DENSE_COLUMNS, CUOPT_FOLDING, CUOPT_INFEASIBILITY_DETECTION, + CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, CUOPT_MIP_CUT_PASSES, CUOPT_METHOD, CUOPT_ORDERING, @@ -997,3 +998,35 @@ def test_cuts(): assert problem.Status.name == "Optimal" assert problem.ObjValue == pytest.approx(-126, abs=1e-3) assert problem.SolutionStats.num_nodes == 0 + + +def test_batch_pdlp_strong_branching(): + # Minimize - 86*y1 - 4*y2 - 40*y3 + # subject to 774*y1 + 76*y2 + 42*y3 <= 875 + # 67*y1 + 27*y2 + 53*y3 <= 875 + # y1, y2, y3 in {0, 1} + + problem = Problem() + y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1") + y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2") + y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3") + + problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875) + problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875) + + problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3) + + settings = SolverSettings() + settings.set_parameter(CUOPT_PRESOLVE, 0) + settings.set_parameter(CUOPT_TIME_LIMIT, 10) + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) + + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) From c8b8b74bc6cc5f65df94f868a0334464e360c374 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:44:54 +0100 Subject: [PATCH 10/13] format --- cpp/src/pdlp/pdlp.cu | 36 ++++++++++------- 
.../adaptive_step_size_strategy.cu | 17 ++++---- .../convergence_information.cu | 40 +++++++++---------- cpp/src/pdlp/utilities/ping_pong_graph.cu | 6 +-- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 3 +- cpp/src/pdlp/utils.cuh | 10 ++--- 6 files changed, 58 insertions(+), 54 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eebfede7f9..72aead03d0 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,8 @@ #include #include -#include #include +#include #include #include @@ -1188,14 +1188,13 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? std::numeric_limits::max() : abs(x); }; cuopt_assert(vec.size() > 0, "Vector must not be empty"); auto stream = vec.stream(); - size_t n = vec.size(); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1206,8 +1205,14 @@ static void compute_stats(const rmm::device_uvector& vec, void* d_temp = nullptr; size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1216,8 +1221,14 @@ static void compute_stats(const rmm::device_uvector& vec, size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - 
temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1442,9 +1453,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert( - movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); @@ -1828,7 +1838,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1853,7 +1862,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d491106aaf..47e9a78a5e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -137,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -332,9 +333,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction <<<1, 1, 0, stream_view_.value()>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 269cb58f5d..1e9a69d130 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -405,18 +405,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -465,18 +465,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 5effbcdc48..4ec5bff8c1 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -37,12 +37,10 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + 
RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 5113f804d6..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -19,7 +19,7 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph - template +template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); @@ -31,7 +31,6 @@ class ping_pong_graph_t { bool is_initialized(i_t total_pdlp_iterations); private: - cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 0f2ed44c42..33625f7680 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -608,13 +608,11 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); auto n = input_vector.size(); - void* d_temp = nullptr; - size_t temp_bytes = 0; - cub::DeviceReduce::Max( - d_temp, temp_bytes, abs_iter, result, n, stream); + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream); rmm::device_buffer temp_buf(temp_bytes, stream); - cub::DeviceReduce::Max( - temp_buf.data(), temp_bytes, abs_iter, result, n, stream); + cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From b1be5bb2d0b729ddb685b722ad50c41f5a32aa58 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:45:01 
+0100 Subject: [PATCH 11/13] format --- benchmarks/linear_programming/cuopt/run_pdlp.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 64897264c9..e9b4f8296c 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -71,9 +71,9 @@ static void parse_arguments(argparse::ArgumentParser& program) "modes."); program.add_argument("--presolver") - .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") - .default_value("Default") - .choices("None", "Papilo", "PSLP", "Default"); + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } From d89af961b3d8af72d22f027114ac1921809c3ccc Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:53:59 +0100 Subject: [PATCH 12/13] style --- .../restart_strategy/pdlp_restart_strategy.cu | 827 +++++++++--------- 1 file changed, 415 insertions(+), 412 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index cf715e8a1d..0b1c109185 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,462 +2008,465 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + if (lowest_inf != end) { std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_.data(); // no - 1 to go before the first inf because 
end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the 
test_threshold that should minimize the objective value of the - // solution. - - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } + index_end_primal = + thrust::raw_pointer_cast(lowest_inf) - + threshold_ + .data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 
dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop + // { + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); + // } + } - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. 
- // Used unsorted_direction_full_ as the other one got sorted + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + // } } - // Compute the current lower 
bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr - // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); - - // } -} - -template -void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t& duality_gap, - rmm::device_uvector& primal_weight, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), 
duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} + template + void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t & duality_gap, + rmm::device_uvector & primal_weight, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } -template -void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view) -{ - raft::common::nvtx::range fun_scope("compute_primal_gradient"); + template + void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view) + { + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add 
problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); -} + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } -template -__global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) -{ - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; - - f_t subgradient_coefficient; - - if (dual_solution < 
f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { - subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = upper; - } else { - if (primal_product < lower) { + template + __global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) + { + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { subgradient_coefficient = lower; - } else if (primal_product > upper) { + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { subgradient_coefficient = upper; } else { - subgradient_coefficient = primal_product; + if (primal_product < lower) { + subgradient_coefficient = lower; + } else if (primal_product > upper) { + subgradient_coefficient = upper; + } else { + subgradient_coefficient = primal_product; + } } - } - subgradient[id] = subgradient_coefficient; -} + subgradient[id] = subgradient_coefficient; + } -template -void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t& duality_gap, - 
cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + template + void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); -} + // b - A*x + // is changed with the introduction of constraint upper and lower bounds + + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + 
cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); + } -template -void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + template + void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // 
problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + 
duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); -} + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); + } -template -void pdlp_restart_strategy_t::reset_internal() -{ - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); -} + template + void pdlp_restart_strategy_t::reset_internal() + { + candidate_is_avg_.set_value_to_zero_async(stream_view_); + 
restart_triggered_.set_value_to_zero_async(stream_view_); + } -template -typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() -{ - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; + template + typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() + { + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = 
raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), - shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{ + shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; -} + return v; + } -template -typename pdlp_restart_strategy_t::cupdlpx_restart_view_t -pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& 
dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) -{ - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; -} + template + typename pdlp_restart_strategy_t::cupdlpx_restart_view_t + pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) + { + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = 
make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; + } -template -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const -{ - return weighted_average_solution_.get_iterations_since_last_restart(); -} + template + i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const + { + return weighted_average_solution_.get_iterations_since_last_restart(); + } -template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) -{ - last_restart_was_average_ = value; -} + template + void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) + { + last_restart_was_average_ = value; + } -template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const -{ - return last_restart_was_average_; -} + template + bool pdlp_restart_strategy_t::get_last_restart_was_average() const + { + return last_restart_was_average_; + } #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2520,11 +2523,11 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT -INSTANTIATE(float) + INSTANTIATE(float) #endif #if 
MIP_INSTANTIATE_DOUBLE -INSTANTIATE(double) + INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From c7e3e222d063b98907efa2f5f81e31d84f231f10 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 15:03:34 +0100 Subject: [PATCH 13/13] put back changes in restart --- .../restart_strategy/pdlp_restart_strategy.cu | 831 +++++++++--------- 1 file changed, 416 insertions(+), 415 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 0b1c109185..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,465 +2008,466 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + testing_range_low_.set_value_async(index_start_primal, stream_view_); + } else // No negInf found, start is 0 + testing_range_low_.set_value_async(index_start_primal, stream_view_); + if (lowest_inf != end) { + cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*lowest_inf)) == std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_ - .data(); // no - 1 to go before the first inf because end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not 
contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the test_threshold that should minimize the objective value of the - // solution. 
- - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } - - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. - // Used unsorted_direction_full_ as the other one got sorted - // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); - // } - } - - // Compute the current lower bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr + index_end_primal = + 
thrust::raw_pointer_cast(lowest_inf) - + threshold_.data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + 
threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); // } } - template - void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t & duality_gap, - rmm::device_uvector & primal_weight, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - 
distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr + // { + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + + // } +} - template - void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view) - { - raft::common::nvtx::range fun_scope("compute_primal_gradient"); +template +void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t& duality_gap, + rmm::device_uvector& primal_weight, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for 
primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view) +{ + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); - - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - } - - template - __global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const 
typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) - { - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - f_t subgradient_coefficient; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); +} - if (dual_solution < f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { +template +__global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) +{ + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = 
duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { + subgradient_coefficient = lower; + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = upper; + } else { + if (primal_product < lower) { subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { + } else if (primal_product > upper) { subgradient_coefficient = upper; } else { - if (primal_product < lower) { - subgradient_coefficient = lower; - } else if (primal_product > upper) { - subgradient_coefficient = upper; - } else { - subgradient_coefficient = primal_product; - } + subgradient_coefficient = primal_product; } - - subgradient[id] = subgradient_coefficient; } - template - void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + subgradient[id] = subgradient_coefficient; +} + +template +void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - 
reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); - } + // b - A*x + // is changed with the introduction of constraint upper and lower bounds - template - void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - 
primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); +} + +template +void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - 
RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + 
dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); - } + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); +} - template - void pdlp_restart_strategy_t::reset_internal() - { - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); - } +template +void pdlp_restart_strategy_t::reset_internal() +{ + candidate_is_avg_.set_value_to_zero_async(stream_view_); + restart_triggered_.set_value_to_zero_async(stream_view_); +} - template - typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() - { - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; +template +typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() +{ + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + 
v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - 
v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{ - shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), + shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; - } + return v; +} - template - typename pdlp_restart_strategy_t::cupdlpx_restart_view_t - pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) - { - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - 
current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; - } +template +typename pdlp_restart_strategy_t::cupdlpx_restart_view_t +pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) +{ + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + 
v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; +} - template - i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const - { - return weighted_average_solution_.get_iterations_since_last_restart(); - } +template +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +{ + return weighted_average_solution_.get_iterations_since_last_restart(); +} - template - void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) - { - last_restart_was_average_ = value; - } +template +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +{ + last_restart_was_average_ = value; +} - template - bool pdlp_restart_strategy_t::get_last_restart_was_average() const - { - return last_restart_was_average_; - } +template +bool pdlp_restart_strategy_t::get_last_restart_was_average() const +{ + return last_restart_was_average_; +} #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2523,11 +2524,11 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT - INSTANTIATE(float) +INSTANTIATE(float) #endif #if MIP_INSTANTIATE_DOUBLE - INSTANTIATE(double) +INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail