diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index d04979080a..e76a77dba6 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp 
b/cpp/src/branch_and_bound/pseudo_costs.cpp index db72bbb323..095f462382 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 0adbc3809f..afa3ee5fb7 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for 
(const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index a92ce8e8ef..51e0b29381 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = 
cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,21 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +686,23 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + + const f_t tmp = 
current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 012b987eea..d79cc6c3a2 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -35,7 +35,9 @@ #include #include +#include +#include #include #include @@ -1186,24 +1188,55 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; - - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; + + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + + auto stream = vec.stream(); + size_t n = vec.size(); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1406,11 +1439,25 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), 
"primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); @@ -1790,6 +1837,7 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1861,7 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1896,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + 
cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + has_restarted[i] = false; } } } @@ -1869,6 +1924,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2001,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 
0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2689,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2741,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2754,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2763,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 5723cf78c1..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,11 +34,11 @@ #include #include #include +#include #include #include #include #include -#include #include diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 329a1ec465..24ef29b243 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ 
-32,8 +32,6 @@ namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -45,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -80,7 +74,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +84,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +94,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - 
static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -159,9 +154,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +271,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,15 +332,15 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), - primal_step_size.data(), - 
dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + <<<1, 1, 0, stream_view_.value()>>>(this->view(), + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } template @@ -371,7 +366,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . A @ (x' - x) @@ -379,11 +374,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . 
(A_t @ y' - A_t @ y) @@ -406,7 +396,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +410,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -443,7 +433,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +443,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +451,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +461,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - 
dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +474,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +484,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +494,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git 
a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 464178e40b..9b01608b47 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), + 
temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..4ec5bff8c1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,100 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void 
ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -20,78 +22,13 @@ namespace cuopt::linear_programming::detail { template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && 
!odd_initialized) {
-        RAFT_CUDA_TRY(
-          cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal));
-      }
-    }
-#endif
-  }
+  ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false);
+  ~ping_pong_graph_t();
 
-  void end_capture(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      if (total_pdlp_iterations % 2 == 0 && !even_initialized) {
-        RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph));
-        RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph));
-        even_initialized = true;
-        RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph));
-      } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) {
-        RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph));
-        RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph));
-        odd_initialized = true;
-        RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph));
-      }
-    }
-#endif
-  }
-
-  void launch(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      if (total_pdlp_iterations % 2 == 0 && even_initialized) {
-        RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value()));
-      } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) {
-        RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value()));
-      }
-    }
-#endif
-  }
-
-  bool is_initialized(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      return (total_pdlp_iterations % 2 == 0 && even_initialized) ||
-             (total_pdlp_iterations % 2 == 1 && odd_initialized);
-    }
-#endif
-    return false;
-  }
+  void start_capture(i_t total_pdlp_iterations);
+  void end_capture(i_t total_pdlp_iterations);
+  void launch(i_t total_pdlp_iterations);
+  bool is_initialized(i_t total_pdlp_iterations);
 
  private:
   cudaGraph_t even_graph;
@@ -101,7 +38,9 @@
   rmm::cuda_stream_view stream_view_;
   bool even_initialized{false};
   bool odd_initialized{false};
-  // Temporary fix to
disable cuda graph in legacy batch mode
+  bool capture_even_active_{false};
+  bool capture_odd_active_{false};
   bool is_legacy_batch_mode_{false};
 };
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh
index d48ae21c1a..33625f7680 100644
--- a/cpp/src/pdlp/utils.cuh
+++ b/cpp/src/pdlp/utils.cuh
@@ -604,15 +604,15 @@ void inline my_inf_norm(const rmm::device_uvector<f_t>& input_vector,
                         f_t* result,
                         raft::handle_t const* handle_ptr)
 {
-  const f_t neutral = f_t(0.0);
-  thrust::device_ptr<f_t> result_ptr(result);
-
-  *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(),
-                                         input_vector.data(),
-                                         input_vector.data() + input_vector.size(),
-                                         abs_t<f_t>{},
-                                         neutral,
-                                         thrust::maximum<f_t>());
+  auto stream   = handle_ptr->get_stream();
+  auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t<f_t>{});
+  auto n        = input_vector.size();
+
+  void* d_temp      = nullptr;
+  size_t temp_bytes = 0;
+  cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream);
+  rmm::device_buffer temp_buf(temp_bytes, stream);
+  cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream);
 }
 
 template
diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst
index 51c6142c2b..bd1372f70e 100644
--- a/docs/cuopt/source/lp-qp-milp-settings.rst
+++ b/docs/cuopt/source/lp-qp-milp-settings.rst
@@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching.
 Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times.
 
 .. note:: The default value is ``-1`` (automatic).
+
+Batch PDLP Strong Branching
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root.
+When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling.
+Set this value to 0 to disable batched PDLP strong branching.
+Set this value to 1 to enable batched PDLP strong branching.
+
+.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem.
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
index dc470f3828..0eca50ba9b 100644
--- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
+++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
@@ -30,6 +30,7 @@
     CUOPT_ELIMINATE_DENSE_COLUMNS,
     CUOPT_FOLDING,
     CUOPT_INFEASIBILITY_DETECTION,
+    CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING,
     CUOPT_MIP_CUT_PASSES,
     CUOPT_METHOD,
     CUOPT_ORDERING,
@@ -997,3 +998,35 @@
     assert problem.Status.name == "Optimal"
     assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
     assert problem.SolutionStats.num_nodes == 0
+
+
+def test_batch_pdlp_strong_branching():
+    # Minimize - 86*y1 - 4*y2 - 40*y3
+    # subject to 774*y1 + 76*y2 + 42*y3 <= 875
+    #            67*y1 + 27*y2 + 53*y3 <= 875
+    #            y1, y2, y3 in {0, 1}
+
+    problem = Problem()
+    y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1")
+    y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2")
+    y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3")
+
+    problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875)
+    problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875)
+
+    problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3)
+
+    settings = SolverSettings()
+    settings.set_parameter(CUOPT_PRESOLVE, 0)
+    settings.set_parameter(CUOPT_TIME_LIMIT, 10)
+    settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0)
+
+    problem.solve(settings)
+    assert
problem.Status.name == "Optimal"
+    assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
+
+    settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1)
+
+    problem.solve(settings)
+    assert problem.Status.name == "Optimal"
+    assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
index 8412c745b5..59ea62089d 100644
--- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
+++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
@@ -450,6 +450,11 @@ class SolverConfig(BaseModel):
         description="Set True to run heuristics only, False to run "
         "heuristics and branch and bound for MILP",
     )
+    mip_batch_pdlp_strong_branching: Optional[int] = Field(
+        default=0,
+        description="Set 1 to enable batch PDLP strong branching "
+        "in the MIP solver, 0 to disable.",
+    )
     num_cpu_threads: Optional[int] = Field(
         default=None,
         description="Set the number of CPU threads to use for branch and bound.",  # noqa