From 4c6704f748493a3290a534e3a72ebc3ec93ca645 Mon Sep 17 00:00:00 2001
From: akifcorduk <akifcorduk@gmail.com>
Date: Thu, 2 Oct 2025 08:48:56 -0700
Subject: [PATCH 01/27] remove left-over kernels and fix logs

---
 .../local_search/rounding/constraint_prop.cu  |   2 +-
 cpp/src/mip/presolve/spmv_kernels.cuh         | 229 ------------------
 cpp/src/mip/presolve/third_party_presolve.cpp |  20 +-
 3 files changed, 11 insertions(+), 240 deletions(-)
 delete mode 100644 cpp/src/mip/presolve/spmv_kernels.cuh
diff --git a/cpp/src/mip/local_search/rounding/constraint_prop.cu b/cpp/src/mip/local_search/rounding/constraint_prop.cu
index 4dfd1b216b..76d3916df6 100644
--- a/cpp/src/mip/local_search/rounding/constraint_prop.cu
+++ b/cpp/src/mip/local_search/rounding/constraint_prop.cu
@@ -907,7 +907,7 @@ bool constraint_prop_t<i_t, f_t>::find_integer(
 
   CUOPT_LOG_DEBUG("Bounds propagation rounding: unset vars %lu", unset_integer_vars.size());
   if (unset_integer_vars.size() == 0) {
-    CUOPT_LOG_ERROR("No integer variables provided in the bounds prop rounding");
+    CUOPT_LOG_DEBUG("No integer variables provided in the bounds prop rounding");
     expand_device_copy(orig_sol.assignment, sol.assignment, sol.handle_ptr->get_stream());
     cuopt_func_call(orig_sol.test_variable_bounds());
     return orig_sol.compute_feasibility();
diff --git a/cpp/src/mip/presolve/spmv_kernels.cuh b/cpp/src/mip/presolve/spmv_kernels.cuh
deleted file mode 100644
index 3f11a8f365..0000000000
--- a/cpp/src/mip/presolve/spmv_kernels.cuh
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights
- * reserved. SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/binary_search.h>
-#include <mip/problem/problem.cuh>
-
-namespace cuopt::linear_programming::detail {
-
-template <typename i_t, typename f_t, i_t MAX_EDGE_PER_CNST, typename view_t>
-__device__ f_t spmv(view_t view, raft::device_span<f_t> input, i_t tid, i_t beg, i_t end)
-{
-  f_t out = 0.;
-  for (i_t i = tid + beg; i < end; i += MAX_EDGE_PER_CNST) {
-    auto coeff = view.coeff[i];
-    auto var   = view.elem[i];
-    auto in    = input[var];
-    out += coeff * in;
-  }
-  return out;
-}
-
-template <typename i_t, typename f_t, i_t BDIM, typename view_t>
-__global__ void lb_spmv_heavy_kernel(i_t id_range_beg,
-                                     raft::device_span<const i_t> ids,
-                                     raft::device_span<const i_t> pseudo_block_ids,
-                                     i_t work_per_block,
-                                     view_t view,
-                                     raft::device_span<f_t> input,
-                                     raft::device_span<f_t> tmp_out)
-{
-  auto idx             = ids[blockIdx.x] + id_range_beg;
-  auto pseudo_block_id = pseudo_block_ids[blockIdx.x];
-  i_t item_off_beg     = view.offsets[idx] + work_per_block * pseudo_block_id;
-  i_t item_off_end     = min(item_off_beg + work_per_block, view.offsets[idx + 1]);
-
-  typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  auto out = spmv<i_t, f_t, BDIM>(view, input, threadIdx.x, item_off_beg, item_off_end);
-
-  out = BlockReduce(temp_storage).Sum(out);
-
-  if (threadIdx.x == 0) { tmp_out[blockIdx.x] = out; }
-}
-
-template <typename i_t, typename f_t, typename view_t>
-__global__ void finalize_spmv_kernel(i_t heavy_beg_id,
-                                     raft::device_span<const i_t> item_offsets,
-                                     raft::device_span<f_t> tmp_out,
-                                     view_t view,
-                                     raft::device_span<f_t> output)
-{
-  using warp_reduce = cub::WarpReduce<f_t>;
-  __shared__ typename warp_reduce::TempStorage temp_storage;
-  i_t idx      = heavy_beg_id + blockIdx.x;
-  i_t item_idx = view.reorg_ids[idx];
-
-  i_t item_off_beg = item_offsets[blockIdx.x];
-  i_t item_off_end = item_offsets[blockIdx.x + 1];
-  f_t out          = 0.;
-  for (i_t i = threadIdx.x + item_off_beg; i < item_off_end; i += blockDim.x) {
-    out += tmp_out[i];
-  }
-  out = warp_reduce(temp_storage).Sum(out);
-  if (threadIdx.x == 0) { output[item_idx] = out; }
-}
-
-template <typename i_t, typename f_t, i_t BDIM, typename view_t>
-__global__ void lb_spmv_block_kernel(i_t id_range_beg,
-                                     view_t view,
-                                     raft::device_span<f_t> input,
-                                     raft::device_span<f_t> output)
-
-{
-  i_t idx          = id_range_beg + blockIdx.x;
-  i_t item_idx     = view.reorg_ids[idx];
-  i_t item_off_beg = view.offsets[idx];
-  i_t item_off_end = view.offsets[idx + 1];
-
-  typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  auto out = spmv<i_t, f_t, BDIM>(view, input, threadIdx.x, item_off_beg, item_off_end);
-
-  out = BlockReduce(temp_storage).Sum(out);
-
-  if (threadIdx.x == 0) {
-    // written to old index
-    output[item_idx] = out;
-  }
-}
-
-template <typename i_t, typename f_t, i_t BDIM, i_t MAX_EDGE_PER_CNST, typename view_t>
-__global__ void lb_spmv_sub_warp_kernel(i_t id_range_beg,
-                                        i_t id_range_end,
-                                        view_t view,
-                                        raft::device_span<f_t> input,
-                                        raft::device_span<f_t> output)
-{
-  constexpr i_t ids_per_block = BDIM / MAX_EDGE_PER_CNST;
-  i_t id_beg                  = blockIdx.x * ids_per_block + id_range_beg;
-  i_t idx                     = id_beg + (threadIdx.x / MAX_EDGE_PER_CNST);
-  i_t item_idx;
-  if (idx < id_range_end) { item_idx = view.reorg_ids[idx]; }
-  i_t p_tid = threadIdx.x % MAX_EDGE_PER_CNST;
-
-  i_t head_flag = (p_tid == 0);
-
-  using warp_reduce = cub::WarpReduce<f_t, MAX_EDGE_PER_CNST>;
-  __shared__ typename warp_reduce::TempStorage temp_storage;
-
-  f_t out = 0.;
-
-  if (idx < id_range_end) {
-    i_t item_off_beg = view.offsets[idx];
-    i_t item_off_end = view.offsets[idx + 1];
-    out = spmv<i_t, f_t, MAX_EDGE_PER_CNST>(view, input, p_tid, item_off_beg, item_off_end);
-  }
-
-  out = warp_reduce(temp_storage).Sum(out);
-
-  if (head_flag && (idx < id_range_end)) { output[item_idx] = out; }
-}
-
-#if 1
-
-#define BYTE_TO_BINARY(byte)                                                               \
-  ((byte) & 0x80 ? '1' : '0'), ((byte) & 0x40 ? '1' : '0'), ((byte) & 0x20 ? '1' : '0'),   \
-    ((byte) & 0x10 ? '1' : '0'), ((byte) & 0x08 ? '1' : '0'), ((byte) & 0x04 ? '1' : '0'), \
-    ((byte) & 0x02 ? '1' : '0'), ((byte) & 0x01 ? '1' : '0')
-
-template <typename i_t>
-__device__ __forceinline__ void get_sub_warp_bin(i_t* id_warp_beg,
-                                                 i_t* id_range_end,
-                                                 i_t* t_p_v,
-                                                 raft::device_span<i_t> warp_offsets,
-                                                 raft::device_span<i_t> bin_offsets)
-{
-  i_t warp_id = (blockDim.x * blockIdx.x + threadIdx.x) / 32;
-  i_t lane_id = threadIdx.x & 31;
-  bool pred   = false;
-  if (lane_id < warp_offsets.size()) { pred = (warp_id >= warp_offsets[lane_id]); }
-  unsigned int m  = __ballot_sync(0xffffffff, pred);
-  i_t seg         = 31 - __clz(m);
-  i_t it_per_warp = (1 << (5 - seg));  // item per warp = 32/(2^seg)
-  if (5 - seg < 0) {
-    *t_p_v = 0;
-    return;
-  }
-  i_t beg       = bin_offsets[seg] + (warp_id - warp_offsets[seg]) * it_per_warp;
-  i_t end       = bin_offsets[seg + 1];
-  *id_warp_beg  = beg;
-  *id_range_end = end;
-  *t_p_v        = (1 << seg);
-}
-
-template <typename i_t, typename f_t, i_t BDIM, i_t MAX_EDGE_PER_CNST, typename view_t>
-__device__ void spmv_sub_warp(i_t id_warp_beg,
-                              i_t id_range_end,
-                              view_t view,
-                              raft::device_span<f_t> input,
-                              raft::device_span<f_t> output)
-{
-  i_t lane_id = (threadIdx.x & 31);
-  i_t idx     = id_warp_beg + (lane_id / MAX_EDGE_PER_CNST);
-  i_t item_idx;
-  if (idx < id_range_end) { item_idx = view.reorg_ids[idx]; }
-  i_t p_tid = lane_id & (MAX_EDGE_PER_CNST - 1);
-
-  i_t head_flag = (p_tid == 0);
-
-  using warp_reduce = cub::WarpReduce<f_t, MAX_EDGE_PER_CNST>;
-  __shared__ typename warp_reduce::TempStorage temp_storage;
-
-  f_t out = 0.;
-
-  if (idx < id_range_end) {
-    i_t item_off_beg = view.offsets[idx];
-    i_t item_off_end = view.offsets[idx + 1];
-    out = spmv<i_t, f_t, MAX_EDGE_PER_CNST>(view, input, p_tid, item_off_beg, item_off_end);
-  }
-
-  out = warp_reduce(temp_storage).Sum(out);
-
-  if (head_flag && (idx < id_range_end)) { output[item_idx] = out; }
-}
-
-template <typename i_t, typename f_t, i_t BDIM, typename view_t>
-__global__ void lb_spmv_sub_warp_kernel(view_t view,
-                                        raft::device_span<f_t> input,
-                                        raft::device_span<f_t> output,
-                                        raft::device_span<i_t> warp_item_offsets,
-                                        raft::device_span<i_t> warp_item_id_offsets)
-{
-  i_t id_warp_beg, id_range_end, t_p_v;
-  get_sub_warp_bin<i_t>(
-    &id_warp_beg, &id_range_end, &t_p_v, warp_item_offsets, warp_item_id_offsets);
-
-  if (t_p_v == 1) {
-    spmv_sub_warp<i_t, f_t, BDIM, 1>(id_warp_beg, id_range_end, view, input, output);
-  } else if (t_p_v == 2) {
-    spmv_sub_warp<i_t, f_t, BDIM, 2>(id_warp_beg, id_range_end, view, input, output);
-  } else if (t_p_v == 4) {
-    spmv_sub_warp<i_t, f_t, BDIM, 4>(id_warp_beg, id_range_end, view, input, output);
-  } else if (t_p_v == 8) {
-    spmv_sub_warp<i_t, f_t, BDIM, 8>(id_warp_beg, id_range_end, view, input, output);
-  } else if (t_p_v == 16) {
-    spmv_sub_warp<i_t, f_t, BDIM, 16>(id_warp_beg, id_range_end, view, input, output);
-  }
-}
-#endif
-
-}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/presolve/third_party_presolve.cpp b/cpp/src/mip/presolve/third_party_presolve.cpp
index dc2d4b00e8..88de4912af 100644
--- a/cpp/src/mip/presolve/third_party_presolve.cpp
+++ b/cpp/src/mip/presolve/third_party_presolve.cpp
@@ -274,19 +274,19 @@ void check_presolve_status(const papilo::PresolveStatus& status)
 {
   switch (status) {
     case papilo::PresolveStatus::kUnchanged:
-      CUOPT_LOG_INFO("Presolve status:: did not result in any changes");
+      CUOPT_LOG_INFO("Presolve status: did not result in any changes");
       break;
     case papilo::PresolveStatus::kReduced:
-      CUOPT_LOG_INFO("Presolve status:: reduced the problem");
+      CUOPT_LOG_INFO("Presolve status: reduced the problem");
       break;
     case papilo::PresolveStatus::kUnbndOrInfeas:
-      CUOPT_LOG_INFO("Presolve status:: found an unbounded or infeasible problem");
+      CUOPT_LOG_INFO("Presolve status: found an unbounded or infeasible problem");
       break;
     case papilo::PresolveStatus::kInfeasible:
-      CUOPT_LOG_INFO("Presolve status:: found an infeasible problem");
+      CUOPT_LOG_INFO("Presolve status: found an infeasible problem");
       break;
     case papilo::PresolveStatus::kUnbounded:
-      CUOPT_LOG_INFO("Presolve status:: found an unbounded problem");
+      CUOPT_LOG_INFO("Presolve status: found an unbounded problem");
       break;
   }
 }
@@ -294,10 +294,10 @@ void check_presolve_status(const papilo::PresolveStatus& status)
 void check_postsolve_status(const papilo::PostsolveStatus& status)
 {
   switch (status) {
-    case papilo::PostsolveStatus::kOk: CUOPT_LOG_INFO("Post-solve status:: succeeded"); break;
+    case papilo::PostsolveStatus::kOk: CUOPT_LOG_INFO("Post-solve status: succeeded"); break;
     case papilo::PostsolveStatus::kFailed:
       CUOPT_LOG_INFO(
-        "Post-solve status:: Post solved solution violates constraints. This is most likely due to "
+        "Post-solve status: Post solved solution violates constraints. This is most likely due to "
         "different tolerances.");
       break;
   }
@@ -362,7 +362,7 @@ std::pair<optimization_problem_t<i_t, f_t>, bool> third_party_presolve_t<i_t, f_
 {
   papilo::Problem<f_t> papilo_problem = build_papilo_problem(op_problem);
 
-  CUOPT_LOG_INFO("Unpresolved problem:: %d constraints, %d variables, %d nonzeros",
+  CUOPT_LOG_INFO("Unpresolved problem: %d constraints, %d variables, %d nonzeros",
                  papilo_problem.getNRows(),
                  papilo_problem.getNCols(),
                  papilo_problem.getConstraintMatrix().getNnz());
@@ -382,11 +382,11 @@ std::pair<optimization_problem_t<i_t, f_t>, bool> third_party_presolve_t<i_t, f_
     return std::make_pair(optimization_problem_t<i_t, f_t>(op_problem.get_handle_ptr()), false);
   }
   post_solve_storage_ = result.postsolve;
-  CUOPT_LOG_INFO("Presolve removed:: %d constraints, %d variables, %d nonzeros",
+  CUOPT_LOG_INFO("Presolve removed: %d constraints, %d variables, %d nonzeros",
                  op_problem.get_n_constraints() - papilo_problem.getNRows(),
                  op_problem.get_n_variables() - papilo_problem.getNCols(),
                  op_problem.get_nnz() - papilo_problem.getConstraintMatrix().getNnz());
-  CUOPT_LOG_INFO("Presolved problem:: %d constraints, %d variables, %d nonzeros",
+  CUOPT_LOG_INFO("Presolved problem: %d constraints, %d variables, %d nonzeros",
                  papilo_problem.getNRows(),
                  papilo_problem.getNCols(),
                  papilo_problem.getConstraintMatrix().getNnz());

From d2baad3b9ee37d67e6ce92c47c352a2d16a23720 Mon Sep 17 00:00:00 2001
From: akifcorduk <akifcorduk@gmail.com>
Date: Thu, 2 Oct 2025 01:23:33 -0700
Subject: [PATCH 02/27] fix gpu count

---
 benchmarks/linear_programming/cuopt/run_mip.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/linear_programming/cuopt/run_mip.cpp b/benchmarks/linear_programming/cuopt/run_mip.cpp
index a4f52cb4ed..4a6121f8e2 100644
--- a/benchmarks/linear_programming/cuopt/run_mip.cpp
+++ b/benchmarks/linear_programming/cuopt/run_mip.cpp
@@ -80,10 +80,11 @@ void merge_result_files(const std::string& out_dir,
 void write_to_output_file(const std::string& out_dir,
                           const std::string& base_filename,
                           int gpu_id,
+                          int n_gpus,
                           int batch_id,
                           const std::string& data)
 {
-  int output_id        = batch_id * 8 + gpu_id;
+  int output_id        = batch_id * n_gpus + gpu_id;
   std::string filename = out_dir + "/result_" + std::to_string(output_id) + ".txt";
   std::ofstream outfile(filename, std::ios_base::app);
   if (outfile.is_open()) {
@@ -149,6 +150,7 @@ std::vector<std::vector<double>> read_solution_from_dir(const std::string file_p
 int run_single_file(std::string file_path,
                     int device,
                     int batch_id,
+                    int n_gpus,
                     std::string out_dir,
                     std::optional<std::string> initial_solution_dir,
                     bool heuristics_only,
@@ -243,7 +245,7 @@ int run_single_file(std::string file_path,
      << obj_val << "," << benchmark_info.objective_of_initial_population << ","
      << benchmark_info.last_improvement_of_best_feasible << ","
      << benchmark_info.last_improvement_after_recombination << "\n";
-  write_to_output_file(out_dir, base_filename, device, batch_id, ss.str());
+  write_to_output_file(out_dir, base_filename, device, n_gpus, batch_id, ss.str());
   CUOPT_LOG_INFO("Results written to the file %s", base_filename.c_str());
   return sol_found;
 }
@@ -251,6 +253,7 @@ int run_single_file(std::string file_path,
 void run_single_file_mp(std::string file_path,
                         int device,
                         int batch_id,
+                        int n_gpus,
                         std::string out_dir,
                         std::optional<std::string> input_file_dir,
                         bool heuristics_only,
@@ -265,6 +268,7 @@ void run_single_file_mp(std::string file_path,
   int sol_found = run_single_file(file_path,
                                   device,
                                   batch_id,
+                                  n_gpus,
                                   out_dir,
                                   input_file_dir,
                                   heuristics_only,
@@ -462,6 +466,7 @@ int main(int argc, char* argv[])
             run_single_file_mp(file_name,
                                gpu_id,
                                batch_num,
+                               n_gpus,
                                out_dir,
                                initial_solution_file,
                                heuristics_only,
@@ -501,6 +506,7 @@ int main(int argc, char* argv[])
     run_single_file(path,
                     0,
                     0,
+                    n_gpus,
                     out_dir,
                     initial_solution_file,
                     heuristics_only,

From 1c27d01d7f3999bf0fca16bcea5ed6f20ad8a8c4 Mon Sep 17 00:00:00 2001
From: nicolas <nguidotti@nvidia.com>
Date: Thu, 9 Oct 2025 16:13:34 +0200
Subject: [PATCH 03/27] fix starting variable bounds for diving. add
 backtracking parameter.

---
 cpp/src/dual_simplex/branch_and_bound.cpp | 34 ++++++++-------
 cpp/src/dual_simplex/branch_and_bound.hpp | 52 ++++++++++++++++++-----
 2 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index cf6fd69798..e306ad4969 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -576,10 +576,7 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
   // two vectors at each node and potentially cause memory issues
   node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
 
-  i_t node_iter                    = 0;
-  f_t lp_start_time                = tic();
-  std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
-
+  std::vector<f_t> leaf_edge_norms      = edge_norms_;  // = node.steepest_edge_norms;
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
   lp_settings.cut_off    = upper_bound + settings_.dual_tol;
@@ -594,6 +591,9 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
   dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
 
   if (feasible) {
+    i_t node_iter     = 0;
+    f_t lp_start_time = tic();
+
     lp_status = dual_phase2(2,
                             0,
                             lp_start_time,
@@ -610,10 +610,10 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
         leaf_problem, lp_start_time, lp_settings, leaf_solution, leaf_vstatus, leaf_edge_norms);
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
-  }
 
-  stats_.total_lp_solve_time += toc(lp_start_time);
-  stats_.total_lp_iters += node_iter;
+    stats_.total_lp_solve_time += toc(lp_start_time);
+    stats_.total_lp_iters += node_iter;
+  }
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
@@ -866,7 +866,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
         // would be better if we discard the node instead.
         if (get_heap_size() > settings_.num_bfs_threads) {
           mutex_dive_queue_.lock();
-          dive_queue_.push(node->detach_copy());
+          dive_queue_.emplace(node->detach_copy(), leaf_problem.lower, leaf_problem.upper);
           mutex_dive_queue_.unlock();
         }
 
@@ -943,23 +943,24 @@ void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t id,
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_problem,
-                                                 const csc_matrix_t<i_t, f_t>& Arow)
+                                                 const csc_matrix_t<i_t, f_t>& Arow,
+                                                 i_t backtracking)
 {
   logger_t log;
   log.log = false;
 
   while (status_ == mip_exploration_status_t::RUNNING &&
          (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    std::optional<mip_node_t<i_t, f_t>> start_node;
+    std::optional<diving_root_t<i_t, f_t>> start_node;
 
     mutex_dive_queue_.lock();
     if (dive_queue_.size() > 0) { start_node = dive_queue_.pop(); }
     mutex_dive_queue_.unlock();
 
     if (start_node.has_value()) {
-      if (get_upper_bound() < start_node->lower_bound) { continue; }
+      if (get_upper_bound() < start_node->node.lower_bound) { continue; }
 
-      search_tree_t<i_t, f_t> subtree(std::move(start_node.value()));
+      search_tree_t<i_t, f_t> subtree(std::move(start_node->node));
       std::deque<mip_node_t<i_t, f_t>*> stack;
       stack.push_front(&subtree.root);
 
@@ -985,16 +986,19 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_pr
           auto [first, second] = child_selection(node_ptr);
           stack.push_front(second);
           stack.push_front(first);
+        }
 
+        if (stack.size() > 1) {
           // If the diving thread is consuming the nodes faster than the
           // best first search, then we split the current subtree at the
           // lowest possible point and move to the queue, so it can
           // be picked by another thread.
-          if (dive_queue_.size() < min_diving_queue_size_) {
+          if (dive_queue_.size() < min_diving_queue_size_ ||
+              (stack.front()->depth - stack.back()->depth) > backtracking) {
             mutex_dive_queue_.lock();
             mip_node_t<i_t, f_t>* new_node = stack.back();
             stack.pop_back();
-            dive_queue_.push(new_node->detach_copy());
+            dive_queue_.emplace(new_node->detach_copy(), leaf_problem.lower, leaf_problem.upper);
             mutex_dive_queue_.unlock();
           }
         }
@@ -1192,7 +1196,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
 
         for (i_t i = 0; i < settings_.num_diving_threads; i++) {
 #pragma omp task
-          diving_thread(leaf_problem, Arow);
+          diving_thread(leaf_problem, Arow, 10);
         }
       }
     }
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 7b80f88fa9..ae70172e36 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -55,36 +55,64 @@ enum class mip_exploration_status_t {
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
+template <typename i_t, typename f_t>
+struct diving_root_t {  // Start of a dive: a node plus the LP bounds stored alongside it.
+  mip_node_t<i_t, f_t> node;
+  std::vector<f_t> lp_lower;
+  std::vector<f_t> lp_upper;
+  // Moves `node` in and copies both bound vectors; initializers follow declaration order.
+  diving_root_t(mip_node_t<i_t, f_t>&& node,
+                const std::vector<f_t>& lower,
+                const std::vector<f_t>& upper)
+    : node(std::move(node)), lp_lower(lower), lp_upper(upper)
+  {
+  }
+  // Compares by node lower bound so that std::greater<> can serve as the heap comparator.
+  friend bool operator>(const diving_root_t<i_t, f_t>& a, const diving_root_t<i_t, f_t>& b)
+  {
+    return a.node.lower_bound > b.node.lower_bound;
+  }
+};
+
 // A min-heap for storing the starting nodes for the dives.
-// This has a maximum size of 8192, such that the container
+// This has a maximum size of 256, such that the container
 // will discard the least promising node if the queue is full.
 template <typename i_t, typename f_t>
 class dive_queue_t {
  private:
-  std::vector<mip_node_t<i_t, f_t>> buffer;
-  static constexpr i_t max_size_ = 2048;
+  std::vector<diving_root_t<i_t, f_t>> buffer;
+  static constexpr i_t max_size_ = 256;
 
  public:
   dive_queue_t() { buffer.reserve(max_size_); }
 
-  void push(mip_node_t<i_t, f_t>&& node)
+  void push(diving_root_t<i_t, f_t>&& node)
   {
     buffer.push_back(std::move(node));
-    std::push_heap(buffer.begin(), buffer.end(), node_compare_t<i_t, f_t>());
+    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
+    if (buffer.size() > max_size()) { buffer.pop_back(); }
+  }
+
+  void emplace(mip_node_t<i_t, f_t>&& node,
+               const std::vector<f_t>& lower,
+               const std::vector<f_t>& upper)
+  {
+    buffer.emplace_back(std::move(node), lower, upper);
+    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
     if (buffer.size() > max_size()) { buffer.pop_back(); }
   }
 
-  mip_node_t<i_t, f_t> pop()
+  diving_root_t<i_t, f_t> pop()
   {
-    std::pop_heap(buffer.begin(), buffer.end(), node_compare_t<i_t, f_t>());
-    mip_node_t<i_t, f_t> node = std::move(buffer.back());
+    std::pop_heap(buffer.begin(), buffer.end(), std::greater<>());
+    diving_root_t<i_t, f_t> node = std::move(buffer.back());
     buffer.pop_back();
     return node;
   }
 
   i_t size() const { return buffer.size(); }
   constexpr i_t max_size() const { return max_size_; }
-  const mip_node_t<i_t, f_t>& top() const { return buffer.front(); }
+  const diving_root_t<i_t, f_t>& top() const { return buffer.front(); }
   void clear() { buffer.clear(); }
 };
 
@@ -188,7 +216,7 @@ class branch_and_bound_t {
   // Set the final solution.
   mip_status_t set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
 
-  // Update the incumbent solution with the new feasible solution.
+  // Update the incumbent solution with the new feasible solution
   // found during branch and bound.
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
@@ -222,7 +250,9 @@ class branch_and_bound_t {
 
   // Each diving thread pops the first node from the dive queue and then performs
   // a deep dive into the subtree determined by the node.
-  void diving_thread(lp_problem_t<i_t, f_t>& leaf_problem, const csc_matrix_t<i_t, f_t>& Arow);
+  void diving_thread(lp_problem_t<i_t, f_t>& leaf_problem,
+                     const csc_matrix_t<i_t, f_t>& Arow,
+                     i_t backtracking);
 
   // Solve the LP relaxation of a leaf node and update the tree.
   node_status_t solve_node(search_tree_t<i_t, f_t>& search_tree,

From 86a865516ef1a83a8350dab9a48de108de58310b Mon Sep 17 00:00:00 2001
From: nicolas <nguidotti@nvidia.com>
Date: Thu, 9 Oct 2025 17:12:11 +0200
Subject: [PATCH 04/27] fix log during the ramp up phase

---
 cpp/src/dual_simplex/branch_and_bound.cpp | 30 +++++++++++------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index e306ad4969..c095c92715 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -695,17 +695,14 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
                                                        i_t initial_heap_size)
 {
   if (status_ != mip_exploration_status_t::RUNNING) { return; }
-  if (omp_get_thread_num() == 0) { repair_heuristic_solutions(); }
+  repair_heuristic_solutions();
 
   f_t lower_bound      = node->lower_bound;
   f_t upper_bound      = get_upper_bound();
   f_t rel_gap          = user_relative_gap(original_lp_, upper_bound, lower_bound);
   f_t abs_gap          = upper_bound - lower_bound;
-  i_t nodes_explored   = 0;
-  i_t nodes_unexplored = 0;
-
-  nodes_explored   = (stats_.nodes_explored++);
-  nodes_unexplored = (stats_.nodes_unexplored--);
+  i_t nodes_explored   = (++stats_.nodes_explored);
+  i_t nodes_unexplored = (--stats_.nodes_unexplored);
   stats_.nodes_since_last_log++;
 
   if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
@@ -716,12 +713,17 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
 
   f_t now = toc(stats_.start_time);
 
-  if (omp_get_thread_num() == 0) {
-    f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
+  f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
+
+  if (((stats_.nodes_since_last_log >= 10 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+       (time_since_last_log >= 1)) ||
+      (time_since_last_log > 30) || now > settings_.time_limit) {
+    // Check if no new node was explored until now. If this is the case,
+    // only the last thread should report the progress
+    if (stats_.nodes_explored.load() == nodes_explored) {
+      stats_.nodes_since_last_log = 0;
+      stats_.last_log             = tic();
 
-    if (((stats_.nodes_since_last_log >= 10 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-         (time_since_last_log >= 1)) ||
-        (time_since_last_log > 30) || now > settings_.time_limit) {
       f_t obj              = compute_user_objective(original_lp_, upper_bound);
       f_t user_lower       = compute_user_objective(original_lp_, root_objective_);
       std::string gap_user = user_mip_gap<f_t>(obj, user_lower);
@@ -735,8 +737,6 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
                            nodes_explored > 0 ? stats_.total_lp_iters / nodes_explored : 0,
                            gap_user.c_str(),
                            now);
-
-      stats_.nodes_since_last_log = 0;
     }
   }
 
@@ -802,8 +802,8 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
     // - The lower bound of the parent is lower or equal to its children
     assert(id < local_lower_bounds_.size());
     local_lower_bounds_[id] = lower_bound;
-    i_t nodes_explored      = stats_.nodes_explored++;
-    i_t nodes_unexplored    = stats_.nodes_unexplored--;
+    i_t nodes_explored      = (++stats_.nodes_explored);
+    i_t nodes_unexplored    = (--stats_.nodes_unexplored);
     stats_.nodes_since_last_log++;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {

From 4044cc85ba589322532689a407a00a9e4714dabf Mon Sep 17 00:00:00 2001
From: nicolas <nguidotti@nvidia.com>
Date: Thu, 9 Oct 2025 17:24:03 +0200
Subject: [PATCH 05/27] added comment

---
 cpp/src/dual_simplex/branch_and_bound.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index c095c92715..efb4e5b4c7 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -695,6 +695,10 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
                                                        i_t initial_heap_size)
 {
   if (status_ != mip_exploration_status_t::RUNNING) { return; }
+
+  // Note that we do not know which thread will execute the
+  // `exploration_ramp_up` task, so we allow any thread
+  // to repair the heuristic solution.
   repair_heuristic_solutions();
 
   f_t lower_bound      = node->lower_bound;
@@ -711,8 +715,7 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
     return;
   }
 
-  f_t now = toc(stats_.start_time);
-
+  f_t now                 = toc(stats_.start_time);
   f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
 
   if (((stats_.nodes_since_last_log >= 10 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
@@ -784,7 +787,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
   stack.push_front(start_node);
 
   while (stack.size() > 0 && status_ == mip_exploration_status_t::RUNNING) {
-    if (omp_get_thread_num() == 0) { repair_heuristic_solutions(); }
+    if (id == 0) { repair_heuristic_solutions(); }
 
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();

From 1651db35e3f2f2618b93053faaaa0151bb532c94 Mon Sep 17 00:00:00 2001
From: nicolas <nguidotti@nvidia.com>
Date: Thu, 9 Oct 2025 20:53:27 +0200
Subject: [PATCH 06/27] removed backtracking parameter due to a performance
 regression

---
 cpp/src/dual_simplex/branch_and_bound.cpp | 8 +++-----
 cpp/src/dual_simplex/branch_and_bound.hpp | 4 +---
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index efb4e5b4c7..0df4c082bc 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -946,8 +946,7 @@ void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t id,
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_problem,
-                                                 const csc_matrix_t<i_t, f_t>& Arow,
-                                                 i_t backtracking)
+                                                 const csc_matrix_t<i_t, f_t>& Arow)
 {
   logger_t log;
   log.log = false;
@@ -996,8 +995,7 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_pr
           // best first search, then we split the current subtree at the
           // lowest possible point and move to the queue, so it can
           // be picked by another thread.
-          if (dive_queue_.size() < min_diving_queue_size_ ||
-              (stack.front()->depth - stack.back()->depth) > backtracking) {
+          if (dive_queue_.size() < min_diving_queue_size_) {
             mutex_dive_queue_.lock();
             mip_node_t<i_t, f_t>* new_node = stack.back();
             stack.pop_back();
@@ -1199,7 +1197,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
 
         for (i_t i = 0; i < settings_.num_diving_threads; i++) {
 #pragma omp task
-          diving_thread(leaf_problem, Arow, 10);
+          diving_thread(leaf_problem, Arow);
         }
       }
     }
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index ae70172e36..5453e8b424 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -250,9 +250,7 @@ class branch_and_bound_t {
 
   // Each diving thread pops the first node from the dive queue and then performs
   // a deep dive into the subtree determined by the node.
-  void diving_thread(lp_problem_t<i_t, f_t>& leaf_problem,
-                     const csc_matrix_t<i_t, f_t>& Arow,
-                     i_t backtracking);
+  void diving_thread(lp_problem_t<i_t, f_t>& leaf_problem, const csc_matrix_t<i_t, f_t>& Arow);
 
   // Solve the LP relaxation of a leaf node and update the tree.
   node_status_t solve_node(search_tree_t<i_t, f_t>& search_tree,

From 831a2c71d7151d9db67d19a19091a5cebe1aa9cf Mon Sep 17 00:00:00 2001
From: Ramakrishnap <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Thu, 9 Oct 2025 15:01:18 -0500
Subject: [PATCH 07/27] Remove raft-dask dependency (#475)

Remove the unused raft-dask dependency.

Authors:
  - Ramakrishnap (https://github.com/rgsl888prabhu)

Approvers:
  - Ishika Roy (https://github.com/Iroy30)
  - Trevor McKay (https://github.com/tmckayus)

URL: https://github.com/NVIDIA/cuopt/pull/475
---
 .../all_cuda-129_arch-aarch64.yaml            |  1 -
 .../all_cuda-129_arch-x86_64.yaml             |  1 -
 .../all_cuda-130_arch-aarch64.yaml            |  1 -
 .../all_cuda-130_arch-x86_64.yaml             |  1 -
 dependencies.yaml                             | 28 -------------------
 python/cuopt/cuopt/routing/vehicle_routing.py | 16 ++---------
 .../cuopt/routing/vehicle_routing_wrapper.pyx | 13 +++------
 python/cuopt/pyproject.toml                   |  2 --
 python/cuopt_self_hosted/pyproject.toml       |  1 -
 python/cuopt_server/pyproject.toml            |  1 -
 10 files changed, 7 insertions(+), 58 deletions(-)

diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index be834f5b5e..b8132714d9 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -59,7 +59,6 @@ dependencies:
 - pytest-cov
 - pytest<8
 - python>=3.10,<3.14
-- raft-dask==25.10.*,>=0.0.0a0
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.10.*,>=0.0.0a0
 - rapids-logger==0.1.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index d1c3968093..7a5b26bb82 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -59,7 +59,6 @@ dependencies:
 - pytest-cov
 - pytest<8
 - python>=3.10,<3.14
-- raft-dask==25.10.*,>=0.0.0a0
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.10.*,>=0.0.0a0
 - rapids-logger==0.1.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml
index a7a532bd3d..7713566ec9 100644
--- a/conda/environments/all_cuda-130_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-130_arch-aarch64.yaml
@@ -59,7 +59,6 @@ dependencies:
 - pytest-cov
 - pytest<8
 - python>=3.10,<3.14
-- raft-dask==25.10.*,>=0.0.0a0
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.10.*,>=0.0.0a0
 - rapids-logger==0.1.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml
index 838ac5ac05..bcf7d3a819 100644
--- a/conda/environments/all_cuda-130_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-130_arch-x86_64.yaml
@@ -59,7 +59,6 @@ dependencies:
 - pytest-cov
 - pytest<8
 - python>=3.10,<3.14
-- raft-dask==25.10.*,>=0.0.0a0
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.10.*,>=0.0.0a0
 - rapids-logger==0.1.*,>=0.0.0a0
diff --git a/dependencies.yaml b/dependencies.yaml
index c49f5a19a5..de50ea11df 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -46,7 +46,6 @@ files:
       - depends_on_cudf
       - depends_on_cuvs
       - depends_on_pylibraft
-      - depends_on_raft_dask
       - depends_on_rapids_logger
       - rapids_build_backend
   test_cpp:
@@ -195,7 +194,6 @@ files:
       - depends_on_rmm
       - depends_on_cudf
       - depends_on_cuvs
-      - depends_on_raft_dask
       - depends_on_pylibraft
       - depends_on_rapids_logger
   py_test_cuopt:
@@ -660,32 +658,6 @@ dependencies:
             packages:
               - *cuvs_unsuffixed
 
-  depends_on_raft_dask:
-    common:
-      - output_types: conda
-        packages:
-          - &raft_dask_unsuffixed raft-dask==25.10.*,>=0.0.0a0
-      - output_types: requirements
-        packages:
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - raft-dask-cu12==25.10.*,>=0.0.0a0
-          - matrix:
-              cuda: "13.*"
-              cuda_suffixed: "true"
-            packages:
-              - raft-dask-cu13==25.10.*,>=0.0.0a0
-          - matrix:
-            packages:
-              - *raft_dask_unsuffixed
-
   depends_on_pylibraft:
     common:
       - output_types: conda
diff --git a/python/cuopt/cuopt/routing/vehicle_routing.py b/python/cuopt/cuopt/routing/vehicle_routing.py
index d4a96748c6..623f15477c 100644
--- a/python/cuopt/cuopt/routing/vehicle_routing.py
+++ b/python/cuopt/cuopt/routing/vehicle_routing.py
@@ -34,7 +34,7 @@
 class DataModel(vehicle_routing_wrapper.DataModel):
     """
 
-    DataModel(n_locations, n_fleet, n_orders: int = -1, session_id=None)
+    DataModel(n_locations, n_fleet, n_orders: int = -1)
 
     Initialize a Data Model.
 
@@ -46,8 +46,6 @@ class DataModel(vehicle_routing_wrapper.DataModel):
         number of vehicles/technician in the fleet.
     n_orders : Integer
         number of orders.
-    session_id : Integer
-        This is used with dask for Multi GPU scenario.
 
     Note:
       - A cost matrix must be set before passing
@@ -67,16 +65,8 @@ class DataModel(vehicle_routing_wrapper.DataModel):
     """
 
     @catch_cuopt_exception
-    def __init__(
-        self,
-        n_locations,
-        n_fleet,
-        n_orders: int = -1,
-        session_id=None,
-    ):
-        super().__init__(
-            n_locations, n_fleet, n_orders=n_orders, session_id=session_id
-        )
+    def __init__(self, n_locations, n_fleet, n_orders: int = -1):
+        super().__init__(n_locations, n_fleet, n_orders=n_orders)
 
     @catch_cuopt_exception
     def add_cost_matrix(self, cost_mat, vehicle_type=0):
diff --git a/python/cuopt/cuopt/routing/vehicle_routing_wrapper.pyx b/python/cuopt/cuopt/routing/vehicle_routing_wrapper.pyx
index d2f3b091b3..afde213044 100644
--- a/python/cuopt/cuopt/routing/vehicle_routing_wrapper.pyx
+++ b/python/cuopt/cuopt/routing/vehicle_routing_wrapper.pyx
@@ -34,8 +34,6 @@ from datetime import date, datetime
 
 from dateutil.relativedelta import relativedelta
 
-from raft_dask.common import Comms, local_handle
-
 from cuopt.routing.assignment import Assignment
 from cuopt.utilities import type_cast
 
@@ -161,14 +159,11 @@ cdef class DataModel:
     cdef unique_ptr[data_model_view_t[int, float]] c_data_model_view
     cdef unique_ptr[handle_t] handle_ptr
 
-    def __init__(self, int num_locations, int fleet_size, int n_orders=-1,
-                 session_id=None):
+    def __init__(self, int num_locations, int fleet_size, int n_orders=-1):
         cdef handle_t* handle_ = <handle_t*><size_t>NULL
-        if session_id is None:
-            self.handle_ptr.reset(new handle_t())
-            handle_ = self.handle_ptr.get()
-        else:
-            handle_ = <handle_t*><size_t>local_handle(session_id).getHandle()
+
+        self.handle_ptr.reset(new handle_t())
+        handle_ = self.handle_ptr.get()
 
         self.c_data_model_view.reset(new data_model_view_t[int, float](
             handle_,
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index e447d94e5d..95cd468e26 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -42,7 +42,6 @@ dependencies = [
     "numpy>=1.23.5,<3.0a0",
     "pandas>=2.0",
     "pylibraft==25.10.*,>=0.0.0a0",
-    "raft-dask==25.10.*,>=0.0.0a0",
     "rapids-dask-dependency==25.10.*,>=0.0.0a0",
     "rapids-logger==0.1.*,>=0.0.0a0",
     "rmm==25.10.*,>=0.0.0a0",
@@ -91,7 +90,6 @@ known_rapids = [
     "cudf",
     "pylibraft",
     "rmm",
-    "raft_dask",
 ]
 known_first_party = [
     "cuopt",
diff --git a/python/cuopt_self_hosted/pyproject.toml b/python/cuopt_self_hosted/pyproject.toml
index 1a0b9a9db5..d035b105f6 100644
--- a/python/cuopt_self_hosted/pyproject.toml
+++ b/python/cuopt_self_hosted/pyproject.toml
@@ -86,7 +86,6 @@ known_rapids = [
     "cudf",
     "pylibraft",
     "rmm",
-    "raft_dask",
     "cuopt",
 ]
 known_first_party = [
diff --git a/python/cuopt_server/pyproject.toml b/python/cuopt_server/pyproject.toml
index 40845f4a6a..52e5e9290d 100644
--- a/python/cuopt_server/pyproject.toml
+++ b/python/cuopt_server/pyproject.toml
@@ -96,7 +96,6 @@ known_rapids = [
     "cudf",
     "pylibraft",
     "rmm",
-    "raft_dask",
     "cuopt",
 ]
 known_first_party = [

From 717e9a474430f7c7c0086b361289283803e37415 Mon Sep 17 00:00:00 2001
From: nicolas <nguidotti@nvidia.com>
Date: Thu, 9 Oct 2025 22:07:20 +0200
Subject: [PATCH 08/27] fix missing variable bounds

---
 cpp/src/dual_simplex/branch_and_bound.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 0df4c082bc..78acd07426 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -566,10 +566,6 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
   lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
   assert(leaf_vstatus.size() == leaf_problem.num_cols);
 
-  // Set the correct bounds for the leaf problem
-  leaf_problem.lower = original_lp_.lower;
-  leaf_problem.upper = original_lp_.upper;
-
   std::vector<bool> bounds_changed(leaf_problem.num_cols, false);
   // Technically, we can get the already strengthened bounds from the node/parent instead of
   // getting it from the original problem and re-strengthening. But this requires storing
@@ -747,6 +743,11 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
     status_ = mip_exploration_status_t::TIME_LIMIT;
     return;
   }
+
+  // Set the correct bounds for the leaf problem
+  leaf_problem.lower = original_lp_.lower;
+  leaf_problem.upper = original_lp_.upper;
+
   node_status_t node_status =
     solve_node(*search_tree, node, leaf_problem, Arow, upper_bound, settings_.log, 'B');
 
@@ -845,6 +846,10 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
       return;
     }
 
+    // Set the correct bounds for the leaf problem
+    leaf_problem.lower = original_lp_.lower;
+    leaf_problem.upper = original_lp_.upper;
+
     node_status_t node_status =
       solve_node(search_tree, node_ptr, leaf_problem, Arow, upper_bound, settings_.log, 'B');
 
@@ -978,6 +983,10 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(lp_problem_t<i_t, f_t>& leaf_pr
 
         if (toc(stats_.start_time) > settings_.time_limit) { return; }
 
+        // Set the correct bounds for the leaf problem
+        leaf_problem.lower = start_node->lp_lower;
+        leaf_problem.upper = start_node->lp_upper;
+
         node_status_t node_status =
           solve_node(subtree, node_ptr, leaf_problem, Arow, upper_bound, log, 'D');
 

From 182e8da16e1b9909ba0fbc50c6672051ed9ff79a Mon Sep 17 00:00:00 2001
From: Chris Maes <cmaes@nvidia.com>
Date: Thu, 9 Oct 2025 19:35:16 -0700
Subject: [PATCH 09/27] Don't access free variables in the original problem on
 the folded problem (#477)

Fix an issue where the free variable info from the original problem was being used on the folded problem. This was leading to undefined memory access.

Thanks to Burcin for reporting the bug and Hugo for determining the issue.

Authors:
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Hugo Linsenmaier (https://github.com/hlinsen)

URL: https://github.com/NVIDIA/cuopt/pull/477
---
 cpp/src/dual_simplex/folding.cpp  | 10 ++++++----
 cpp/src/dual_simplex/presolve.cpp | 16 ++++++++++------
 cpp/src/dual_simplex/presolve.hpp |  2 ++
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/cpp/src/dual_simplex/folding.cpp b/cpp/src/dual_simplex/folding.cpp
index 913d86b0aa..8628f69eb7 100644
--- a/cpp/src/dual_simplex/folding.cpp
+++ b/cpp/src/dual_simplex/folding.cpp
@@ -1536,10 +1536,12 @@ void folding(lp_problem_t<i_t, f_t>& problem,
   problem.lower     = std::vector<f_t>(reduced_cols, 0.0);
   problem.upper     = std::vector<f_t>(reduced_cols, inf);
 
-  presolve_info.folding_info.c_tilde          = c_tilde;
-  presolve_info.folding_info.A_tilde          = A_tilde;
-  presolve_info.folding_info.is_folded        = true;
-  presolve_info.folding_info.num_upper_bounds = nz_ub;
+  presolve_info.folding_info.c_tilde                      = c_tilde;
+  presolve_info.folding_info.A_tilde                      = A_tilde;
+  presolve_info.folding_info.is_folded                    = true;
+  presolve_info.folding_info.num_upper_bounds             = nz_ub;
+  presolve_info.folding_info.previous_free_variable_pairs = presolve_info.free_variable_pairs;
+  presolve_info.free_variable_pairs.clear();
 
   settings.log.printf("Folding: time %.2f seconds\n", toc(start_time));
 
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index e2790858e3..8d80337c74 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -1419,9 +1419,10 @@ void uncrush_solution(const presolve_info_t<i_t, f_t>& presolve_info,
                       std::vector<f_t>& uncrushed_y,
                       std::vector<f_t>& uncrushed_z)
 {
-  std::vector<f_t> input_x = crushed_x;
-  std::vector<f_t> input_y = crushed_y;
-  std::vector<f_t> input_z = crushed_z;
+  std::vector<f_t> input_x             = crushed_x;
+  std::vector<f_t> input_y             = crushed_y;
+  std::vector<f_t> input_z             = crushed_z;
+  std::vector<i_t> free_variable_pairs = presolve_info.free_variable_pairs;
   if (presolve_info.folding_info.is_folded) {
     // We solved a foled problem in the form
     // minimize c_prime^T x_prime
@@ -1474,15 +1475,18 @@ void uncrush_solution(const presolve_info_t<i_t, f_t>& presolve_info,
     input_y.resize(previous_rows - presolve_info.folding_info.num_upper_bounds);
     input_z = ztilde;
     input_z.resize(previous_cols - presolve_info.folding_info.num_upper_bounds);
+
+    // If the original problem had free variables we need to reinstate them
+    free_variable_pairs = presolve_info.folding_info.previous_free_variable_pairs;
   }
 
-  const i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2;
+  const i_t num_free_variables = free_variable_pairs.size() / 2;
   if (num_free_variables > 0) {
     settings.log.printf("Post-solve: Handling free variables %d\n", num_free_variables);
     // We added free variables so we need to map the crushed solution back to the original variables
     for (i_t k = 0; k < 2 * num_free_variables; k += 2) {
-      const i_t u = presolve_info.free_variable_pairs[k];
-      const i_t v = presolve_info.free_variable_pairs[k + 1];
+      const i_t u = free_variable_pairs[k];
+      const i_t v = free_variable_pairs[k + 1];
       input_x[u] -= input_x[v];
     }
     input_z.resize(input_z.size() - num_free_variables);
diff --git a/cpp/src/dual_simplex/presolve.hpp b/cpp/src/dual_simplex/presolve.hpp
index fa8a8db58b..bf0aab8997 100644
--- a/cpp/src/dual_simplex/presolve.hpp
+++ b/cpp/src/dual_simplex/presolve.hpp
@@ -60,6 +60,7 @@ struct folding_info_t {
       c_tilde(0),
       A_tilde(0, 0, 0),
       num_upper_bounds(0),
+      previous_free_variable_pairs({}),
       is_folded(false)
   {
   }
@@ -69,6 +70,7 @@ struct folding_info_t {
   std::vector<f_t> c_tilde;
   csc_matrix_t<i_t, f_t> A_tilde;
   i_t num_upper_bounds;
+  std::vector<i_t> previous_free_variable_pairs;
   bool is_folded;
 };
 

From b6163c9b27511f77a141f69b99d74c7abc46307f Mon Sep 17 00:00:00 2001
From: Ramakrishnap <42624703+rgsl888prabhu@users.noreply.github.com>
Date: Thu, 9 Oct 2025 22:33:25 -0500
Subject: [PATCH 10/27] Add docs and example tests to barrier (#449)

This PR is a follow-up for PR #272 to add documentation and additional tests

Authors:
  - Ramakrishnap (https://github.com/rgsl888prabhu)
  - Hugo Linsenmaier (https://github.com/hlinsen)
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Ishika Roy (https://github.com/Iroy30)
  - Trevor McKay (https://github.com/tmckayus)
  - Chris Maes (https://github.com/chris-maes)

URL: https://github.com/NVIDIA/cuopt/pull/449
---
 ci/test_wheel_cuopt.sh                        |   5 +-
 .../source/cuopt-c/lp-milp/lp-milp-c-api.rst  |  27 ++
 .../routing/routing-example.ipynb             | 140 ++++++++---
 docs/cuopt/source/introduction.rst            |  11 +-
 docs/cuopt/source/lp-features.rst             |   9 +-
 docs/cuopt/source/lp-milp-settings.rst        | 152 ++++++++---
 docs/cuopt/source/milp-features.rst           |   1 +
 docs/cuopt/source/system-requirements.rst     |   1 +
 .../thirdparty_modeling_languages/index.rst   |   7 +
 .../data_model/data_model_wrapper.pyx         |   8 +-
 .../linear_programming/test_python_API.py     | 235 +++++++++++++++++-
 .../cuopt_server/tests/test_lp.py             |  74 ++++++
 .../linear_programming/data_definition.py     |   2 +
 .../utils/linear_programming/solver.py        |   8 +-
 python/libcuopt/CMakeLists.txt                |   1 +
 15 files changed, 599 insertions(+), 82 deletions(-)

diff --git a/ci/test_wheel_cuopt.sh b/ci/test_wheel_cuopt.sh
index 1b37ed020f..61dabd67fd 100755
--- a/ci/test_wheel_cuopt.sh
+++ b/ci/test_wheel_cuopt.sh
@@ -66,11 +66,14 @@ cd -
 RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 export RAPIDS_DATASET_ROOT_DIR
 
-# Please enable this once ISSUE https://github.com/NVIDIA/cuopt/issues/94 is fixed
 # Run CLI tests
 timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
 
 # Run Python tests
+
+# Due to a race condition, UCX might not be able to clean up properly in certain cases, so we set the number of threads to 1
+export OMP_NUM_THREADS=1
+
 RAPIDS_DATASET_ROOT_DIR=./datasets timeout 30m python -m pytest --verbose --capture=no ./python/cuopt/cuopt/tests/
 
 # run jump tests and cvxpy integration tests for only nightly builds
diff --git a/docs/cuopt/source/cuopt-c/lp-milp/lp-milp-c-api.rst b/docs/cuopt/source/cuopt-c/lp-milp/lp-milp-c-api.rst
index 6d942bde63..e321e319db 100644
--- a/docs/cuopt/source/cuopt-c/lp-milp/lp-milp-c-api.rst
+++ b/docs/cuopt/source/cuopt-c/lp-milp/lp-milp-c-api.rst
@@ -16,6 +16,13 @@ You may use the following functions to determine the number of bytes used to rep
 .. doxygenfunction:: cuOptGetIntSize
 .. doxygenfunction:: cuOptGetFloatSize
 
+Version Information
+-------------------
+
+You may use the following function to get the version of the cuOpt library
+
+.. doxygenfunction:: cuOptGetVersion
+
 Status Codes
 ------------
 
@@ -25,6 +32,9 @@ Every function in the C API returns a status code that indicates success or fail
 .. doxygendefine:: CUOPT_INVALID_ARGUMENT
 .. doxygendefine:: CUOPT_MPS_FILE_ERROR
 .. doxygendefine:: CUOPT_MPS_PARSE_ERROR
+.. doxygendefine:: CUOPT_VALIDATION_ERROR
+.. doxygendefine:: CUOPT_OUT_OF_MEMORY
+.. doxygendefine:: CUOPT_RUNTIME_ERROR
 
 Optimization Problem
 --------------------
@@ -156,9 +166,22 @@ These constants are used as parameter names in the :c:func:`cuOptSetParameter`,
 .. doxygendefine:: CUOPT_MIP_ABSOLUTE_TOLERANCE
 .. doxygendefine:: CUOPT_MIP_RELATIVE_TOLERANCE
 .. doxygendefine:: CUOPT_MIP_INTEGRALITY_TOLERANCE
+.. doxygendefine:: CUOPT_MIP_ABSOLUTE_GAP
+.. doxygendefine:: CUOPT_MIP_RELATIVE_GAP
 .. doxygendefine:: CUOPT_MIP_SCALING
 .. doxygendefine:: CUOPT_MIP_HEURISTICS_ONLY
+.. doxygendefine:: CUOPT_MIP_PRESOLVE
 .. doxygendefine:: CUOPT_PRESOLVE
+.. doxygendefine:: CUOPT_LOG_TO_CONSOLE
+.. doxygendefine:: CUOPT_CROSSOVER
+.. doxygendefine:: CUOPT_FOLDING
+.. doxygendefine:: CUOPT_AUGMENTED
+.. doxygendefine:: CUOPT_DUALIZE
+.. doxygendefine:: CUOPT_ORDERING
+.. doxygendefine:: CUOPT_ELIMINATE_DENSE_COLUMNS
+.. doxygendefine:: CUOPT_CUDSS_DETERMINISTIC
+.. doxygendefine:: CUOPT_BARRIER_DUAL_INITIAL_POINT
+.. doxygendefine:: CUOPT_DUAL_POSTSOLVE
 .. doxygendefine:: CUOPT_SOLUTION_FILE
 .. doxygendefine:: CUOPT_NUM_CPU_THREADS
 .. doxygendefine:: CUOPT_USER_PROBLEM_FILE
@@ -186,6 +209,7 @@ These constants are used to configure `CUOPT_METHOD` via :c:func:`cuOptSetIntege
 .. doxygendefine:: CUOPT_METHOD_CONCURRENT
 .. doxygendefine:: CUOPT_METHOD_PDLP
 .. doxygendefine:: CUOPT_METHOD_DUAL_SIMPLEX
+.. doxygendefine:: CUOPT_METHOD_BARRIER
 
 
 Solving an LP or MIP
@@ -206,12 +230,15 @@ The output of a solve is a `cuOptSolution` object.
 The following functions may be used to access information from a `cuOptSolution`
 
 .. doxygenfunction:: cuOptGetTerminationStatus
+.. doxygenfunction:: cuOptGetErrorStatus
+.. doxygenfunction:: cuOptGetErrorString
 .. doxygenfunction:: cuOptGetPrimalSolution
 .. doxygenfunction:: cuOptGetObjectiveValue
 .. doxygenfunction:: cuOptGetSolveTime
 .. doxygenfunction:: cuOptGetMIPGap
 .. doxygenfunction:: cuOptGetSolutionBound
 .. doxygenfunction:: cuOptGetDualSolution
+.. doxygenfunction:: cuOptGetDualObjectiveValue
 .. doxygenfunction:: cuOptGetReducedCosts
 
 When you are finished with a `cuOptSolution` object you should destory it with
diff --git a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
index 2cf903c46d..b376ac8e47 100644
--- a/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
+++ b/docs/cuopt/source/cuopt-python/routing/routing-example.ipynb
@@ -12,10 +12,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 1,
    "id": "2cb694f7",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/luffy/.local/lib/python3.12/site-packages/cudf/utils/_ptxcompiler.py:64: UserWarning: Error getting driver and runtime versions:\n",
+      "\n",
+      "stdout:\n",
+      "\n",
+      "\n",
+      "\n",
+      "stderr:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"<string>\", line 4, in <module>\n",
+      "  File \"/home/luffy/miniforge3/envs/cuopt/lib/python3.12/site-packages/numba_cuda/numba/cuda/cudadrv/driver.py\", line 393, in safe_cuda_api_call\n",
+      "    return self._check_cuda_python_error(fname, libfn(*args))\n",
+      "                                                ^^^^^^^^^^^^\n",
+      "TypeError: cuDriverGetVersion() takes no arguments (1 given)\n",
+      "\n",
+      "\n",
+      "Not patching Numba\n",
+      "  warnings.warn(msg, UserWarning)\n",
+      "/home/luffy/.local/lib/python3.12/site-packages/cupy/_environment.py:596: UserWarning: \n",
+      "--------------------------------------------------------------------------------\n",
+      "\n",
+      "  CuPy may not function correctly because multiple CuPy packages are installed\n",
+      "  in your environment:\n",
+      "\n",
+      "    cupy, cupy-cuda12x\n",
+      "\n",
+      "  Follow these steps to resolve this issue:\n",
+      "\n",
+      "    1. For all packages listed above, run the following command to remove all\n",
+      "       existing CuPy installations:\n",
+      "\n",
+      "         $ pip uninstall <package_name>\n",
+      "\n",
+      "      If you previously installed CuPy via conda, also run the following:\n",
+      "\n",
+      "         $ conda uninstall cupy\n",
+      "\n",
+      "    2. Install the appropriate CuPy package.\n",
+      "       Refer to the Installation Guide for detailed instructions.\n",
+      "\n",
+      "         https://docs.cupy.dev/en/stable/install.html\n",
+      "\n",
+      "--------------------------------------------------------------------------------\n",
+      "\n",
+      "  warnings.warn(f'''\n"
+     ]
+    }
+   ],
    "source": [
     "from cuopt import routing\n",
     "from cuopt import distance_engine\n",
@@ -61,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 2,
    "id": "5d12f05d",
    "metadata": {},
    "outputs": [],
@@ -100,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 3,
    "id": "2c824c99",
    "metadata": {},
    "outputs": [],
@@ -122,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 4,
    "id": "4e08f664",
    "metadata": {},
    "outputs": [],
@@ -152,22 +204,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 5,
    "id": "9975bf1a",
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Waypoint graph node to time matrix index mapping \n",
-      "{np.int64(0): 0, np.int64(4): 1, np.int64(5): 2, np.int64(6): 3}\n",
-      "\n",
-      "     0    1    2    3\n",
-      "0  0.0  6.0  4.0  6.0\n",
-      "1  6.0  0.0  4.0  6.0\n",
-      "2  4.0  4.0  0.0  4.0\n",
-      "3  6.0  6.0  4.0  0.0\n"
+     "ename": "RuntimeError",
+     "evalue": "CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mOSError\u001b[39m                                   Traceback (most recent call last)",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/_softlink.pyx:25\u001b[39m, in \u001b[36mcupy_backends.cuda._softlink.SoftLink.__init__\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/miniforge3/envs/cuopt/lib/python3.12/ctypes/__init__.py:379\u001b[39m, in \u001b[36mCDLL.__init__\u001b[39m\u001b[34m(self, name, mode, handle, use_errno, use_last_error, winmode)\u001b[39m\n\u001b[32m    378\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m handle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m379\u001b[39m     \u001b[38;5;28mself\u001b[39m._handle = \u001b[43m_dlopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    380\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
+      "\u001b[31mOSError\u001b[39m: libnvrtc.so.12: cannot open shared object file: No such file or directory",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[31mRuntimeError\u001b[39m                              Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m      1\u001b[39m waypoint_graph = distance_engine.WaypointMatrix(\n\u001b[32m      2\u001b[39m     offsets,\n\u001b[32m      3\u001b[39m     edges,\n\u001b[32m      4\u001b[39m     weights\n\u001b[32m      5\u001b[39m )\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m cost_matrix = \u001b[43mwaypoint_graph\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompute_cost_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtarget_locations\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m      7\u001b[39m transit_time_matrix = cost_matrix.copy(deep=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m      8\u001b[39m target_map = {v:k \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(target_locations)}\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/miniforge3/envs/cuopt/lib/python3.12/site-packages/cuopt/utilities/exception_handler.py:60\u001b[39m, in \u001b[36mcatch_cuopt_exception.<locals>.func\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     58\u001b[39m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(err[\u001b[33m\"\u001b[39m\u001b[33mmsg\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m     59\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m60\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[32m     61\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m     62\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m e\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/miniforge3/envs/cuopt/lib/python3.12/site-packages/cuopt/utilities/exception_handler.py:36\u001b[39m, in \u001b[36mcatch_cuopt_exception.<locals>.func\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     33\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(f)\n\u001b[32m     34\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mfunc\u001b[39m(*args, **kwargs):\n\u001b[32m     35\u001b[39m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m36\u001b[39m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     37\u001b[39m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m     38\u001b[39m         err_msg = \u001b[38;5;28mstr\u001b[39m(e)\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/miniforge3/envs/cuopt/lib/python3.12/site-packages/cuopt/distance_engine/waypoint_matrix.py:133\u001b[39m, in \u001b[36mWaypointMatrix.compute_cost_matrix\u001b[39m\u001b[34m(self, target_locations)\u001b[39m\n\u001b[32m    130\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m target_locations.shape[\u001b[32m0\u001b[39m] <= \u001b[32m0\u001b[39m:\n\u001b[32m    131\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\"\"\u001b[39m\u001b[33mTarget_locations length must be positive\u001b[39m\u001b[33m\"\"\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m133\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompute_cost_matrix\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtarget_locations\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/miniforge3/envs/cuopt/lib/python3.12/site-packages/cuopt/distance_engine/waypoint_matrix_wrapper.pyx:81\u001b[39m, in \u001b[36mcuopt.distance_engine.waypoint_matrix_wrapper.WaypointMatrix.compute_cost_matrix\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/cudf/utils/performance_tracking.py:51\u001b[39m, in \u001b[36m_performance_tracking.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     43\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m nvtx.enabled():\n\u001b[32m     44\u001b[39m     stack.enter_context(\n\u001b[32m     45\u001b[39m         nvtx.annotate(\n\u001b[32m     46\u001b[39m             message=func.\u001b[34m__qualname__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m     49\u001b[39m         )\n\u001b[32m     50\u001b[39m     )\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/cudf/core/dataframe.py:810\u001b[39m, in \u001b[36mDataFrame.__init__\u001b[39m\u001b[34m(self, data, index, columns, dtype, copy, nan_as_null)\u001b[39m\n\u001b[32m    808\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mdescr\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m arr_interface:\n\u001b[32m    809\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(arr_interface[\u001b[33m\"\u001b[39m\u001b[33mdescr\u001b[39m\u001b[33m\"\u001b[39m]) == \u001b[32m1\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m810\u001b[39m         new_df = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_from_arrays\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m    811\u001b[39m \u001b[43m            \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcolumns\u001b[49m\n\u001b[32m    812\u001b[39m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    813\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    814\u001b[39m         new_df = \u001b[38;5;28mself\u001b[39m.from_records(\n\u001b[32m    815\u001b[39m             data, index=index, columns=columns\n\u001b[32m    816\u001b[39m         )\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/cudf/utils/performance_tracking.py:51\u001b[39m, in \u001b[36m_performance_tracking.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     43\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m nvtx.enabled():\n\u001b[32m     44\u001b[39m     stack.enter_context(\n\u001b[32m     45\u001b[39m         nvtx.annotate(\n\u001b[32m     46\u001b[39m             message=func.\u001b[34m__qualname__\u001b[39m,\n\u001b[32m   (...)\u001b[39m\u001b[32m     49\u001b[39m         )\n\u001b[32m     50\u001b[39m     )\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/cudf/core/dataframe.py:5947\u001b[39m, in \u001b[36mDataFrame._from_arrays\u001b[39m\u001b[34m(cls, data, index, columns, nan_as_null)\u001b[39m\n\u001b[32m   5945\u001b[39m array_data: np.ndarray | cupy.ndarray\n\u001b[32m   5946\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(data, \u001b[33m\"\u001b[39m\u001b[33m__cuda_array_interface__\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m-> \u001b[39m\u001b[32m5947\u001b[39m     array_data = \u001b[43mcupy\u001b[49m\u001b[43m.\u001b[49m\u001b[43masarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mF\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m   5948\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(data, \u001b[33m\"\u001b[39m\u001b[33m__array_interface__\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m   5949\u001b[39m     array_data = np.asarray(data, order=\u001b[33m\"\u001b[39m\u001b[33mF\u001b[39m\u001b[33m\"\u001b[39m)\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/.local/lib/python3.12/site-packages/cupy/_creation/from_data.py:88\u001b[39m, in \u001b[36masarray\u001b[39m\u001b[34m(a, dtype, order, blocking)\u001b[39m\n\u001b[32m     56\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34masarray\u001b[39m(a, dtype=\u001b[38;5;28;01mNone\u001b[39;00m, order=\u001b[38;5;28;01mNone\u001b[39;00m, *, blocking=\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[32m     57\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"Converts an object to array.\u001b[39;00m\n\u001b[32m     58\u001b[39m \n\u001b[32m     59\u001b[39m \u001b[33;03m    This is equivalent to ``array(a, dtype, copy=False, order=order)``.\u001b[39;00m\n\u001b[32m   (...)\u001b[39m\u001b[32m     86\u001b[39m \n\u001b[32m     87\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m88\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_core\u001b[49m\u001b[43m.\u001b[49m\u001b[43marray\u001b[49m\u001b[43m(\u001b[49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m=\u001b[49m\u001b[43mblocking\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:2502\u001b[39m, in \u001b[36mcupy._core.core.array\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:2512\u001b[39m, in \u001b[36mcupy._core.core.array\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:2543\u001b[39m, in \u001b[36mcupy._core.core._array_from_cupy_ndarray\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:618\u001b[39m, in \u001b[36mcupy._core.core._ndarray_base.astype\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:686\u001b[39m, in \u001b[36mcupy._core.core._ndarray_base.astype\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/_kernel.pyx:1374\u001b[39m, in \u001b[36mcupy._core._kernel.ufunc.__call__\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/_kernel.pyx:1401\u001b[39m, in \u001b[36mcupy._core._kernel.ufunc._get_ufunc_kernel\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/_kernel.pyx:1082\u001b[39m, in \u001b[36mcupy._core._kernel._get_ufunc_kernel\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/_kernel.pyx:94\u001b[39m, in \u001b[36mcupy._core._kernel._get_simple_elementwise_kernel\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/_kernel.pyx:82\u001b[39m, in \u001b[36mcupy._core._kernel._get_simple_elementwise_kernel_from_code\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:2375\u001b[39m, in \u001b[36mcupy._core.core.compile_with_cache\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy/_core/core.pyx:2320\u001b[39m, in \u001b[36mcupy._core.core.assemble_cupy_compiler_options\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/libs/nvrtc.pyx:57\u001b[39m, in \u001b[36mcupy_backends.cuda.libs.nvrtc.getVersion\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/libs/_cnvrtc.pxi:72\u001b[39m, in \u001b[36mcupy_backends.cuda.libs.nvrtc.initialize\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/libs/_cnvrtc.pxi:75\u001b[39m, in \u001b[36mcupy_backends.cuda.libs.nvrtc._initialize\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/libs/_cnvrtc.pxi:153\u001b[39m, in \u001b[36mcupy_backends.cuda.libs.nvrtc._get_softlink\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32mcupy_backends/cuda/_softlink.pyx:32\u001b[39m, in \u001b[36mcupy_backends.cuda._softlink.SoftLink.__init__\u001b[39m\u001b[34m()\u001b[39m\n",
+      "\u001b[31mRuntimeError\u001b[39m: CuPy failed to load libnvrtc.so.12: OSError: libnvrtc.so.12: cannot open shared object file: No such file or directory"
      ]
     }
    ],
@@ -230,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "id": "72b715c7",
    "metadata": {},
    "outputs": [
@@ -409,7 +489,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": null,
    "id": "9e17e899",
    "metadata": {},
    "outputs": [
@@ -496,7 +576,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": null,
    "id": "2e765325",
    "metadata": {},
    "outputs": [],
@@ -525,7 +605,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": null,
    "id": "c936b137",
    "metadata": {},
    "outputs": [
@@ -567,7 +647,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": null,
    "id": "87c2d9f8",
    "metadata": {},
    "outputs": [],
@@ -596,7 +676,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": null,
    "id": "1d325f4b",
    "metadata": {},
    "outputs": [
@@ -642,7 +722,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": null,
    "id": "064978ca",
    "metadata": {},
    "outputs": [],
@@ -666,7 +746,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": null,
    "id": "b3f328e3",
    "metadata": {},
    "outputs": [],
@@ -708,7 +788,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": null,
    "id": "a6babc11",
    "metadata": {
     "scrolled": true
@@ -732,7 +812,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": null,
    "id": "28a05ace",
    "metadata": {},
    "outputs": [
@@ -792,7 +872,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": null,
    "id": "e0d98709",
    "metadata": {},
    "outputs": [
@@ -838,7 +918,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": null,
    "id": "c13cfbf3",
    "metadata": {
     "scrolled": true
@@ -945,7 +1025,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.10"
+   "version": "3.12.11"
   }
  },
  "nbformat": 4,
diff --git a/docs/cuopt/source/introduction.rst b/docs/cuopt/source/introduction.rst
index 1002821283..de24f47469 100644
--- a/docs/cuopt/source/introduction.rst
+++ b/docs/cuopt/source/introduction.rst
@@ -66,9 +66,15 @@ This is a linear program.
 
 How cuOpt Solves the Linear Programming Problem
 ------------------------------------------------
-cuOpt includes an LP solver based on `PDLP <https://arxiv.org/abs/2106.04756>`__, a new First-Order Method (FOM) used to solve large-scale LPs. This solver implements gradient descent, enhanced by heuristics, and performing massively parallel operations efficiently by leveraging the latest NVIDIA GPUs.
+cuOpt includes three LP solving methods:
 
-In addition to PDLP, cuOpt includes a dual simplex solver that runs on the CPU. Both algorithms can be run concurrently on the GPU and CPU.
+* **PDLP**: Based on `PDLP <https://arxiv.org/abs/2106.04756>`__, a First-Order Method (FOM) for solving large-scale LPs. This solver implements primal-dual hybrid gradient enhanced by heuristics. Sparse matrix-vector products are performed efficiently on NVIDIA GPUs.
+
+* **Barrier (Interior-Point)**: A primal-dual interior-point method that uses GPU-accelerated sparse Cholesky and LDLT solves via cuDSS, and sparse matrix operations via cuSparse.
+
+* **Dual Simplex**: A CPU-based dual simplex solver for small to medium-sized problems.
+
+All three algorithms can be run concurrently (PDLP and barrier on the GPU, dual simplex on the CPU), with the solution from the algorithm that finishes first returned automatically.
 
 Mixed Integer Linear Programming (MILP)
 =========================================
@@ -121,6 +127,7 @@ cuOpt supports the following APIs:
    - `AMPL <https://www.ampl.com/>`_
    - `GAMS <https://www.gams.com/>`_
    - `PuLP <https://pypi.org/project/PuLP/>`_
+   - `JuMP <https://github.com/jump-dev/cuOpt.jl>`_
 
 
 ==================================
diff --git a/docs/cuopt/source/lp-features.rst b/docs/cuopt/source/lp-features.rst
index c5d5899079..fc450736b4 100644
--- a/docs/cuopt/source/lp-features.rst
+++ b/docs/cuopt/source/lp-features.rst
@@ -13,6 +13,7 @@ The LP solver can be accessed in the following ways:
    -  AMPL
    -  GAMS
    -  PuLP
+   -  JuMP
 
 - **C API**: A native C API that provides direct low-level access to cuOpt's LP capabilities, enabling integration into any application or system that can interface with C.
 
@@ -65,9 +66,11 @@ Users can control how the solver will operate by specifying the PDLP solver mode
 Method
 ------
 
-**Concurrent**: The default method for solving linear programs. When concurrent is selected, cuOpt runs two algorithms at the same time: PDLP on the GPU and dual simplex on the CPU. A solution is returned from the algorithm that finishes first.
+**Concurrent**: The default method for solving linear programs. When concurrent is selected, cuOpt runs three algorithms in parallel: PDLP on the GPU, barrier (interior-point) on the GPU, and dual simplex on the CPU. A solution is returned from the algorithm that finishes first.
 
-**PDLP**: Primal-Dual Hybrid Gradient for Linear Program is an algorithm for solving large-scale linear programming problems on the GPU. PDLP does not attempt to any matrix factorizations during the course of the solve. Select this method if your LP is so large that factorization will not fit into memory. By default PDLP solves to low relative tolerance and the solutions it returns do not lie at a vertex of the feasible region. Enable crossover to obtain a highly accurate basic solution from a PDLP solution.
+**PDLP**: Primal-Dual Hybrid Gradient for Linear Program is an algorithm for solving large-scale linear programming problems on the GPU. PDLP does not attempt any matrix factorizations during the course of the solve. Select this method if your LP is so large that factorization will not fit into memory. By default PDLP solves to low relative tolerance and the solutions it returns do not lie at a vertex of the feasible region. Enable crossover to obtain a highly accurate basic solution from a PDLP solution.
+
+**Barrier**: The barrier method (also known as interior-point method) solves linear programs using a primal-dual predictor-corrector algorithm. This method uses GPU-accelerated sparse Cholesky and sparse LDLT solves via cuDSS, and GPU-accelerated sparse matrix-vector and matrix-matrix operations via cuSparse. Barrier is particularly effective for large-scale problems and can automatically apply techniques like folding, dualization, and dense column elimination to improve performance. This method solves the linear systems at each iteration using the augmented system or the normal equations (ADAT). Enable crossover to obtain a highly accurate basic solution from a barrier solution.
 
 **Dual Simplex**: Dual simplex is the simplex method applied to the dual of the linear program. Dual simplex requires the basis factorization of linear program fit into memory. Select this method if your LP is small to medium sized, or if you require a high-quality basic solution.
 
@@ -75,7 +78,7 @@ Method
 Crossover
 ---------
 
-Crossover allows you to obtain a high-quality basic solution from the results of a PDLP solve. More details can be found :ref:`here <crossover>`.
+Crossover allows you to obtain a high-quality basic solution from the results of a PDLP or barrier solve. When enabled, crossover converts these solutions to a vertex solution (basic solution) with high accuracy. More details can be found :ref:`here <crossover>`.
 
 
 Presolve
diff --git a/docs/cuopt/source/lp-milp-settings.rst b/docs/cuopt/source/lp-milp-settings.rst
index 2586954191..d755d5a979 100644
--- a/docs/cuopt/source/lp-milp-settings.rst
+++ b/docs/cuopt/source/lp-milp-settings.rst
@@ -23,8 +23,8 @@ may run slightly over the limit. If set along with the iteration limit, cuOpt wi
 the first limit (iteration or time) is hit.
 
 
-Note: by default there is no time limit. So cuOpt will run until it finds an optimal solution,
-or proves the problem is infeasible or unbounded.
+.. note:: by default there is no time limit. So cuOpt will run until it finds an optimal solution,
+   or proves the problem is infeasible or unbounded.
 
 
 
@@ -33,25 +33,25 @@ Log to Console
 ``CUOPT_LOG_TO_CONSOLE`` controls whether cuOpt should log information to the console during a solve.
 If true, a logging info is written to the console, if false no logging info is written to the console (logs may still be written to a file.)
 
-Note: the default value is true.
+.. note:: the default value is true.
 
 Log File
 ^^^^^^^^
 ``CUOPT_LOG_FILE`` controls the name of a log file where cuOpt should write information about the solve.
 
-Note: the default value is ``""`` and no log file is written. This setting is ignored by the cuOpt service, use the log callback feature instead.
+.. note:: the default value is ``""`` and no log file is written. This setting is ignored by the cuOpt service, use the log callback feature instead.
 
 Solution File
 ^^^^^^^^^^^^^
 ``CUOPT_SOLUTION_FILE`` controls the name of a file where cuOpt should write the solution.
 
-Note: the default value is ``""`` and no solution file is written. This setting is ignored by the cuOpt service.
+.. note:: the default value is ``""`` and no solution file is written. This setting is ignored by the cuOpt service.
 
 User Problem File
 ^^^^^^^^^^^^^^^^^
 ``CUOPT_USER_PROBLEM_FILE`` controls the name of a file where cuOpt should write the user problem.
 
-Note: the default value is ``""`` and no user problem file is written. This setting is ignored by the cuOpt service.
+.. note:: the default value is ``""`` and no user problem file is written. This setting is ignored by the cuOpt service.
 
 Num CPU Threads
 ^^^^^^^^^^^^^^^
@@ -59,7 +59,7 @@ Num CPU Threads
 the amount of CPU resources cuOpt uses. Set this to a large value to improve solve times for CPU
 parallel parts of the solvers.
 
-Note: by default the number of CPU threads is automatically determined based on the number of CPU cores.
+.. note:: by default the number of CPU threads is automatically determined based on the number of CPU cores.
 
 Presolve
 ^^^^^^^^
@@ -78,20 +78,19 @@ We now describe the parameter settings used to control cuOpt's Linear Programmin
 Method
 ^^^^^^
 
-``CUOPT_METHOD`` controls the method to solve the linear programming problem. Three methods are available:
+``CUOPT_METHOD`` controls the method to solve the linear programming problem. Four methods are available:
 
-* ``Concurrent``: Use both PDLP and dual simplex in parallel.
+* ``Concurrent``: Use PDLP, dual simplex, and barrier in parallel (default).
 * ``PDLP``: Use the PDLP method.
 * ``Dual Simplex``: Use the dual simplex method.
+* ``Barrier``: Use the barrier (interior-point) method.
 
-Note: The default method is ``Concurrent``.
+.. note:: The default method is ``Concurrent``.
 
 C API users should use the constants defined in :ref:`method-constants` for this parameter.
 
 Server Thin client users should use the :class:`cuopt_sh_client.SolverMethod` for this parameter.
 
-
-
 PDLP Solver Mode
 ^^^^^^^^^^^^^^^^
 
@@ -117,8 +116,8 @@ For performance reasons, cuOpt's does not constantly checks for iteration limit,
 the solver might run a few extra iterations over the limit. If set along with the time limit,
 cuOpt will stop at the first limit (iteration or time) reached.
 
-Note: by default there is no iteration limit. So, cuOpt will run until it finds an optimal solution,
-or proves the problem is infeasible or unbounded.
+.. note:: by default there is no iteration limit. So, cuOpt will run until it finds an optimal solution,
+   or proves the problem is infeasible or unbounded.
 
 
 Infeasiblity Detection
@@ -129,8 +128,8 @@ is not always accurate. Some problems detected as infeasible may converge under
 Detecting infeasibility consumes both more runtime and memory. The added runtime is between 3% and 7%,
 the added memory consumpution is between 10% and 20%.
 
-Note: by default PDLP will not detect infeasibility. Dual simplex will always detect infeasibility
-regardless of this setting.
+.. note:: by default PDLP will not detect infeasibility. Dual simplex will always detect infeasibility
+   regardless of this setting.
 
 Strict Infeasibility
 ^^^^^^^^^^^^^^^^^^^^
@@ -139,21 +138,21 @@ Strict Infeasibility
 is detected as infeasible, PDLP will stop. When false both the current and average solution need to be
 detected as infeasible for PDLP to stop.
 
-Note: the default value is false.
+.. note:: the default value is false.
 
 .. _crossover:
 
 Crossover
 ^^^^^^^^^
 
-``CUOPT_CROSSOVER`` controls whether PDLP should crossover to a basic solution after a optimal solution is found.
+``CUOPT_CROSSOVER`` controls whether PDLP or barrier should crossover to a basic solution after an optimal solution is found.
 Changing this value has a significant impact on accuracy and runtime.
-By default the solutions provided by PDLP are low accuracy and may have many variables that lie
+By default the solutions provided by PDLP and barrier do not lie at a vertex and thus may have many variables that lie
 between their bounds. Enabling crossover allows the user to obtain a high-quality basic solution
 that lies at a vertex of the feasible region. If n is the number of variables, and m is the number of
 constraints, n - m variables will be on their bounds in a basic solution.
 
-Note: the default value is false.
+.. note:: the default value is false.
 
 Save Best Primal So Far
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -164,21 +163,104 @@ With this parameter set to true, PDLP
 * If no primal feasible was found, the one with the lowest primal residual will be kept
 * If two have the same primal residual, the one with the best objective will be kept
 
-Note: the default value is false.
+.. note:: the default value is false.
 
 First Primal Feasible
 ^^^^^^^^^^^^^^^^^^^^^
 
 ``CUOPT_FIRST_PRIMAL_FEASIBLE`` controls whether PDLP should stop when the first primal feasible solution is found.
 
-Note: the default value is false.
+.. note:: the default value is false.
 
 Per Constraint Residual
 ^^^^^^^^^^^^^^^^^^^^^^^
 
 ``CUOPT_PER_CONSTRAINT_RESIDUAL`` controls whether PDLP should compute the primal & dual residual per constraint instead of globally.
 
-Note: the default value is false.
+.. note:: the default value is false.
+
+Barrier Solver Settings
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The following settings control the behavior of the barrier (interior-point) method:
+
+Folding
+"""""""
+
+``CUOPT_FOLDING`` controls whether to fold the linear program. Folding can reduce problem size by exploiting symmetry in the problem.
+
+* ``-1``: Automatic (default) - cuOpt decides whether to fold based on problem characteristics
+* ``0``: Disable folding
+* ``1``: Force folding to run
+
+.. note:: the default value is ``-1`` (automatic).
+
+Dualize
+"""""""
+
+``CUOPT_DUALIZE`` controls whether to dualize the linear program in presolve. Dualizing can improve solve time for problems with inequality constraints that have more constraints than variables.
+
+* ``-1``: Automatic (default) - cuOpt decides whether to dualize based on problem characteristics
+* ``0``: Don't attempt to dualize
+* ``1``: Force dualize
+
+.. note:: the default value is ``-1`` (automatic).
+
+Ordering
+""""""""
+
+``CUOPT_ORDERING`` controls the ordering algorithm used by cuDSS for sparse factorizations. The ordering can significantly impact solver run time.
+
+* ``-1``: Automatic (default) - cuOpt selects the best ordering
+* ``0``: cuDSS default ordering
+* ``1``: AMD (Approximate Minimum Degree) ordering
+
+.. note:: the default value is ``-1`` (automatic).
+
+Augmented System
+""""""""""""""""
+
+``CUOPT_AUGMENTED`` controls which linear system to solve in the barrier method.
+
+* ``-1``: Automatic (default) - cuOpt selects the best linear system to solve
+* ``0``: Solve the ADAT system (normal equations)
+* ``1``: Solve the augmented system
+
+.. note:: the default value is ``-1`` (automatic). The augmented system may be more stable for some problems, while ADAT may be faster for others.
+
+Eliminate Dense Columns
+""""""""""""""""""""""""
+
+``CUOPT_ELIMINATE_DENSE_COLUMNS`` controls whether to eliminate dense columns from the constraint matrix before solving. Eliminating dense columns can improve performance by reducing fill-in during factorization.
+However, extra solves must be performed at each iteration.
+
+* ``true``: Eliminate dense columns (default)
+* ``false``: Don't eliminate dense columns
+
+This setting only has an effect when the ADAT (normal equation) system is solved.
+
+.. note:: the default value is ``true``.
+
+cuDSS Deterministic Mode
+"""""""""""""""""""""""""
+
+``CUOPT_CUDSS_DETERMINISTIC`` controls whether cuDSS operates in deterministic mode. Deterministic mode ensures reproducible results across runs but may be slower.
+
+* ``true``: Use deterministic mode
+* ``false``: Use non-deterministic mode (default)
+
+.. note:: the default value is ``false``. Enable deterministic mode if reproducibility is more important than performance.
+
+Dual Initial Point
+""""""""""""""""""
+
+``CUOPT_BARRIER_DUAL_INITIAL_POINT`` controls the method used to compute the dual initial point for the barrier solver. The choice of initial point will affect the number of iterations performed by barrier.
+
+* ``-1``: Automatic (default) - cuOpt selects the best method
+* ``0``: Use an initial point from a heuristic approach based on the paper "On Implementing Mehrotra's Predictor–Corrector Interior-Point Method for Linear Programming" (SIAM J. Optimization, 1992) by Lustig, Marsten, and Shanno.
+* ``1``: Use an initial point from solving a least squares problem that minimizes the norms of the dual variables and reduced costs while satisfying the dual equality constraints.
+
+.. note:: the default value is ``-1`` (automatic).
 
 Absolute Primal Tolerance
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -204,7 +286,7 @@ The primal feasibility condition is computed as follows::
 
    primal_feasiblity < absolute_primal_tolerance + relative_primal_tolerance * l2_norm(b)
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
 
 Absolute Dual Tolerance
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -217,7 +299,7 @@ The dual feasibility condition is computed as follows::
 
    dual_feasiblity < absolute_dual_tolerance + relative_dual_tolerance * l2_norm(c)
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
 
 Relative Dual Tolerance
 ^^^^^^^^^^^^^^^^^^^^^^^
@@ -228,7 +310,7 @@ The dual feasibility condition is computed as follows::
 
    dual_feasiblity < absolute_dual_tolerance + relative_dual_tolerance * l2_norm(c)
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
 
 
 Absolute Gap Tolerance
@@ -241,7 +323,7 @@ The duality gap is computed as follows::
 
    duality_gap < absolute_gap_tolerance + relative_gap_tolerance * (|primal_objective| + |dual_objective|)
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
 
 
 Relative Gap Tolerance
@@ -253,7 +335,7 @@ The duality gap is computed as follows::
 
    duality_gap < absolute_gap_tolerance + relative_gap_tolerance * (|primal_objective| + |dual_objective|)
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
 
 
 Mixed Integer Linear Programming
@@ -269,7 +351,7 @@ Heuristics only
 bound is improved via the GPU. When set to false, both the GPU and CPU are used and
 the dual bound is improved on the CPU.
 
-Note: the default value is false.
+.. note:: the default value is false.
 
 Scaling
 ^^^^^^^
@@ -277,7 +359,7 @@ Scaling
 ``CUOPT_MIP_SCALING`` controls if scaling should be applied to the MIP problem. When true scaling is applied,
 when false, no scaling is applied.
 
-Note: the defaulte value is true.
+.. note:: the default value is true.
 
 
 Absolute Tolerance
@@ -285,14 +367,14 @@ Absolute Tolerance
 
 ``CUOPT_MIP_ABSOLUTE_TOLERANCE`` controls the MIP absolute tolerance.
 
-Note: the default value is ``1e-6``.
+.. note:: the default value is ``1e-6``.
 
 Relative Tolerance
 ^^^^^^^^^^^^^^^^^^
 
 ``CUOPT_MIP_RELATIVE_TOLERANCE`` controls the MIP relative tolerance.
 
-Note: the default value is ``1e-12``.
+.. note:: the default value is ``1e-12``.
 
 
 Integrality Tolerance
@@ -301,7 +383,7 @@ Integrality Tolerance
 ``CUOPT_INTEGRALITY_TOLERANCE`` controls the MIP integrality tolerance. A variable is considered to be integral, if
 it is within the integrality tolerance of an integer.
 
-Note: the default value is ``1e-5``.
+.. note:: the default value is ``1e-5``.
 
 Absolute MIP Gap
 ^^^^^^^^^^^^^^^^
@@ -316,7 +398,7 @@ when minimizing or
 
 when maximizing.
 
-Note: the default value is ``1e-10``.
+.. note:: the default value is ``1e-10``.
 
 Relative MIP Gap
 ^^^^^^^^^^^^^^^^
@@ -328,4 +410,4 @@ Relative MIP Gap
 If the Best Objective and the Dual Bound are both zero the gap is zero. If the best objective value is zero, the
 gap is infinity.
 
-Note: the default value is ``1e-4``.
+.. note:: the default value is ``1e-4``.
diff --git a/docs/cuopt/source/milp-features.rst b/docs/cuopt/source/milp-features.rst
index 40eba5c403..429bc05300 100644
--- a/docs/cuopt/source/milp-features.rst
+++ b/docs/cuopt/source/milp-features.rst
@@ -13,6 +13,7 @@ The MILP solver can be accessed in the following ways:
    - AMPL
    - GAMS
    - PuLP
+   - JuMP
 
 - **C API**: A native C API that provides direct low-level access to cuOpt's MILP solver, enabling integration into any application or system that can interface with C.
 
diff --git a/docs/cuopt/source/system-requirements.rst b/docs/cuopt/source/system-requirements.rst
index e7d963ae57..7ad7021943 100644
--- a/docs/cuopt/source/system-requirements.rst
+++ b/docs/cuopt/source/system-requirements.rst
@@ -47,6 +47,7 @@ Dependencies are installed automatically when using the pip and Conda installati
       - CUDA 12.2 with Driver 535.86.10+
       - CUDA 12.5 with Driver 555.42.06+
       - CUDA 12.9 with Driver 570.42.01+
+      - CUDA 13.0 with Driver 580.65.06+
 
 .. dropdown:: Recommended Requirements for Best Performance
 
diff --git a/docs/cuopt/source/thirdparty_modeling_languages/index.rst b/docs/cuopt/source/thirdparty_modeling_languages/index.rst
index 3fa6c54664..0acda399ab 100644
--- a/docs/cuopt/source/thirdparty_modeling_languages/index.rst
+++ b/docs/cuopt/source/thirdparty_modeling_languages/index.rst
@@ -21,3 +21,10 @@ PuLP Support
 
 PuLP can be used with near zero code changes: simply switch to cuOpt as a solver to solve linear and mixed-integer programming problems.
 Please refer to the `PuLP documentation <https://pypi.org/project/PuLP/>`_ for more information. Also, see the example notebook in the `cuopt-examples <https://github.com/NVIDIA/cuopt-examples>`_ repository.
+
+--------------------------
+JuMP Support
+--------------------------
+
+JuMP can be used with near zero code changes: simply switch to cuOpt as a solver to solve linear and mixed-integer programming problems.
+Please refer to the `JuMP documentation <https://github.com/jump-dev/cuOpt.jl>`_ for more information.
diff --git a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx
index 50641d331f..2c196751f4 100644
--- a/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx
+++ b/python/cuopt/cuopt/linear_programming/data_model/data_model_wrapper.pyx
@@ -25,8 +25,6 @@ import warnings
 
 import numpy as np
 
-import cudf
-
 from libc.stdint cimport uintptr_t
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -46,13 +44,11 @@ def type_cast(np_obj, np_type, name):
 
 
 def get_data_ptr(array):
-    if isinstance(array, cudf.Series):
-        return array.__cuda_array_interface__['data'][0]
-    elif isinstance(array, np.ndarray):
+    if isinstance(array, np.ndarray):
         return array.__array_interface__['data'][0]
     else:
         raise Exception(
-            "get_data_ptr must be called with cudf.Series or np.ndarray"
+            "get_data_ptr must be called with np.ndarray"
         )
 
 
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
index 1f0ade10e5..c7ef8b99bf 100644
--- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
+++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
@@ -34,10 +34,21 @@
     sense,
 )
 from cuopt.linear_programming.solver.solver_parameters import (
+    CUOPT_AUGMENTED,
+    CUOPT_BARRIER_DUAL_INITIAL_POINT,
+    CUOPT_CUDSS_DETERMINISTIC,
+    CUOPT_DUALIZE,
+    CUOPT_ELIMINATE_DENSE_COLUMNS,
+    CUOPT_FOLDING,
     CUOPT_INFEASIBILITY_DETECTION,
+    CUOPT_METHOD,
+    CUOPT_ORDERING,
     CUOPT_PDLP_SOLVER_MODE,
 )
-from cuopt.linear_programming.solver_settings import PDLPSolverMode
+from cuopt.linear_programming.solver_settings import (
+    PDLPSolverMode,
+    SolverMethod,
+)
 
 RAPIDS_DATASET_ROOT_DIR = os.getenv("RAPIDS_DATASET_ROOT_DIR")
 if RAPIDS_DATASET_ROOT_DIR is None:
@@ -449,3 +460,225 @@ def test_problem_update():
     prob.updateObjective(constant=5, sense=MINIMIZE)
     prob.solve()
     assert prob.ObjValue == pytest.approx(5)
+
+
+@pytest.mark.parametrize(
+    "test_name,settings_config",
+    [
+        (
+            "automatic",
+            {
+                CUOPT_FOLDING: -1,
+                CUOPT_DUALIZE: -1,
+                CUOPT_ORDERING: -1,
+                CUOPT_AUGMENTED: -1,
+            },
+        ),
+        (
+            "forced_on",
+            {
+                CUOPT_FOLDING: 1,
+                CUOPT_DUALIZE: 1,
+                CUOPT_ORDERING: 1,
+                CUOPT_AUGMENTED: 1,
+                CUOPT_ELIMINATE_DENSE_COLUMNS: True,
+                CUOPT_CUDSS_DETERMINISTIC: True,
+            },
+        ),
+        (
+            "disabled",
+            {
+                CUOPT_FOLDING: 0,
+                CUOPT_DUALIZE: 0,
+                CUOPT_ORDERING: 0,
+                CUOPT_AUGMENTED: 0,
+                CUOPT_ELIMINATE_DENSE_COLUMNS: False,
+                CUOPT_CUDSS_DETERMINISTIC: False,
+            },
+        ),
+        (
+            "mixed",
+            {
+                CUOPT_FOLDING: 1,
+                CUOPT_DUALIZE: 0,
+                CUOPT_ORDERING: -1,
+                CUOPT_AUGMENTED: 1,
+            },
+        ),
+        (
+            "folding_on",
+            {
+                CUOPT_FOLDING: 1,
+            },
+        ),
+        (
+            "folding_off",
+            {
+                CUOPT_FOLDING: 0,
+            },
+        ),
+        (
+            "dualize_on",
+            {
+                CUOPT_DUALIZE: 1,
+            },
+        ),
+        (
+            "dualize_off",
+            {
+                CUOPT_DUALIZE: 0,
+            },
+        ),
+        (
+            "amd_ordering",
+            {
+                CUOPT_ORDERING: 1,
+            },
+        ),
+        (
+            "cudss_ordering",
+            {
+                CUOPT_ORDERING: 0,
+            },
+        ),
+        (
+            "augmented_system",
+            {
+                CUOPT_AUGMENTED: 1,
+            },
+        ),
+        (
+            "adat_system",
+            {
+                CUOPT_AUGMENTED: 0,
+            },
+        ),
+        (
+            "no_dense_elim",
+            {
+                CUOPT_ELIMINATE_DENSE_COLUMNS: False,
+            },
+        ),
+        (
+            "cudss_deterministic",
+            {
+                CUOPT_CUDSS_DETERMINISTIC: True,
+            },
+        ),
+        (
+            "combo1",
+            {
+                CUOPT_FOLDING: 1,
+                CUOPT_DUALIZE: 1,
+                CUOPT_ORDERING: 1,
+            },
+        ),
+        (
+            "combo2",
+            {
+                CUOPT_FOLDING: 0,
+                CUOPT_AUGMENTED: 0,
+                CUOPT_ELIMINATE_DENSE_COLUMNS: False,
+            },
+        ),
+        (
+            "dual_initial_point_automatic",
+            {
+                CUOPT_BARRIER_DUAL_INITIAL_POINT: -1,
+            },
+        ),
+        (
+            "dual_initial_point_lustig",
+            {
+                CUOPT_BARRIER_DUAL_INITIAL_POINT: 0,
+            },
+        ),
+        (
+            "dual_initial_point_least_squares",
+            {
+                CUOPT_BARRIER_DUAL_INITIAL_POINT: 1,
+            },
+        ),
+        (
+            "combo3_with_dual_init",
+            {
+                CUOPT_AUGMENTED: 1,
+                CUOPT_BARRIER_DUAL_INITIAL_POINT: 1,
+                CUOPT_ELIMINATE_DENSE_COLUMNS: True,
+            },
+        ),
+    ],
+)
+def test_barrier_solver_settings(test_name, settings_config):
+    """
+    Parameterized test for barrier solver with different configurations.
+
+    Tests the barrier solver across various settings combinations to ensure
+    correctness and robustness. Each configuration tests different aspects
+    of the barrier solver implementation.
+
+    Problem:
+        maximize   5*xs + 20*xl
+        subject to  1*xs +  3*xl <= 200
+                    3*xs +  2*xl <= 160
+                    xs, xl >= 0
+
+    Expected Solution:
+        Optimal objective: 1333.33
+        xs = 0, xl = 66.67 (corner solution where constraint 1 is binding)
+
+    Args
+    ----
+        test_name: Descriptive name for the test configuration
+        settings_config: Dictionary of barrier solver parameters to set
+    """
+    prob = Problem(f"Barrier Test - {test_name}")
+
+    # Add variables
+    xs = prob.addVariable(lb=0, vtype=VType.CONTINUOUS, name="xs")
+    xl = prob.addVariable(lb=0, vtype=VType.CONTINUOUS, name="xl")
+
+    # Add constraints
+    prob.addConstraint(xs + 3 * xl <= 200, name="constraint1")
+    prob.addConstraint(3 * xs + 2 * xl <= 160, name="constraint2")
+
+    # Set objective: maximize 5*xs + 20*xl
+    prob.setObjective(5 * xs + 20 * xl, sense=MAXIMIZE)
+
+    # Configure solver settings
+    settings = SolverSettings()
+    settings.set_parameter(CUOPT_METHOD, SolverMethod.Barrier)
+    settings.set_parameter("time_limit", 10)
+
+    # Apply test-specific settings
+    for param_name, param_value in settings_config.items():
+        settings.set_parameter(param_name, param_value)
+
+    print(f"\nTesting configuration: {test_name}")
+    print(f"Settings: {settings_config}")
+
+    # Solve the problem
+    prob.solve(settings)
+
+    print(f"Status: {prob.Status.name}")
+    print(f"Objective: {prob.ObjValue}")
+    print(f"xs = {xs.Value}, xl = {xl.Value}")
+
+    # Verify solution
+    assert prob.solved, f"Problem not solved for {test_name}"
+    assert prob.Status.name == "Optimal", f"Not optimal for {test_name}"
+    assert prob.ObjValue == pytest.approx(
+        1333.33, rel=0.01
+    ), f"Incorrect objective for {test_name}"
+    assert xs.Value == pytest.approx(
+        0.0, abs=1e-4
+    ), f"Incorrect xs value for {test_name}"
+    assert xl.Value == pytest.approx(
+        66.67, rel=0.01
+    ), f"Incorrect xl value for {test_name}"
+
+    # Verify constraint slacks are non-negative
+    for c in prob.getConstraints():
+        assert (
+            c.Slack >= -1e-6
+        ), f"Negative slack for {c.getConstraintName()} in {test_name}"
diff --git a/python/cuopt_server/cuopt_server/tests/test_lp.py b/python/cuopt_server/cuopt_server/tests/test_lp.py
index 8fc85aa3a0..4a01daaca3 100644
--- a/python/cuopt_server/cuopt_server/tests/test_lp.py
+++ b/python/cuopt_server/cuopt_server/tests/test_lp.py
@@ -146,3 +146,77 @@ def test_sample_milp(
         res.json()["response"]["solver_response"],
         expected_status,
     )
+
+
+# @pytest.mark.skip(reason="Skipping barrier solver options test")
+@pytest.mark.parametrize(
+    "folding, dualize, ordering, augmented, eliminate_dense, cudss_determ, "
+    "dual_initial_point",
+    [
+        # Test automatic settings (default)
+        (-1, -1, -1, -1, True, False, -1),
+        # Test folding off, no dualization, cuDSS default ordering, ADAT system
+        (0, 0, 0, 0, True, False, 0),
+        # Test folding on, force dualization, AMD ordering, augmented system
+        (1, 1, 1, 1, True, True, 1),
+        # Test mixed settings: automatic folding, no dualize, AMD, augmented
+        (-1, 0, 1, 1, False, False, 0),
+        # Test no folding, automatic dualize, cuDSS default, ADAT
+        (0, -1, 0, 0, True, True, -1),
+        # Test dual initial point with Lustig-Marsten-Shanno
+        (-1, -1, -1, -1, True, False, 0),
+        # Test dual initial point with least squares
+        (-1, -1, -1, 1, True, False, 1),
+    ],
+)
+def test_barrier_solver_options(
+    cuoptproc,  # noqa
+    folding,
+    dualize,
+    ordering,
+    augmented,
+    eliminate_dense,
+    cudss_determ,
+    dual_initial_point,
+):
+    """
+    Test the barrier solver (method=3) with various configuration options:
+    - folding: (-1) automatic, (0) off, (1) on
+    - dualize: (-1) automatic, (0) don't dualize, (1) force dualize
+    - ordering: (-1) automatic, (0) cuDSS default, (1) AMD
+    - augmented: (-1) automatic, (0) ADAT, (1) augmented system
+    - eliminate_dense_columns: True to eliminate, False to not
+    - cudss_deterministic: True for deterministic, False for
+      nondeterministic
+    - barrier_dual_initial_point: (-1) automatic, (0) Lustig-Marsten-Shanno,
+      (1) dual least squares
+    """
+    data = get_std_data_for_lp()
+
+    # Use barrier solver (method=3)
+    data["solver_config"]["method"] = 3
+
+    # Configure barrier solver options
+    data["solver_config"]["folding"] = folding
+    data["solver_config"]["dualize"] = dualize
+    data["solver_config"]["ordering"] = ordering
+    data["solver_config"]["augmented"] = augmented
+    data["solver_config"]["eliminate_dense_columns"] = eliminate_dense
+    data["solver_config"]["cudss_deterministic"] = cudss_determ
+    data["solver_config"]["barrier_dual_initial_point"] = dual_initial_point
+
+    res = get_lp(client, data)
+
+    assert res.status_code == 200
+
+    print("\n=== Barrier Solver Test Configuration ===")
+    print(f"folding={folding}, dualize={dualize}, ordering={ordering}")
+    print(f"augmented={augmented}, eliminate_dense={eliminate_dense}")
+    print(f"cudss_deterministic={cudss_determ}")
+    print(f"barrier_dual_initial_point={dual_initial_point}")
+    print(res.json())
+
+    validate_lp_result(
+        res.json()["response"]["solver_response"],
+        LPTerminationStatus.Optimal.name,
+    )
diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
index 8eeca36452..b74d66b09a 100644
--- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
+++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
@@ -520,6 +520,8 @@ class SolverConfig(StrictModel):
         "<br>"
         "- Dual Simplex: 2, Dual Simplex method"
         "<br>"
+        "- Barrier: 3, Barrier method"
+        "<br>"
         "Note: Not supported for MILP. ",
     )
     mip_scaling: Optional[bool] = Field(
diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py b/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py
index b059742174..1dbfaf51ef 100644
--- a/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py
+++ b/python/cuopt_server/cuopt_server/utils/linear_programming/solver.py
@@ -440,15 +440,15 @@ def is_mip(var_types):
             solver_settings.set_parameter(
                 CUOPT_LOG_FILE, solver_config.log_file
             )
-        if solver_config.augmented != "":
+        if solver_config.augmented is not None:
             solver_settings.set_parameter(
                 CUOPT_AUGMENTED, solver_config.augmented
             )
-        if solver_config.folding != "":
+        if solver_config.folding is not None:
             solver_settings.set_parameter(CUOPT_FOLDING, solver_config.folding)
-        if solver_config.dualize != "":
+        if solver_config.dualize is not None:
             solver_settings.set_parameter(CUOPT_DUALIZE, solver_config.dualize)
-        if solver_config.ordering != "":
+        if solver_config.ordering is not None:
             solver_settings.set_parameter(
                 CUOPT_ORDERING, solver_config.ordering
             )
diff --git a/python/libcuopt/CMakeLists.txt b/python/libcuopt/CMakeLists.txt
index 175e501e4e..b6fbb6b2b3 100644
--- a/python/libcuopt/CMakeLists.txt
+++ b/python/libcuopt/CMakeLists.txt
@@ -86,6 +86,7 @@ set(rpaths
   "$ORIGIN/../../nvidia/curand/lib"
   "$ORIGIN/../../nvidia/cusolver/lib"
   "$ORIGIN/../../nvidia/cusparse/lib"
+  "$ORIGIN/../../nvidia/nvjitlink/lib"
 )
 
 # Add CUDA version-specific paths based on CUDA compiler version

From e9a4f81dbf2888fbca0701e52451bb04a9f3f067 Mon Sep 17 00:00:00 2001
From: Chris Maes <cmaes@nvidia.com>
Date: Thu, 9 Oct 2025 22:21:08 -0700
Subject: [PATCH 11/27] Barrier log fixes (#478)

Minor tweaks to the barrier log.

Authors:
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Hugo Linsenmaier (https://github.com/hlinsen)

URL: https://github.com/NVIDIA/cuopt/pull/478
---
 cpp/src/dual_simplex/barrier.cu          | 5 ++---
 cpp/src/dual_simplex/sparse_cholesky.cuh | 8 ++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/src/dual_simplex/barrier.cu b/cpp/src/dual_simplex/barrier.cu
index 7a9c2ee249..2e9d64f071 100644
--- a/cpp/src/dual_simplex/barrier.cu
+++ b/cpp/src/dual_simplex/barrier.cu
@@ -1487,7 +1487,6 @@ int barrier_solver_t<i_t, f_t>::initial_point(iteration_data_t<i_t, f_t>& data)
   // Perform a numerical factorization
   i_t status;
   if (use_augmented) {
-    settings.log.printf("Factorizing augmented\n");
     status = data.chol->factorize(data.augmented);
   } else {
     if (use_gpu) {
@@ -3329,9 +3328,9 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
   i_t iter = 0;
   settings.log.printf("\n");
   settings.log.printf(
-    "        Objective                            Infeasibility              Time\n");
+    "                  Objective                         Infeasibility        Time\n");
   settings.log.printf(
-    "Iter    Primal               Dual            Primal   Dual    Compl.    Elapsed\n");
+    "Iter   Primal              Dual                Primal   Dual    Compl.   Elapsed\n");
   float64_t elapsed_time = toc(start_time);
   settings.log.printf("%3d   %+.12e %+.12e %.2e %.2e %.2e %.1f\n",
                       iter,
diff --git a/cpp/src/dual_simplex/sparse_cholesky.cuh b/cpp/src/dual_simplex/sparse_cholesky.cuh
index 51145a36b4..4ea1609e31 100644
--- a/cpp/src/dual_simplex/sparse_cholesky.cuh
+++ b/cpp/src/dual_simplex/sparse_cholesky.cuh
@@ -534,7 +534,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     }
 
     if (first_factor) {
-      settings_.log.printf("Factorization time          : %.2fs\n", numeric_time);
+      settings_.log.debug("Factorization time          : %.2fs\n", numeric_time);
       first_factor = false;
     }
     if (status != CUDSS_STATUS_SUCCESS) {
@@ -635,7 +635,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
 
     f_t symbolic_time = toc(start_symbolic);
     f_t analysis_time = toc(start_analysis);
-    settings_.log.printf("Symbolic factorization time: %.2fs\n", symbolic_time);
+    settings_.log.printf("Symbolic factorization time : %.2fs\n", symbolic_time);
     if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) {
       RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
       handle_ptr_->get_stream().synchronize();
@@ -647,7 +647,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
       cudssDataGet(handle, solverData, CUDSS_DATA_LU_NNZ, &lu_nz, sizeof(int64_t), &size_written),
       status,
       "cudssDataGet for LU_NNZ");
-    settings_.log.printf("Symbolic nonzeros in factor: %e\n", static_cast<f_t>(lu_nz) / 2.0);
+    settings_.log.printf("Symbolic nonzeros in factor : %.2e\n", static_cast<f_t>(lu_nz) / 2.0);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
     handle_ptr_->get_stream().synchronize();
     // TODO: Is there any way to get nonzeros in the factors?
@@ -703,7 +703,7 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t<i_t, f_t> {
     }
 
     if (first_factor) {
-      settings_.log.printf("Factor time %.2fs\n", numeric_time);
+      settings_.log.debug("Factorization time          : %.2fs\n", numeric_time);
       first_factor = false;
     }
     if (status != CUDSS_STATUS_SUCCESS) {

From ff4cde2b54ae8ed2da4e8b4c58532e5c41076ffb Mon Sep 17 00:00:00 2001
From: akifcorduk <akifcorduk@gmail.com>
Date: Fri, 10 Oct 2025 05:08:44 -0700
Subject: [PATCH 12/27] clique merge bug

---
 cpp/CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cb17f0c4a2..90bb1c57f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -190,9 +190,9 @@ FetchContent_Declare(
   # does not have some of the presolvers and settings that we need
   # Mainly, probing and clique merging.
   # This is the reason we are using the development branch
-  # commit from Oct 8, 2025. Once these changes are merged into the main branch,
+  # commit from cliquemergebug branch. Once these changes are merged into the main branch,
   #we can switch to the main branch.
-  GIT_TAG "24ccf5752656df0f15dd9aabe5b97feae829b9ec"
+  GIT_TAG "8f710e33d352bf319d30b9c57e70516222f3f5ca"
   GIT_PROGRESS TRUE
   SYSTEM
 )
@@ -201,8 +201,6 @@ find_package(TBB REQUIRED)
 set(BUILD_TESTING OFF CACHE BOOL "Disable test build for papilo")
 set(PAPILO_NO_BINARIES ON)
 option(LUSOL "Disable LUSOL" OFF)
-# Disable TBB because of a bug in CliqueMerging parallel version
-set(TBB OFF CACHE BOOL "Disable TBB for papilo")
 
 FetchContent_MakeAvailable(papilo)
 

From c1ff4ea011705cceb64234efcc0b99cab7cc9f60 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Fri, 10 Oct 2025 10:32:38 -0500
Subject: [PATCH 13/27] push changes

---
 ci/test_python.sh                                              | 3 +++
 ci/test_wheel_cuopt_server.sh                                  | 3 +++
 python/cuopt/cuopt/tests/linear_programming/test_python_API.py | 1 +
 3 files changed, 7 insertions(+)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index 0d3d1e5963..7d504f4738 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -60,6 +60,9 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
+# Due to a race condition, UCX might not be able to clean up properly in certain cases, so we set the number of threads to 1
+export OMP_NUM_THREADS=1
+
 rapids-logger "Test cuopt_cli"
 timeout 10m bash ./python/libcuopt/libcuopt/tests/test_cli.sh
 
diff --git a/ci/test_wheel_cuopt_server.sh b/ci/test_wheel_cuopt_server.sh
index 5f0b874ba3..de4a52f479 100755
--- a/ci/test_wheel_cuopt_server.sh
+++ b/ci/test_wheel_cuopt_server.sh
@@ -37,4 +37,7 @@ rapids-pip-retry install \
 ./datasets/linear_programming/download_pdlp_test_dataset.sh
 ./datasets/mip/download_miplib_test_dataset.sh
 
+# Due to a race condition, UCX might not be able to clean up properly in certain cases, so we set the number of threads to 1
+export OMP_NUM_THREADS=1
+
 RAPIDS_DATASET_ROOT_DIR=./datasets timeout 30m python -m pytest --verbose --capture=no ./python/cuopt_server/cuopt_server/tests/
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
index c7ef8b99bf..42059bf3d3 100644
--- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
+++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
@@ -406,6 +406,7 @@ def test_warm_start():
 
     settings = SolverSettings()
     settings.set_parameter(CUOPT_PDLP_SOLVER_MODE, PDLPSolverMode.Stable2)
+    settings.set_parameter(CUOPT_METHOD, SolverMethod.PDLP)
     settings.set_optimality_tolerance(1e-3)
     settings.set_parameter(CUOPT_INFEASIBILITY_DETECTION, False)
 

From ff5cfa129df295565b3dc7797362e7f9e0f854d1 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Fri, 10 Oct 2025 12:58:36 -0500
Subject: [PATCH 14/27] fix test in server as well

---
 python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
index be67894be1..cfc30fa1c8 100644
--- a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
+++ b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
@@ -22,6 +22,7 @@
 from cuopt.linear_programming.solver.solver_parameters import (
     CUOPT_INFEASIBILITY_DETECTION,
     CUOPT_PDLP_SOLVER_MODE,
+    CUOPT_METHOD,
 )
 from cuopt.linear_programming.solver_settings import PDLPSolverMode
 
@@ -45,6 +46,7 @@ def test_warmstart(cuoptproc):  # noqa
     settings.set_optimality_tolerance(1e-4)
     settings.set_parameter(CUOPT_INFEASIBILITY_DETECTION, False)
     settings.set_parameter(CUOPT_PDLP_SOLVER_MODE, PDLPSolverMode.Stable2)
+    settings.set_parameter(CUOPT_METHOD, SolverMethod.PDLP)
     data["solver_config"] = settings.toDict()
 
     headers = {"CLIENT-VERSION": "custom"}

From 07da5e4f645c046c381e0781b6a6aed143deabb0 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Fri, 10 Oct 2025 13:00:34 -0500
Subject: [PATCH 15/27] fix test

---
 .../cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py  | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
index cfc30fa1c8..ff8f4a841b 100644
--- a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
+++ b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
@@ -21,10 +21,10 @@
 from cuopt.linear_programming import solver_settings
 from cuopt.linear_programming.solver.solver_parameters import (
     CUOPT_INFEASIBILITY_DETECTION,
-    CUOPT_PDLP_SOLVER_MODE,
     CUOPT_METHOD,
+    CUOPT_PDLP_SOLVER_MODE,
 )
-from cuopt.linear_programming.solver_settings import PDLPSolverMode
+from cuopt.linear_programming.solver_settings import Method, PDLPSolverMode
 
 from cuopt_server.tests.utils.utils import cuoptproc  # noqa
 from cuopt_server.tests.utils.utils import (
@@ -46,7 +46,7 @@ def test_warmstart(cuoptproc):  # noqa
     settings.set_optimality_tolerance(1e-4)
     settings.set_parameter(CUOPT_INFEASIBILITY_DETECTION, False)
     settings.set_parameter(CUOPT_PDLP_SOLVER_MODE, PDLPSolverMode.Stable2)
-    settings.set_parameter(CUOPT_METHOD, SolverMethod.PDLP)
+    settings.set_parameter(CUOPT_METHOD, Method.PDLP)
     data["solver_config"] = settings.toDict()
 
     headers = {"CLIENT-VERSION": "custom"}

From ae473c96a8f28dcdbe96de057f200cc56ad5048d Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Fri, 10 Oct 2025 13:12:01 -0500
Subject: [PATCH 16/27] fix

---
 .../cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
index ff8f4a841b..242b34545b 100644
--- a/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
+++ b/python/cuopt_server/cuopt_server/tests/test_pdlp_warmstart.py
@@ -24,7 +24,10 @@
     CUOPT_METHOD,
     CUOPT_PDLP_SOLVER_MODE,
 )
-from cuopt.linear_programming.solver_settings import Method, PDLPSolverMode
+from cuopt.linear_programming.solver_settings import (
+    PDLPSolverMode,
+    SolverMethod,
+)
 
 from cuopt_server.tests.utils.utils import cuoptproc  # noqa
 from cuopt_server.tests.utils.utils import (
@@ -46,7 +49,7 @@ def test_warmstart(cuoptproc):  # noqa
     settings.set_optimality_tolerance(1e-4)
     settings.set_parameter(CUOPT_INFEASIBILITY_DETECTION, False)
     settings.set_parameter(CUOPT_PDLP_SOLVER_MODE, PDLPSolverMode.Stable2)
-    settings.set_parameter(CUOPT_METHOD, Method.PDLP)
+    settings.set_parameter(CUOPT_METHOD, SolverMethod.PDLP)
     data["solver_config"] = settings.toDict()
 
     headers = {"CLIENT-VERSION": "custom"}

From ecc2566aa108d5ae49d6bf5e045559e042654676 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Sat, 11 Oct 2025 00:29:47 +0200
Subject: [PATCH 17/27] [BUG] Fixed starting variable bounds for diving (#474)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR also fixes the starting bounds for the diving subtrees. As these nodes are detached from the main B&B tree, the bounds from the parent were lost. Now, the bounds are also kept when inserting the nodes in the diving queue.

Authors:
  - Nicolas L. Guidotti (https://github.com/nguidotti)

Approvers:
  - Akif ÇÖRDÜK (https://github.com/akifcorduk)
  - Chris Maes (https://github.com/chris-maes)

URL: https://github.com/NVIDIA/cuopt/pull/474
---
 cpp/src/dual_simplex/branch_and_bound.cpp | 22 +++++++++++-----------
 cpp/src/dual_simplex/branch_and_bound.hpp |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 78acd07426..2ce3ee0b4e 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -562,8 +562,8 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
 {
   f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
 
-  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
   lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
+  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
   assert(leaf_vstatus.size() == leaf_problem.num_cols);
 
   std::vector<bool> bounds_changed(leaf_problem.num_cols, false);
@@ -572,7 +572,6 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
   // two vectors at each node and potentially cause memory issues
   node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
 
-  std::vector<f_t> leaf_edge_norms      = edge_norms_;  // = node.steepest_edge_norms;
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
   lp_settings.cut_off    = upper_bound + settings_.dual_tol;
@@ -587,8 +586,9 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node(search_tree_t<i_t, f_t>&
   dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
 
   if (feasible) {
-    i_t node_iter     = 0;
-    f_t lp_start_time = tic();
+    i_t node_iter                    = 0;
+    f_t lp_start_time                = tic();
+    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
 
     lp_status = dual_phase2(2,
                             0,
@@ -778,7 +778,7 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(search_tree_t<i_t, f_t>*
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
+void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
                                                    search_tree_t<i_t, f_t>& search_tree,
                                                    mip_node_t<i_t, f_t>* start_node,
                                                    lp_problem_t<i_t, f_t>& leaf_problem,
@@ -788,7 +788,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
   stack.push_front(start_node);
 
   while (stack.size() > 0 && status_ == mip_exploration_status_t::RUNNING) {
-    if (id == 0) { repair_heuristic_solutions(); }
+    if (task_id == 0) { repair_heuristic_solutions(); }
 
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
@@ -804,10 +804,10 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
     // - The current node and its siblings uses the lower bound of the parent before solving the LP
     // relaxation
     // - The lower bound of the parent is lower or equal to its children
-    assert(id < local_lower_bounds_.size());
-    local_lower_bounds_[id] = lower_bound;
-    i_t nodes_explored      = (++stats_.nodes_explored);
-    i_t nodes_unexplored    = (--stats_.nodes_unexplored);
+    assert(task_id < local_lower_bounds_.size());
+    local_lower_bounds_[task_id] = lower_bound;
+    i_t nodes_explored           = (++stats_.nodes_explored);
+    i_t nodes_unexplored         = (--stats_.nodes_unexplored);
     stats_.nodes_since_last_log++;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
@@ -818,7 +818,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t id,
 
     f_t now = toc(stats_.start_time);
 
-    if (id == 0) {
+    if (task_id == 0) {
       f_t time_since_last_log = stats_.last_log == 0 ? 1.0 : toc(stats_.last_log);
 
       if (((stats_.nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 5453e8b424..23fb9eb7f8 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -235,7 +235,7 @@ class branch_and_bound_t {
                            i_t initial_heap_size);
 
   // Explore the search tree using the best-first search with plunging strategy.
-  void explore_subtree(i_t id,
+  void explore_subtree(i_t task_id,
                        search_tree_t<i_t, f_t>& search_tree,
                        mip_node_t<i_t, f_t>* start_node,
                        lp_problem_t<i_t, f_t>& leaf_problem,

From 36b96cd3399702e87998a558ac3896417e957f29 Mon Sep 17 00:00:00 2001
From: Hugo Linsenmaier <hlinsenmaier@gmail.com>
Date: Fri, 10 Oct 2025 16:15:29 -0700
Subject: [PATCH 18/27] Gracefully handle OOM in Barrier (#483)

cuDSS does not throw on OOM, and even though we check its return status, a sticky CUDA error was left behind that made every subsequent CUDA API call fail. As a workaround, throw if `cudaMallocAsync` fails in the device memory handler and catch that error in the solve method.

Authors:
  - Hugo Linsenmaier (https://github.com/hlinsen)

Approvers:
  - Chris Maes (https://github.com/chris-maes)

URL: https://github.com/NVIDIA/cuopt/pull/483
---
 cpp/src/dual_simplex/barrier.cu          | 580 ++++++++++++-----------
 cpp/src/dual_simplex/sparse_cholesky.cuh |   8 +-
 2 files changed, 300 insertions(+), 288 deletions(-)

diff --git a/cpp/src/dual_simplex/barrier.cu b/cpp/src/dual_simplex/barrier.cu
index 2e9d64f071..0b952e2dd7 100644
--- a/cpp/src/dual_simplex/barrier.cu
+++ b/cpp/src/dual_simplex/barrier.cu
@@ -3242,217 +3242,53 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
                                               const barrier_solver_settings_t<i_t, f_t>& options,
                                               lp_solution_t<i_t, f_t>& solution)
 {
-  raft::common::nvtx::range fun_scope("Barrier: solve");
+  try {
+    raft::common::nvtx::range fun_scope("Barrier: solve");
 
-  i_t n = lp.num_cols;
-  i_t m = lp.num_rows;
+    i_t n = lp.num_cols;
+    i_t m = lp.num_rows;
 
-  solution.resize(m, n);
-  settings.log.printf(
-    "Barrier solver: %d constraints, %d variables, %ld nonzeros\n", m, n, lp.A.col_start[n]);
-  settings.log.printf("\n");
-
-  // Compute the number of free variables
-  i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2;
-  if (num_free_variables > 0) {
-    settings.log.printf("Free variables              : %d\n", num_free_variables);
-  }
-
-  // Compute the number of upper bounds
-  i_t num_upper_bounds = 0;
-  for (i_t j = 0; j < n; j++) {
-    if (lp.upper[j] < inf) { num_upper_bounds++; }
-  }
-
-  iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, settings);
-  if (data.symbolic_status != 0) {
-    settings.log.printf("Error in symbolic analysis\n");
-    return lp_status_t::NUMERICAL_ISSUES;
-  }
-  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-    settings.log.printf("Barrier solver halted\n");
-    return lp_status_t::CONCURRENT_LIMIT;
-  }
-  data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_);
-  data.cusparse_r1_            = data.cusparse_view_.create_vector(data.d_r1_);
-  data.cusparse_tmp4_          = data.cusparse_view_.create_vector(data.d_tmp4_);
-  data.cusparse_h_             = data.cusparse_view_.create_vector(data.d_h_);
-  data.cusparse_dx_residual_   = data.cusparse_view_.create_vector(data.d_dx_residual_);
-  data.cusparse_u_             = data.cusparse_view_.create_vector(data.d_u_);
-  data.cusparse_y_residual_    = data.cusparse_view_.create_vector(data.d_y_residual_);
-  data.restrict_u_.resize(num_upper_bounds);
-
-  if (toc(start_time) > settings.time_limit) {
-    settings.log.printf("Barrier time limit exceeded\n");
-    return lp_status_t::TIME_LIMIT;
-  }
-
-  i_t initial_status = initial_point(data);
-  if (toc(start_time) > settings.time_limit) {
-    settings.log.printf("Barrier time limit exceeded\n");
-    return lp_status_t::TIME_LIMIT;
-  }
-  if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-    settings.log.printf("Barrier solver halted\n");
-    return lp_status_t::CONCURRENT_LIMIT;
-  }
-  if (initial_status != 0) {
-    settings.log.printf("Unable to compute initial point\n");
-    return lp_status_t::NUMERICAL_ISSUES;
-  }
-  compute_residuals<PinnedHostAllocator<f_t>>(data.w, data.x, data.y, data.v, data.z, data);
-
-  f_t primal_residual_norm = std::max(vector_norm_inf<i_t, f_t>(data.primal_residual, stream_view_),
-                                      vector_norm_inf<i_t, f_t>(data.bound_residual, stream_view_));
-  f_t dual_residual_norm   = vector_norm_inf<i_t, f_t>(data.dual_residual, stream_view_);
-  f_t complementarity_residual_norm =
-    std::max(vector_norm_inf<i_t, f_t>(data.complementarity_xz_residual, stream_view_),
-             vector_norm_inf<i_t, f_t>(data.complementarity_wv_residual, stream_view_));
-  f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) /
-           (static_cast<f_t>(n) + static_cast<f_t>(num_upper_bounds));
-
-  f_t norm_b = vector_norm_inf<i_t, f_t>(data.b, stream_view_);
-  f_t norm_c = vector_norm_inf<i_t, f_t>(data.c, stream_view_);
-
-  f_t primal_objective = data.c.inner_product(data.x);
-
-  f_t relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
-  f_t relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
-  f_t relative_complementarity_residual =
-    complementarity_residual_norm / (1.0 + std::abs(primal_objective));
-
-  dense_vector_t<i_t, f_t> upper(lp.upper);
-  data.gather_upper_bounds(upper, data.restrict_u_);
-  f_t dual_objective = data.b.inner_product(data.y) - data.restrict_u_.inner_product(data.v);
-
-  i_t iter = 0;
-  settings.log.printf("\n");
-  settings.log.printf(
-    "                  Objective                         Infeasibility        Time\n");
-  settings.log.printf(
-    "Iter   Primal              Dual                Primal   Dual    Compl.   Elapsed\n");
-  float64_t elapsed_time = toc(start_time);
-  settings.log.printf("%3d   %+.12e %+.12e %.2e %.2e %.2e %.1f\n",
-                      iter,
-                      primal_objective,
-                      dual_objective,
-                      primal_residual_norm,
-                      dual_residual_norm,
-                      complementarity_residual_norm,
-                      elapsed_time);
-
-  bool converged = primal_residual_norm < settings.barrier_relative_feasibility_tol &&
-                   dual_residual_norm < settings.barrier_relative_optimality_tol &&
-                   complementarity_residual_norm < settings.barrier_relative_complementarity_tol;
-
-  data.d_complementarity_xz_residual_.resize(data.complementarity_xz_residual.size(), stream_view_);
-  data.d_complementarity_wv_residual_.resize(data.complementarity_wv_residual.size(), stream_view_);
-  data.d_complementarity_xz_rhs_.resize(data.complementarity_xz_rhs.size(), stream_view_);
-  data.d_complementarity_wv_rhs_.resize(data.complementarity_wv_rhs.size(), stream_view_);
-  raft::copy(data.d_complementarity_xz_residual_.data(),
-             data.complementarity_xz_residual.data(),
-             data.complementarity_xz_residual.size(),
-             stream_view_);
-  raft::copy(data.d_complementarity_wv_residual_.data(),
-             data.complementarity_wv_residual.data(),
-             data.complementarity_wv_residual.size(),
-             stream_view_);
-  raft::copy(data.d_complementarity_xz_rhs_.data(),
-             data.complementarity_xz_rhs.data(),
-             data.complementarity_xz_rhs.size(),
-             stream_view_);
-  raft::copy(data.d_complementarity_wv_rhs_.data(),
-             data.complementarity_wv_rhs.data(),
-             data.complementarity_wv_rhs.size(),
-             stream_view_);
-
-  data.w_save = data.w;
-  data.x_save = data.x;
-  data.y_save = data.y;
-  data.v_save = data.v;
-  data.z_save = data.z;
-
-  const i_t iteration_limit = settings.iteration_limit;
-
-  while (iter < iteration_limit) {
-    raft::common::nvtx::range fun_scope("Barrier: iteration");
+    solution.resize(m, n);
+    settings.log.printf(
+      "Barrier solver: %d constraints, %d variables, %ld nonzeros\n", m, n, lp.A.col_start[n]);
+    settings.log.printf("\n");
 
-    if (toc(start_time) > settings.time_limit) {
-      settings.log.printf("Barrier time limit exceeded\n");
-      return lp_status_t::TIME_LIMIT;
-    }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      settings.log.printf("Barrier solver halted\n");
-      return lp_status_t::CONCURRENT_LIMIT;
+    // Compute the number of free variables
+    i_t num_free_variables = presolve_info.free_variable_pairs.size() / 2;
+    if (num_free_variables > 0) {
+      settings.log.printf("Free variables              : %d\n", num_free_variables);
     }
 
-    // Compute the affine step
-    compute_affine_rhs(data);
-    f_t max_affine_residual = 0.0;
+    // Compute the number of upper bounds
+    i_t num_upper_bounds = 0;
+    for (i_t j = 0; j < n; j++) {
+      if (lp.upper[j] < inf) { num_upper_bounds++; }
+    }
 
-    i_t status = gpu_compute_search_direction(
-      data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual);
+    iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, settings);
+    if (data.symbolic_status != 0) {
+      settings.log.printf("Error in symbolic analysis\n");
+      return lp_status_t::NUMERICAL_ISSUES;
+    }
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
       settings.log.printf("Barrier solver halted\n");
       return lp_status_t::CONCURRENT_LIMIT;
     }
-    // Sync to make sure all the async copies to host done inside are finished
-    if (use_gpu) RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+    data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_);
+    data.cusparse_r1_            = data.cusparse_view_.create_vector(data.d_r1_);
+    data.cusparse_tmp4_          = data.cusparse_view_.create_vector(data.d_tmp4_);
+    data.cusparse_h_             = data.cusparse_view_.create_vector(data.d_h_);
+    data.cusparse_dx_residual_   = data.cusparse_view_.create_vector(data.d_dx_residual_);
+    data.cusparse_u_             = data.cusparse_view_.create_vector(data.d_u_);
+    data.cusparse_y_residual_    = data.cusparse_view_.create_vector(data.d_y_residual_);
+    data.restrict_u_.resize(num_upper_bounds);
 
-    if (status < 0) {
-      return check_for_suboptimal_solution(options,
-                                           data,
-                                           start_time,
-                                           iter,
-                                           primal_objective,
-                                           primal_residual_norm,
-                                           dual_residual_norm,
-                                           complementarity_residual_norm,
-                                           relative_primal_residual,
-                                           relative_dual_residual,
-                                           relative_complementarity_residual,
-                                           solution);
-    }
     if (toc(start_time) > settings.time_limit) {
       settings.log.printf("Barrier time limit exceeded\n");
       return lp_status_t::TIME_LIMIT;
     }
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      settings.log.printf("Barrier solver halted\n");
-      return lp_status_t::CONCURRENT_LIMIT;
-    }
-
-    f_t mu_aff, sigma, new_mu;
-    compute_target_mu(data, mu, mu_aff, sigma, new_mu);
 
-    compute_cc_rhs(data, new_mu);
-
-    f_t max_corrector_residual = 0.0;
-
-    status = gpu_compute_search_direction(
-      data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual);
-    if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
-      settings.log.printf("Barrier solver halted\n");
-      return lp_status_t::CONCURRENT_LIMIT;
-    }
-    // Sync to make sure all the async copies to host done inside are finished
-    if (use_gpu) RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
-    if (status < 0) {
-      return check_for_suboptimal_solution(options,
-                                           data,
-                                           start_time,
-                                           iter,
-                                           primal_objective,
-                                           primal_residual_norm,
-                                           dual_residual_norm,
-                                           complementarity_residual_norm,
-                                           relative_primal_residual,
-                                           relative_dual_residual,
-                                           relative_complementarity_residual,
-                                           solution);
-    }
-    data.has_factorization = false;
-    data.has_solve_info    = false;
+    i_t initial_status = initial_point(data);
     if (toc(start_time) > settings.time_limit) {
       settings.log.printf("Barrier time limit exceeded\n");
       return lp_status_t::TIME_LIMIT;
@@ -3461,112 +3297,284 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
       settings.log.printf("Barrier solver halted\n");
       return lp_status_t::CONCURRENT_LIMIT;
     }
+    if (initial_status != 0) {
+      settings.log.printf("Unable to compute initial point\n");
+      return lp_status_t::NUMERICAL_ISSUES;
+    }
+    compute_residuals<PinnedHostAllocator<f_t>>(data.w, data.x, data.y, data.v, data.z, data);
 
-    compute_final_direction(data);
-    f_t step_primal, step_dual;
-    compute_primal_dual_step_length(data, options.step_scale, step_primal, step_dual);
-    compute_next_iterate(data, options.step_scale, step_primal, step_dual);
-
-    compute_residual_norms(
-      data, primal_residual_norm, dual_residual_norm, complementarity_residual_norm);
+    f_t primal_residual_norm =
+      std::max(vector_norm_inf<i_t, f_t>(data.primal_residual, stream_view_),
+               vector_norm_inf<i_t, f_t>(data.bound_residual, stream_view_));
+    f_t dual_residual_norm = vector_norm_inf<i_t, f_t>(data.dual_residual, stream_view_);
+    f_t complementarity_residual_norm =
+      std::max(vector_norm_inf<i_t, f_t>(data.complementarity_xz_residual, stream_view_),
+               vector_norm_inf<i_t, f_t>(data.complementarity_wv_residual, stream_view_));
+    f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) /
+             (static_cast<f_t>(n) + static_cast<f_t>(num_upper_bounds));
 
-    compute_mu(data, mu);
+    f_t norm_b = vector_norm_inf<i_t, f_t>(data.b, stream_view_);
+    f_t norm_c = vector_norm_inf<i_t, f_t>(data.c, stream_view_);
 
-    compute_primal_dual_objective(data, primal_objective, dual_objective);
+    f_t primal_objective = data.c.inner_product(data.x);
 
-    relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
-    relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
-    relative_complementarity_residual =
+    f_t relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
+    f_t relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
+    f_t relative_complementarity_residual =
       complementarity_residual_norm / (1.0 + std::abs(primal_objective));
 
-    if (relative_primal_residual < settings.barrier_relaxed_feasibility_tol &&
-        relative_dual_residual < settings.barrier_relaxed_optimality_tol &&
-        relative_complementarity_residual < settings.barrier_relaxed_complementarity_tol) {
-      if (relative_primal_residual < data.relative_primal_residual_save &&
-          relative_dual_residual < data.relative_dual_residual_save &&
-          relative_complementarity_residual < data.relative_complementarity_residual_save) {
-        settings.log.debug(
-          "Saving solution: feasibility %.2e (%.2e), optimality %.2e (%.2e), complementarity "
-          "%.2e (%.2e)\n",
-          relative_primal_residual,
-          primal_residual_norm,
-          relative_dual_residual,
-          dual_residual_norm,
-          relative_complementarity_residual,
-          complementarity_residual_norm);
-        data.w_save                                 = data.w;
-        data.x_save                                 = data.x;
-        data.y_save                                 = data.y;
-        data.v_save                                 = data.v;
-        data.z_save                                 = data.z;
-        data.relative_primal_residual_save          = relative_primal_residual;
-        data.relative_dual_residual_save            = relative_dual_residual;
-        data.relative_complementarity_residual_save = relative_complementarity_residual;
-        data.primal_residual_norm_save              = primal_residual_norm;
-        data.dual_residual_norm_save                = dual_residual_norm;
-        data.complementarity_residual_norm_save     = complementarity_residual_norm;
-      }
-    }
-
-    iter++;
-    elapsed_time = toc(start_time);
-
-    if (primal_objective != primal_objective || dual_objective != dual_objective) {
-      settings.log.printf("Numerical error in objective\n");
-      return lp_status_t::NUMERICAL_ISSUES;
-    }
+    dense_vector_t<i_t, f_t> upper(lp.upper);
+    data.gather_upper_bounds(upper, data.restrict_u_);
+    f_t dual_objective = data.b.inner_product(data.y) - data.restrict_u_.inner_product(data.v);
 
+    i_t iter = 0;
+    settings.log.printf("\n");
+    settings.log.printf(
+      "                  Objective                         Infeasibility        Time\n");
+    settings.log.printf(
+      "Iter   Primal              Dual                Primal   Dual    Compl.   Elapsed\n");
+    float64_t elapsed_time = toc(start_time);
     settings.log.printf("%3d   %+.12e %+.12e %.2e %.2e %.2e %.1f\n",
                         iter,
-                        compute_user_objective(lp, primal_objective),
-                        compute_user_objective(lp, dual_objective),
-                        relative_primal_residual,
-                        relative_dual_residual,
-                        relative_complementarity_residual,
+                        primal_objective,
+                        dual_objective,
+                        primal_residual_norm,
+                        dual_residual_norm,
+                        complementarity_residual_norm,
                         elapsed_time);
 
-    bool primal_feasible = relative_primal_residual < settings.barrier_relative_feasibility_tol;
-    bool dual_feasible   = relative_dual_residual < settings.barrier_relative_optimality_tol;
-    bool small_gap =
-      relative_complementarity_residual < settings.barrier_relative_complementarity_tol;
+    bool converged = primal_residual_norm < settings.barrier_relative_feasibility_tol &&
+                     dual_residual_norm < settings.barrier_relative_optimality_tol &&
+                     complementarity_residual_norm < settings.barrier_relative_complementarity_tol;
+
+    data.d_complementarity_xz_residual_.resize(data.complementarity_xz_residual.size(),
+                                               stream_view_);
+    data.d_complementarity_wv_residual_.resize(data.complementarity_wv_residual.size(),
+                                               stream_view_);
+    data.d_complementarity_xz_rhs_.resize(data.complementarity_xz_rhs.size(), stream_view_);
+    data.d_complementarity_wv_rhs_.resize(data.complementarity_wv_rhs.size(), stream_view_);
+    raft::copy(data.d_complementarity_xz_residual_.data(),
+               data.complementarity_xz_residual.data(),
+               data.complementarity_xz_residual.size(),
+               stream_view_);
+    raft::copy(data.d_complementarity_wv_residual_.data(),
+               data.complementarity_wv_residual.data(),
+               data.complementarity_wv_residual.size(),
+               stream_view_);
+    raft::copy(data.d_complementarity_xz_rhs_.data(),
+               data.complementarity_xz_rhs.data(),
+               data.complementarity_xz_rhs.size(),
+               stream_view_);
+    raft::copy(data.d_complementarity_wv_rhs_.data(),
+               data.complementarity_wv_rhs.data(),
+               data.complementarity_wv_rhs.size(),
+               stream_view_);
+
+    data.w_save = data.w;
+    data.x_save = data.x;
+    data.y_save = data.y;
+    data.v_save = data.v;
+    data.z_save = data.z;
 
-    converged = primal_feasible && dual_feasible && small_gap;
+    const i_t iteration_limit = settings.iteration_limit;
 
-    if (converged) {
-      settings.log.printf("\n");
-      settings.log.printf(
-        "Optimal solution found in %d iterations and %.2fs\n", iter, toc(start_time));
-      settings.log.printf("Objective %+.8e\n", compute_user_objective(lp, primal_objective));
-      settings.log.printf("Primal infeasibility (abs/rel): %8.2e/%8.2e\n",
-                          primal_residual_norm,
-                          relative_primal_residual);
-      settings.log.printf("Dual infeasibility   (abs/rel): %8.2e/%8.2e\n",
-                          dual_residual_norm,
-                          relative_dual_residual);
-      settings.log.printf("Complementarity gap  (abs/rel): %8.2e/%8.2e\n",
-                          complementarity_residual_norm,
-                          relative_complementarity_residual);
-      settings.log.printf("\n");
-      data.to_solution(lp,
-                       iter,
-                       primal_objective,
-                       compute_user_objective(lp, primal_objective),
-                       primal_residual_norm,
-                       dual_residual_norm,
-                       data.cusparse_view_,
-                       solution);
-      return lp_status_t::OPTIMAL;
+    while (iter < iteration_limit) {
+      raft::common::nvtx::range fun_scope("Barrier: iteration");
+
+      if (toc(start_time) > settings.time_limit) {
+        settings.log.printf("Barrier time limit exceeded\n");
+        return lp_status_t::TIME_LIMIT;
+      }
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        settings.log.printf("Barrier solver halted\n");
+        return lp_status_t::CONCURRENT_LIMIT;
+      }
+
+      // Compute the affine step
+      compute_affine_rhs(data);
+      f_t max_affine_residual = 0.0;
+
+      i_t status = gpu_compute_search_direction(
+        data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual);
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        settings.log.printf("Barrier solver halted\n");
+        return lp_status_t::CONCURRENT_LIMIT;
+      }
+      // Sync to make sure all the async copies to host done inside are finished
+      if (use_gpu) RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+      if (status < 0) {
+        return check_for_suboptimal_solution(options,
+                                             data,
+                                             start_time,
+                                             iter,
+                                             primal_objective,
+                                             primal_residual_norm,
+                                             dual_residual_norm,
+                                             complementarity_residual_norm,
+                                             relative_primal_residual,
+                                             relative_dual_residual,
+                                             relative_complementarity_residual,
+                                             solution);
+      }
+      if (toc(start_time) > settings.time_limit) {
+        settings.log.printf("Barrier time limit exceeded\n");
+        return lp_status_t::TIME_LIMIT;
+      }
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        settings.log.printf("Barrier solver halted\n");
+        return lp_status_t::CONCURRENT_LIMIT;
+      }
+
+      f_t mu_aff, sigma, new_mu;
+      compute_target_mu(data, mu, mu_aff, sigma, new_mu);
+
+      compute_cc_rhs(data, new_mu);
+
+      f_t max_corrector_residual = 0.0;
+
+      status = gpu_compute_search_direction(
+        data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual);
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        settings.log.printf("Barrier solver halted\n");
+        return lp_status_t::CONCURRENT_LIMIT;
+      }
+      // Sync to make sure all the async copies to host done inside are finished
+      if (use_gpu) RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+      if (status < 0) {
+        return check_for_suboptimal_solution(options,
+                                             data,
+                                             start_time,
+                                             iter,
+                                             primal_objective,
+                                             primal_residual_norm,
+                                             dual_residual_norm,
+                                             complementarity_residual_norm,
+                                             relative_primal_residual,
+                                             relative_dual_residual,
+                                             relative_complementarity_residual,
+                                             solution);
+      }
+      data.has_factorization = false;
+      data.has_solve_info    = false;
+      if (toc(start_time) > settings.time_limit) {
+        settings.log.printf("Barrier time limit exceeded\n");
+        return lp_status_t::TIME_LIMIT;
+      }
+      if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
+        settings.log.printf("Barrier solver halted\n");
+        return lp_status_t::CONCURRENT_LIMIT;
+      }
+
+      compute_final_direction(data);
+      f_t step_primal, step_dual;
+      compute_primal_dual_step_length(data, options.step_scale, step_primal, step_dual);
+      compute_next_iterate(data, options.step_scale, step_primal, step_dual);
+
+      compute_residual_norms(
+        data, primal_residual_norm, dual_residual_norm, complementarity_residual_norm);
+
+      compute_mu(data, mu);
+
+      compute_primal_dual_objective(data, primal_objective, dual_objective);
+
+      relative_primal_residual = primal_residual_norm / (1.0 + norm_b);
+      relative_dual_residual   = dual_residual_norm / (1.0 + norm_c);
+      relative_complementarity_residual =
+        complementarity_residual_norm / (1.0 + std::abs(primal_objective));
+
+      if (relative_primal_residual < settings.barrier_relaxed_feasibility_tol &&
+          relative_dual_residual < settings.barrier_relaxed_optimality_tol &&
+          relative_complementarity_residual < settings.barrier_relaxed_complementarity_tol) {
+        if (relative_primal_residual < data.relative_primal_residual_save &&
+            relative_dual_residual < data.relative_dual_residual_save &&
+            relative_complementarity_residual < data.relative_complementarity_residual_save) {
+          settings.log.debug(
+            "Saving solution: feasibility %.2e (%.2e), optimality %.2e (%.2e), complementarity "
+            "%.2e (%.2e)\n",
+            relative_primal_residual,
+            primal_residual_norm,
+            relative_dual_residual,
+            dual_residual_norm,
+            relative_complementarity_residual,
+            complementarity_residual_norm);
+          data.w_save                                 = data.w;
+          data.x_save                                 = data.x;
+          data.y_save                                 = data.y;
+          data.v_save                                 = data.v;
+          data.z_save                                 = data.z;
+          data.relative_primal_residual_save          = relative_primal_residual;
+          data.relative_dual_residual_save            = relative_dual_residual;
+          data.relative_complementarity_residual_save = relative_complementarity_residual;
+          data.primal_residual_norm_save              = primal_residual_norm;
+          data.dual_residual_norm_save                = dual_residual_norm;
+          data.complementarity_residual_norm_save     = complementarity_residual_norm;
+        }
+      }
+
+      iter++;
+      elapsed_time = toc(start_time);
+
+      if (primal_objective != primal_objective || dual_objective != dual_objective) {
+        settings.log.printf("Numerical error in objective\n");
+        return lp_status_t::NUMERICAL_ISSUES;
+      }
+
+      settings.log.printf("%3d   %+.12e %+.12e %.2e %.2e %.2e %.1f\n",
+                          iter,
+                          compute_user_objective(lp, primal_objective),
+                          compute_user_objective(lp, dual_objective),
+                          relative_primal_residual,
+                          relative_dual_residual,
+                          relative_complementarity_residual,
+                          elapsed_time);
+
+      bool primal_feasible = relative_primal_residual < settings.barrier_relative_feasibility_tol;
+      bool dual_feasible   = relative_dual_residual < settings.barrier_relative_optimality_tol;
+      bool small_gap =
+        relative_complementarity_residual < settings.barrier_relative_complementarity_tol;
+
+      converged = primal_feasible && dual_feasible && small_gap;
+
+      if (converged) {
+        settings.log.printf("\n");
+        settings.log.printf(
+          "Optimal solution found in %d iterations and %.2fs\n", iter, toc(start_time));
+        settings.log.printf("Objective %+.8e\n", compute_user_objective(lp, primal_objective));
+        settings.log.printf("Primal infeasibility (abs/rel): %8.2e/%8.2e\n",
+                            primal_residual_norm,
+                            relative_primal_residual);
+        settings.log.printf("Dual infeasibility   (abs/rel): %8.2e/%8.2e\n",
+                            dual_residual_norm,
+                            relative_dual_residual);
+        settings.log.printf("Complementarity gap  (abs/rel): %8.2e/%8.2e\n",
+                            complementarity_residual_norm,
+                            relative_complementarity_residual);
+        settings.log.printf("\n");
+        data.to_solution(lp,
+                         iter,
+                         primal_objective,
+                         compute_user_objective(lp, primal_objective),
+                         primal_residual_norm,
+                         dual_residual_norm,
+                         data.cusparse_view_,
+                         solution);
+        return lp_status_t::OPTIMAL;
+      }
     }
+    data.to_solution(lp,
+                     iter,
+                     primal_objective,
+                     compute_user_objective(lp, primal_objective),
+                     vector_norm2<i_t, f_t>(data.primal_residual),
+                     vector_norm2<i_t, f_t>(data.dual_residual),
+                     data.cusparse_view_,
+                     solution);
+    return lp_status_t::ITERATION_LIMIT;
+  } catch (const raft::cuda_error& e) {
+    settings.log.debug("Error in barrier_solver_t: %s\n", e.what());
+    return lp_status_t::NUMERICAL_ISSUES;
   }
-  data.to_solution(lp,
-                   iter,
-                   primal_objective,
-                   compute_user_objective(lp, primal_objective),
-                   vector_norm2<i_t, f_t>(data.primal_residual),
-                   vector_norm2<i_t, f_t>(data.dual_residual),
-                   data.cusparse_view_,
-                   solution);
-  return lp_status_t::ITERATION_LIMIT;
 }
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
diff --git a/cpp/src/dual_simplex/sparse_cholesky.cuh b/cpp/src/dual_simplex/sparse_cholesky.cuh
index 4ea1609e31..7e792998fd 100644
--- a/cpp/src/dual_simplex/sparse_cholesky.cuh
+++ b/cpp/src/dual_simplex/sparse_cholesky.cuh
@@ -100,13 +100,17 @@ class sparse_cholesky_base_t {
 template <typename mem_pool_t>
 int cudss_device_alloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
 {
-  return cudaMallocAsync(ptr, size, stream);
+  int status = cudaMallocAsync(ptr, size, stream);
+  if (status != cudaSuccess) { throw raft::cuda_error("Cuda error in cudss_device_alloc"); }
+  return status;
 }
 
 template <typename mem_pool_t>
 int cudss_device_dealloc(void* ctx, void* ptr, size_t size, cudaStream_t stream)
 {
-  return cudaFreeAsync(ptr, stream);
+  int status = cudaFreeAsync(ptr, stream);
+  if (status != cudaSuccess) { throw raft::cuda_error("Cuda error in cudss_device_dealloc"); }
+  return status;
 }
 
 template <class T>

From 91a19f82b1974f9e375021b932953e5d0d7c5609 Mon Sep 17 00:00:00 2001
From: Chris Maes <cmaes@nvidia.com>
Date: Sun, 12 Oct 2025 16:35:00 -0700
Subject: [PATCH 19/27] Fix issue with barrier incorrectly returning numerical
 in concurrent (#484)

@tmckayus reported that in concurrent mode, he was seeing lots of barrier numerical status.

This was due to incorrectly setting the status to NUMERICAL_ISSUES when barrier was stopped by PDLP or concurrent.

This PR fixes the issue by checking for the concurrent limit first.

Logs are also changed for consistency.

Authors:
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Hugo Linsenmaier (https://github.com/hlinsen)

URL: https://github.com/NVIDIA/cuopt/pull/484
---
 cpp/src/dual_simplex/barrier.cu            | 9 +++++----
 cpp/src/mip/diversity/diversity_manager.cu | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/dual_simplex/barrier.cu b/cpp/src/dual_simplex/barrier.cu
index 0b952e2dd7..47f1218f34 100644
--- a/cpp/src/dual_simplex/barrier.cu
+++ b/cpp/src/dual_simplex/barrier.cu
@@ -3266,14 +3266,15 @@ lp_status_t barrier_solver_t<i_t, f_t>::solve(f_t start_time,
     }
 
     iteration_data_t<i_t, f_t> data(lp, num_upper_bounds, settings);
-    if (data.symbolic_status != 0) {
-      settings.log.printf("Error in symbolic analysis\n");
-      return lp_status_t::NUMERICAL_ISSUES;
-    }
     if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) {
       settings.log.printf("Barrier solver halted\n");
       return lp_status_t::CONCURRENT_LIMIT;
     }
+    if (data.symbolic_status != 0) {
+      settings.log.printf("Error in symbolic analysis\n");
+      return lp_status_t::NUMERICAL_ISSUES;
+    }
+
     data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_);
     data.cusparse_r1_            = data.cusparse_view_.create_vector(data.d_r1_);
     data.cusparse_tmp4_          = data.cusparse_view_.create_vector(data.d_tmp4_);
diff --git a/cpp/src/mip/diversity/diversity_manager.cu b/cpp/src/mip/diversity/diversity_manager.cu
index ce96a317a5..b11f98e10d 100644
--- a/cpp/src/mip/diversity/diversity_manager.cu
+++ b/cpp/src/mip/diversity/diversity_manager.cu
@@ -245,7 +245,7 @@ bool diversity_manager_t<i_t, f_t>::run_presolve(f_t time_limit)
   lp_dual_optimal_solution.resize(problem_ptr->n_constraints,
                                   problem_ptr->handle_ptr->get_stream());
   problem_ptr->handle_ptr->sync_stream();
-  CUOPT_LOG_INFO("After trivial presolve #constraints %d #variables %d objective offset %f.",
+  CUOPT_LOG_INFO("After trivial presolve: %d constraints, %d variables, objective offset %f.",
                  problem_ptr->n_constraints,
                  problem_ptr->n_variables,
                  problem_ptr->presolve_data.objective_offset);

From 6d11a001b2239969ea21b0491c2f505a04dda5cf Mon Sep 17 00:00:00 2001
From: Cindy Wilkinson <cwilkinson@nvidia.com>
Date: Mon, 13 Oct 2025 12:37:30 -0400
Subject: [PATCH 20/27] Release notes for v25.10 (#432)

## Issue

Authors:
  - Cindy Wilkinson (https://github.com/cwilkinson76)
  - Ramakrishnap (https://github.com/rgsl888prabhu)
  - Chris Maes (https://github.com/chris-maes)

Approvers:
  - Trevor McKay (https://github.com/tmckayus)

URL: https://github.com/NVIDIA/cuopt/pull/432
---
 docs/cuopt/source/release-notes.rst | 87 +++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/docs/cuopt/source/release-notes.rst b/docs/cuopt/source/release-notes.rst
index 6443782d87..e01866f7c2 100644
--- a/docs/cuopt/source/release-notes.rst
+++ b/docs/cuopt/source/release-notes.rst
@@ -2,6 +2,93 @@
 Release Notes
 =====================
 
+====================
+Release Notes 25.10
+====================
+
+New Features (25.10)
+--------------------
+
+- New barrier method for solving LPs. Uses cuDSS for sparse Cholesky / LDL^T factorization.
+- Concurrent mode for LPs now uses PDLP, dual simplex, and barrier
+- New PDLP solver mode Stable3.
+- MIP presolve using Papilo (enabled by default). LP presolve using Papilo (optional).
+- Parallel branch and bound on the CPU: multiple best-first search and diving threads
+
+Breaking Changes (25.10)
+------------------------
+
+- New PDLP Solver mode Stable3 is the default
+
+
+Improvements (25.10)
+--------------------
+
+- Add setting "CUOPT_BARRIER_DUAL_INITIAL_POINT" to change the dual initial point used by barrier
+- CPUFJ for local search + simple rounding
+- FP as a local search
+- Sub-MIP recombiner and B&B global variable changes
+- Implement GF(2) presolve reduction
+- Implement node presolve
+- CUDA 13/12.9 support
+- Build and test with CUDA 13.0.0
+- Add read/write MPS and relaxation to python API
+- Decompression for ``.mps.gz`` and ``.mps.bz2`` files
+- Enable parallelism for root node presolve
+- Enable singleton stuffing and use Papilo default params
+- Make infeasibility checks consistent between the main solver and presolver
+- Add maximization support for root node presolve
+- Performance improvement in dual simplex's right-looking LU factorization
+- Fix high GPU memory usage
+- Print cuOpt version / machine info before solving
+- ``cuopt-server``: update dependencies (drop httpx, add psutil)
+- Add nightly testing of cuOpt jump interface
+- Compression tests are not run when compression is disabled
+- Add sanitizer build option; heuristic improvements: balance between generation and improvement heuristics
+- Loosen presolve tolerance and update timers to report cumulative presolve/solve time
+- Warn in case a dependent library is not found in libcuopt load
+- Combined variable bounds
+- Add Commit Sha to container for reference
+- Use GCC 14, consolidate dependency groups, update pre-commit hooks
+- Add support for nightly ``cuopt-examples`` notebook testing
+- Reduce hard-coded version usage in repo
+- Container to work on all different users including root
+- Changes to download LP and MILP datasets, and also disable cvxpy testing for 3.10
+- Faster engine compile time
+- Fix pre-commit for trailing whitespace and end of file
+- Merge update version and fix version format bugs
+- This library now supports the QPS format, which is an extension of the standard MPS format for representing quadratic programming problems.
+
+
+Bug Fixes (25.10)
+-----------------
+
+- Fix variables out of bounds caused by CPUFJ LP scratch thread
+- Fix the maybe-uninitialized compilation error
+- Fix linking errors in the test suite when disabling C adaptor
+- Compute relative gap with respect to user objectives
+- Add http timeout values for general, send, and receive to client
+- Fix bug in ``fixed_problem_computation``
+- Remove ``limiting_resource_adaptor`` leftover
+- Add support for cuda13 container and fix cuda13 lib issues in wheel
+- Return Infeasible if the user problem contains crossing bounds
+- Fix out-of-bound access in ``clean_up_infeasibilities``
+- Empty columns with infinite bounds are not removed
+
+
+Documentation (25.10)
+---------------------
+
+- Add tutorial video links to Decompression
+- Add warmstart, model update, update docs
+- Add docs on CI workflow inputs
+- Add name to drop-down for video link
+- Add video link to the docs and to the Readme
+- Add documentation on nightly installation commands
+- Fix version in version tab, change log, and fix typos
+- Doc update for container version update, and add ``nvidia-cuda-runtime`` as a dependency
+
+
 ====================
 Release Notes 25.08
 ====================

From d123cb9d15fde583f22c47a8fe980b3dbeecd3b1 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Mon, 13 Oct 2025 17:23:24 -0500
Subject: [PATCH 21/27] update

---
 docs/cuopt/source/versions1.json | 50 +++++++++++++++++---------------
 python/cuopt/pyproject.toml      |  4 +--
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/docs/cuopt/source/versions1.json b/docs/cuopt/source/versions1.json
index da50d0918f..96ede286d4 100644
--- a/docs/cuopt/source/versions1.json
+++ b/docs/cuopt/source/versions1.json
@@ -1,24 +1,28 @@
 [
-    {
-      "version": "25.10.00",
-      "url": "../25.10.00/",
-      "name": "latest",
-      "preferred": true
-    },
-    {
-      "version": "25.08.00",
-      "url": "../25.08.00/"
-    },
-    {
-      "version": "25.05",
-      "url": "../25.05/"
-    },
-    {
-      "version": "25.02",
-      "url": "../25.02/"
-    },
-    {
-      "version": "24.11",
-      "url": "../24.11/"
-    }
-  ]
+  {
+    "version": "25.12.00",
+    "url": "../25.12.00/",
+    "name": "latest",
+    "preferred": true
+  },
+  {
+    "version": "25.10.00",
+    "url": "../25.10.00/"
+  },
+  {
+    "version": "25.08.00",
+    "url": "../25.08.00/"
+  },
+  {
+    "version": "25.05",
+    "url": "../25.05/"
+  },
+  {
+    "version": "25.02",
+    "url": "../25.02/"
+  },
+  {
+    "version": "24.11",
+    "url": "../24.11/"
+  }
+]
\ No newline at end of file
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index c925b8fa7f..ee0b468ed9 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -41,8 +41,8 @@ dependencies = [
     "numba>=0.60.0",
     "numpy>=1.23.5,<3.0a0",
     "pandas>=2.0",
-    "pylibraft==25.10.*,>=0.0.0a0",
-    "rapids-dask-dependency==25.10.*,>=0.0.0a0",
+    "pylibraft==25.12.*,>=0.0.0a0",
+    "rapids-dask-dependency==25.12.*,>=0.0.0a0",
     "rapids-logger==0.1.*,>=0.0.0a0",
     "rmm==25.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From daa1ca27919780b33cfed7f5ea9ee4156df81d95 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Mon, 13 Oct 2025 21:37:36 -0500
Subject: [PATCH 22/27] fix style

---
 docs/cuopt/source/versions1.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cuopt/source/versions1.json b/docs/cuopt/source/versions1.json
index 96ede286d4..1c5b37349c 100644
--- a/docs/cuopt/source/versions1.json
+++ b/docs/cuopt/source/versions1.json
@@ -25,4 +25,4 @@
     "version": "24.11",
     "url": "../24.11/"
   }
-]
\ No newline at end of file
+]

From 873eaf76bbeb6bc9347ca5d2bcd4eeed44e2bd35 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Tue, 14 Oct 2025 10:51:05 -0500
Subject: [PATCH 23/27] update rapids logger

---
 conda/recipes/libcuopt/recipe.yaml | 4 ++--
 dependencies.yaml                  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda/recipes/libcuopt/recipe.yaml b/conda/recipes/libcuopt/recipe.yaml
index e18cea2a6a..fdfd752579 100644
--- a/conda/recipes/libcuopt/recipe.yaml
+++ b/conda/recipes/libcuopt/recipe.yaml
@@ -67,7 +67,7 @@ cache:
       - gtest ${{ gtest_version }}
       - libraft-headers =${{ dep_minor_version }}
       - librmm =${{ dep_minor_version }}
-      - rapids-logger =0.1
+      - rapids-logger =0.2
       - cuda-nvtx-dev
       - libcudss-dev >=0.7
       - libcurand-dev
@@ -148,7 +148,7 @@ outputs:
         - ${{ pin_subpackage("libmps-parser", exact=True) }}
         - boost
         - cuda-version =${{ cuda_version }}
-        - rapids-logger =0.1
+        - rapids-logger =0.2
         - librmm =${{ dep_minor_version }}
         - cuda-cudart-dev
         - libcublas
diff --git a/dependencies.yaml b/dependencies.yaml
index 8de34e0ce8..25a2fca6d4 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -573,7 +573,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - rapids-logger==0.1.*,>=0.0.0a0
+          - rapids-logger==0.2.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file

From 0b53a14bc489cfb1c773b41beb79d95fb841c205 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Tue, 14 Oct 2025 10:51:47 -0500
Subject: [PATCH 24/27] fix style

---
 conda/environments/all_cuda-129_arch-aarch64.yaml    | 2 +-
 conda/environments/all_cuda-129_arch-x86_64.yaml     | 2 +-
 conda/environments/all_cuda-130_arch-aarch64.yaml    | 2 +-
 conda/environments/all_cuda-130_arch-x86_64.yaml     | 2 +-
 python/cuopt/cuopt/linear_programming/pyproject.toml | 6 +++---
 python/cuopt/pyproject.toml                          | 6 +++---
 python/libcuopt/pyproject.toml                       | 4 ++--
 7 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml
index 3ad7d0de6f..f87e9981d7 100644
--- a/conda/environments/all_cuda-129_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -61,7 +61,7 @@ dependencies:
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.12.*,>=0.0.0a0
-- rapids-logger==0.1.*,>=0.0.0a0
+- rapids-logger==0.2.*,>=0.0.0a0
 - requests
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml
index 6aa27358ce..8105bf9aba 100644
--- a/conda/environments/all_cuda-129_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -61,7 +61,7 @@ dependencies:
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.12.*,>=0.0.0a0
-- rapids-logger==0.1.*,>=0.0.0a0
+- rapids-logger==0.2.*,>=0.0.0a0
 - requests
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml
index f853f06adc..bf626dd520 100644
--- a/conda/environments/all_cuda-130_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-130_arch-aarch64.yaml
@@ -61,7 +61,7 @@ dependencies:
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.12.*,>=0.0.0a0
-- rapids-logger==0.1.*,>=0.0.0a0
+- rapids-logger==0.2.*,>=0.0.0a0
 - requests
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml
index b5f642a1b2..72691938c5 100644
--- a/conda/environments/all_cuda-130_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-130_arch-x86_64.yaml
@@ -61,7 +61,7 @@ dependencies:
 - python>=3.10,<3.14
 - rapids-build-backend>=0.4.0,<0.5.0.dev0
 - rapids-dask-dependency==25.12.*,>=0.0.0a0
-- rapids-logger==0.1.*,>=0.0.0a0
+- rapids-logger==0.2.*,>=0.0.0a0
 - requests
 - rmm==25.12.*,>=0.0.0a0
 - scikit-build-core>=0.10.0
diff --git a/python/cuopt/cuopt/linear_programming/pyproject.toml b/python/cuopt/cuopt/linear_programming/pyproject.toml
index 1e55450fc5..5a3809234a 100644
--- a/python/cuopt/cuopt/linear_programming/pyproject.toml
+++ b/python/cuopt/cuopt/linear_programming/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache-2.0" }
 requires-python = ">=3.10"
 dependencies = [
     "numpy>=1.23.5,<3.0a0",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -39,7 +39,7 @@ Source = "https://github.com/nvidia/cuopt"
 test = [
     "pytest-cov",
     "pytest<8",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [tool.setuptools]
@@ -83,5 +83,5 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy>=1.23.5,<3.0a0",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cuopt/pyproject.toml b/python/cuopt/pyproject.toml
index ee0b468ed9..b7d32e4637 100644
--- a/python/cuopt/pyproject.toml
+++ b/python/cuopt/pyproject.toml
@@ -43,7 +43,7 @@ dependencies = [
     "pandas>=2.0",
     "pylibraft==25.12.*,>=0.0.0a0",
     "rapids-dask-dependency==25.12.*,>=0.0.0a0",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
     "rmm==25.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -61,7 +61,7 @@ test = [
     "numpy>=1.23.5,<3.0a0",
     "pytest-cov",
     "pytest<8",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
@@ -155,6 +155,6 @@ requires = [
     "libcuopt==25.12.*,>=0.0.0a0",
     "ninja",
     "pylibraft==25.12.*,>=0.0.0a0",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
     "rmm==25.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/libcuopt/pyproject.toml b/python/libcuopt/pyproject.toml
index fbbbdd0877..278092e29b 100644
--- a/python/libcuopt/pyproject.toml
+++ b/python/libcuopt/pyproject.toml
@@ -52,7 +52,7 @@ dependencies = [
     "nvidia-cusolver",
     "nvidia-cusparse",
     "nvidia-nvtx",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
@@ -97,5 +97,5 @@ requires = [
     "cuopt-mps-parser==25.12.*,>=0.0.0a0",
     "librmm==25.12.*,>=0.0.0a0",
     "ninja",
-    "rapids-logger==0.1.*,>=0.0.0a0",
+    "rapids-logger==0.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From 921267b6fd39e4d2df35529fde28d685f1a4d340 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Tue, 14 Oct 2025 11:29:24 -0500
Subject: [PATCH 25/27] add header

---
 cpp/src/mip/diversity/weights.cuh     | 1 +
 cpp/src/mip/problem/presolve_data.cuh | 1 +
 cpp/src/mip/relaxed_lp/lp_state.cuh   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/cpp/src/mip/diversity/weights.cuh b/cpp/src/mip/diversity/weights.cuh
index 9f53b88473..d8563ea739 100644
--- a/cpp/src/mip/diversity/weights.cuh
+++ b/cpp/src/mip/diversity/weights.cuh
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <thrust/fill.h>
 #include <raft/core/handle.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/mip/problem/presolve_data.cuh b/cpp/src/mip/problem/presolve_data.cuh
index 20ae5fe931..e9ba5f3b39 100644
--- a/cpp/src/mip/problem/presolve_data.cuh
+++ b/cpp/src/mip/problem/presolve_data.cuh
@@ -20,6 +20,7 @@
 #include <cuopt/linear_programming/optimization_problem.hpp>
 
 #include <thrust/sequence.h>
+#include <thrust/uninitialized_fill.h>
 #include <rmm/device_uvector.hpp>
 
 namespace cuopt {
diff --git a/cpp/src/mip/relaxed_lp/lp_state.cuh b/cpp/src/mip/relaxed_lp/lp_state.cuh
index 3bfa00955c..8662df7547 100644
--- a/cpp/src/mip/relaxed_lp/lp_state.cuh
+++ b/cpp/src/mip/relaxed_lp/lp_state.cuh
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <thrust/fill.h>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 

From c8b8a7cbd2ffb5592a08cab9d157a66c36e1f601 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Tue, 14 Oct 2025 13:22:55 -0500
Subject: [PATCH 26/27] update deps

---
 dependencies.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 25a2fca6d4..5db5a9ee1c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -326,7 +326,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - libcuopt-tests==25.10.*,>=0.0.0a0
+          - libcuopt-tests==25.12.*,>=0.0.0a0
   build_wheels:
     common:
       - output_types: [requirements, pyproject]

From 1bbe0b768bc66c41883aa1660c621c8cf1111674 Mon Sep 17 00:00:00 2001
From: Ramakrishna Prabhu <ramakrishnap@nvidia.com>
Date: Wed, 15 Oct 2025 11:13:51 -0500
Subject: [PATCH 27/27] Empty commit