From bee22227030dfb61c714df6cedbbd14b03da4ee2 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Mon, 13 Apr 2026 13:36:18 +0200
Subject: [PATCH 01/14] B&B and heuristics now share a single omp parallel
 region

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp |  53 ++++----
 cpp/src/branch_and_bound/pseudo_costs.cpp     | 124 +++++++++---------
 cpp/src/mip_heuristics/solver.cu              |  74 ++++++-----
 3 files changed, 135 insertions(+), 116 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 1526baa367..a0a4907374 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1871,19 +1871,30 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
   // Root node path
   lp_status_t root_status;
-  std::future<lp_status_t> root_status_future;
-  root_status_future = std::async(std::launch::async,
-                                  &solve_linear_program_with_advanced_basis<i_t, f_t>,
-                                  std::ref(original_lp_),
-                                  exploration_stats_.start_time,
-                                  std::ref(lp_settings),
-                                  std::ref(root_relax_soln),
-                                  std::ref(basis_update),
-                                  std::ref(basic_list),
-                                  std::ref(nonbasic_list),
-                                  std::ref(root_vstatus),
-                                  std::ref(edge_norms),
-                                  nullptr);
+
+  // Note that we need to explicitly declare `root_status` as a shared variable here since
+  // it is local to the thread that is executing the enclosing task.
+#pragma omp task shared(root_status,     \
+                          original_lp_,  \
+                          lp_settings,   \
+                          basis_update,  \
+                          basic_list,    \
+                          nonbasic_list, \
+                          root_vstatus_, \
+                          edge_norms_) default(none)
+  {
+    root_status = solve_linear_program_with_advanced_basis(original_lp_,
+                                                           exploration_stats_.start_time,
+                                                           lp_settings,
+                                                           root_relax_soln_,
+                                                           basis_update,
+                                                           basic_list,
+                                                           nonbasic_list,
+                                                           root_vstatus_,
+                                                           edge_norms_,
+                                                           nullptr);
+  }
+
   // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex
   // to finish
   while (!root_crossover_solution_set_.load(std::memory_order_acquire) &&
@@ -1925,9 +1936,9 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
     // Check if crossover was stopped by dual simplex
     if (crossover_status == crossover_status_t::OPTIMAL) {
-      set_root_concurrent_halt(1);             // Stop dual simplex
-      root_status = root_status_future.get();  // Wait for dual simplex to finish
-      set_root_concurrent_halt(0);             // Clear the concurrent halt flag
+      set_root_concurrent_halt(1);  // Stop dual simplex
+#pragma omp taskwait                // Wait for dual simplex to finish
+      set_root_concurrent_halt(0);  // Clear the concurrent halt flag
       // Override the root relaxation solution with the crossover solution
       root_relax_soln = root_crossover_soln_;
       root_vstatus    = crossover_vstatus_;
@@ -1977,14 +1988,14 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       solver_name    = method_to_string(root_relax_solved_by);
 
     } else {
-      root_status          = root_status_future.get();
+#pragma omp taskwait
       user_objective       = root_relax_soln_.user_objective;
       iter                 = root_relax_soln_.iterations;
       root_relax_solved_by = DualSimplex;
       solver_name          = "Dual Simplex";
     }
   } else {
-    root_status          = root_status_future.get();
+#pragma omp taskwait
     user_objective       = root_relax_soln_.user_objective;
     iter                 = root_relax_soln_.iterations;
     root_relax_solved_by = DualSimplex;
@@ -2613,11 +2624,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   if (settings_.deterministic) {
     run_deterministic_coordinator(Arow_);
   } else if (settings_.num_threads > 1) {
-#pragma omp parallel num_threads(settings_.num_threads)
-    {
-#pragma omp master
-      run_scheduler();
-    }
+    run_scheduler();
   } else {
     single_threaded_solve();
   }
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index c38e98e27d..65e527078a 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -756,14 +756,15 @@ static void batch_pdlp_strong_branching_task(
     ws_settings.inside_mip                           = true;
     if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; }
 
-    auto start_time = std::chrono::high_resolution_clock::now();
+    auto pdlp_start_time = std::chrono::high_resolution_clock::now();
 
     auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings);
 
     if (verbose) {
-      auto end_time = std::chrono::high_resolution_clock::now();
+      auto pdlp_end_time = std::chrono::high_resolution_clock::now();
       auto duration =
-        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
+        std::chrono::duration_cast<std::chrono::milliseconds>(pdlp_end_time - pdlp_start_time)
+          .count();
       settings.log.printf(
         "Original problem solved in %d milliseconds"
         " and iterations: %d\n",
@@ -1029,7 +1030,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
   shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
   shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
 
-  std::atomic<int> concurrent_halt{0};
+  std::atomic concurrent_halt{0};
 
   std::vector<f_t> pdlp_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
@@ -1052,70 +1053,67 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                                           basis_factors,
                                           pc);
   } else {
-#pragma omp parallel num_threads(settings.num_threads)
-    {
-#pragma omp single nowait
-      {
-        if (effective_batch_pdlp != 0) {
-#pragma omp task
-          batch_pdlp_strong_branching_task(settings,
-                                           effective_batch_pdlp,
-                                           start_time,
-                                           concurrent_halt,
-                                           original_lp,
-                                           new_slacks,
-                                           root_solution.x,
-                                           fractional,
-                                           root_obj,
-                                           pc,
-                                           sb_view,
-                                           pdlp_obj_down,
-                                           pdlp_obj_up);
-        }
+    if (effective_batch_pdlp != 0) {
+#pragma omp task depend(out : pdlp_obj_down) depend(out : pdlp_obj_up) default(shared)
+      batch_pdlp_strong_branching_task(settings,
+                                       effective_batch_pdlp,
+                                       start_time,
+                                       concurrent_halt,
+                                       original_lp,
+                                       new_slacks,
+                                       root_solution.x,
+                                       fractional,
+                                       root_obj,
+                                       pc,
+                                       sb_view,
+                                       pdlp_obj_down,
+                                       pdlp_obj_up);
+    }
 
-        if (effective_batch_pdlp != 2) {
-          i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
+    if (effective_batch_pdlp != 2) {
+      i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
 // Here we are creating more tasks than the number of threads
 // such that they can be scheduled dynamically to the threads.
-#pragma omp taskloop num_tasks(n)
-          for (i_t k = 0; k < n; k++) {
-            i_t start = std::floor(k * fractional.size() / n);
-            i_t end   = std::floor((k + 1) * fractional.size() / n);
-
-            constexpr bool verbose = false;
-            if (verbose) {
-              settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
-                                  omp_get_thread_num(),
-                                  k,
-                                  start,
-                                  end,
-                                  end - start);
-            }
-
-            strong_branch_helper(start,
-                                 end,
-                                 start_time,
-                                 original_lp,
-                                 settings,
-                                 var_types,
-                                 fractional,
-                                 root_solution.x,
-                                 root_vstatus,
-                                 edge_norms,
-                                 root_obj,
-                                 upper_bound,
-                                 simplex_iteration_limit,
-                                 pc,
-                                 dual_simplex_obj_down,
-                                 dual_simplex_obj_up,
-                                 dual_simplex_status_down,
-                                 dual_simplex_status_up,
-                                 sb_view);
-          }
-          // DS done: signal PDLP to stop (time-limit or all work done) and wait
-          if (effective_batch_pdlp == 1) { concurrent_halt.store(1); }
+#pragma omp taskloop num_tasks(n) default(shared)
+      for (i_t k = 0; k < n; k++) {
+        i_t start = std::floor(k * fractional.size() / n);
+        i_t end   = std::floor((k + 1) * fractional.size() / n);
+
+        if (verbose) {
+          settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
+                              omp_get_thread_num(),
+                              k,
+                              start,
+                              end,
+                              end - start);
         }
+
+        strong_branch_helper(start,
+                             end,
+                             start_time,
+                             original_lp,
+                             settings,
+                             var_types,
+                             fractional,
+                             root_solution.x,
+                             root_vstatus,
+                             edge_norms,
+                             root_obj,
+                             upper_bound,
+                             simplex_iteration_limit,
+                             pc,
+                             dual_simplex_obj_down,
+                             dual_simplex_obj_up,
+                             dual_simplex_status_down,
+                             dual_simplex_status_up,
+                             sb_view);
       }
+      // DS done: signal PDLP to stop (time-limit or all work done) and wait
+      if (effective_batch_pdlp == 1) { concurrent_halt.store(1); }
+    }
+
+    if (effective_batch_pdlp != 0) {
+#pragma omp taskwait depend(in : pdlp_obj_down) depend(in : pdlp_obj_up)
     }
   }
 
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 0bbf48d95e..737d81201f 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -181,6 +181,8 @@ void extract_probing_implied_bounds(
 template <typename i_t, typename f_t>
 solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 {
+  solution_t<i_t, f_t> sol(*context.problem_ptr);
+
   //  we need to keep original problem const
   cuopt_assert(context.problem_ptr != nullptr, "invalid problem pointer");
   context.problem_ptr->tolerances = context.settings.get_tolerances();
@@ -234,14 +236,12 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   if (!presolve_success) {
     CUOPT_LOG_INFO("Problem proven infeasible in presolve");
-    solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
     context.problem_ptr->post_process_solution(sol);
     return sol;
   }
   if (run_presolve && context.problem_ptr->empty) {
     CUOPT_LOG_INFO("Problem full reduced in presolve");
-    solution_t<i_t, f_t> sol(*context.problem_ptr);
     sol.set_problem_fully_reduced();
     for (auto callback : context.settings.get_mip_callbacks()) {
       if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
@@ -293,8 +293,8 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
   }
   context.work_unit_scheduler_.register_context(context.gpu_heur_loop);
 
-  namespace dual_simplex = cuopt::linear_programming::dual_simplex;
-  std::future<dual_simplex::mip_status_t> branch_and_bound_status_future;
+  namespace dual_simplex                             = cuopt::linear_programming::dual_simplex;
+  dual_simplex::mip_status_t branch_and_bound_status = dual_simplex::mip_status_t::UNSET;
   dual_simplex::user_problem_t<i_t, f_t> branch_and_bound_problem(context.problem_ptr->handle_ptr);
   context.problem_ptr->recompute_objective_integrality();
   if (context.problem_ptr->is_objective_integral()) {
@@ -309,8 +309,14 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   dual_simplex::probing_implied_bound_t<i_t, f_t> probing_implied_bound;
 
-  bool run_bb = !context.settings.heuristics_only;
-  if (run_bb) {
+  i_t num_threads = 0;
+  if (context.settings.num_cpu_threads < 0) {
+    num_threads = omp_get_max_threads();
+  } else {
+    num_threads = std::max(1, context.settings.num_cpu_threads);
+  }
+
+  if (!context.settings.heuristics_only) {
     // Convert the presolved problem to dual_simplex::user_problem_t
     op_problem_.get_host_user_problem(branch_and_bound_problem);
     // Resize the solution now that we know the number of columns/variables
@@ -324,6 +330,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     // Fill in the settings for branch and bound
     branch_and_bound_settings.time_limit           = timer_.get_time_limit();
     branch_and_bound_settings.node_limit           = context.settings.node_limit;
+    branch_and_bound_settings.num_threads          = num_threads - 1;
     branch_and_bound_settings.print_presolve_stats = false;
     branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
     branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
@@ -363,21 +370,18 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
         ? 2
         : context.settings.reduced_cost_strengthening;
 
-    if (context.settings.num_cpu_threads < 0) {
-      branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1);
-    } else {
-      branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
-    }
-
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
       std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::solution_callback,
                 &solution_helper,
                 std::placeholders::_1,
                 std::placeholders::_2);
-    // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread
+
+    // heuristic_preemption_callback is needed in both modes to properly stop the heuristic
+    // thread
     branch_and_bound_settings.heuristic_preemption_callback = std::bind(
       &branch_and_bound_solution_helper_t<i_t, f_t>::preempt_heuristic_solver, &solution_helper);
+
     if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) {
       branch_and_bound_settings.set_simplex_solution_callback =
         std::bind(&branch_and_bound_solution_helper_t<i_t, f_t>::set_simplex_solution,
@@ -403,8 +407,8 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     context.branch_and_bound_ptr = branch_and_bound.get();
 
     // Convert initial_cutoff from user-space to B&B's internal objective space.
-    // context.problem_ptr is the post-trivial-presolve problem, whose get_solver_obj_from_user_obj
-    // produces values in the same space as B&B node lower bounds.
+    // context.problem_ptr is the post-trivial-presolve problem, whose
+    // get_solver_obj_from_user_obj produces values in the same space as B&B node lower bounds.
     if (std::isfinite(context.initial_cutoff)) {
       f_t bb_cutoff = context.problem_ptr->get_solver_obj_from_user_obj(context.initial_cutoff);
       branch_and_bound->set_initial_cutoff(bb_cutoff);
@@ -456,28 +460,38 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       context.problem_ptr->post_process_solution(sol);
       return sol;
     }
+  }
+
+#pragma omp parallel num_threads(num_threads) default(none) \
+  shared(sol, branch_and_bound, branch_and_bound_status, branch_and_bound_solution, dm, context)
+  {
+#pragma omp master
+    {
+      if (!context.settings.heuristics_only) {
+#pragma omp task
+        {
+          branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution);
+        }
+      }
 
-    // Fork a thread for branch and bound
-    // std::async and std::future allow us to get the return value of bb::solve()
-    // without having to manually manage the thread
-    // std::future.get() performs a join() operation to wait until the return status is available
-    branch_and_bound_status_future = std::async(std::launch::async,
-                                                &dual_simplex::branch_and_bound_t<i_t, f_t>::solve,
-                                                branch_and_bound.get(),
-                                                std::ref(branch_and_bound_solution));
+#pragma omp task
+      {
+        // Register the diversity manager in the solver context
+        context.diversity_manager_ptr = &dm;
+        // Start the primal heuristics
+        sol = dm.run_solver();
+      }
+    }
   }
 
-  // Start the primal heuristics
-  context.diversity_manager_ptr = &dm;
-  auto sol                      = dm.run_solver();
-  if (run_bb) {
-    // Wait for the branch and bound to finish
-    auto bb_status = branch_and_bound_status_future.get();
+  if (!context.settings.heuristics_only) {
     if (branch_and_bound_solution.lower_bound > -std::numeric_limits<f_t>::infinity()) {
       context.stats.set_solution_bound(
         context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound));
     }
-    if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); }
+    if (branch_and_bound_status == dual_simplex::mip_status_t::INFEASIBLE) {
+      sol.set_problem_fully_reduced();
+    }
     context.stats.num_nodes              = branch_and_bound_solution.nodes_explored;
     context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations;
   }

From a50ff744e2f61aea91f61825f7c62764ba334f27 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Tue, 14 Apr 2026 18:09:57 +0200
Subject: [PATCH 02/14] migrated heuristics and presolve to use OpenMP. now the
 entire solver shares the same thread pool.

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp |  19 +-
 cpp/src/branch_and_bound/pseudo_costs.cpp     |  78 ++++++--
 cpp/src/mip_heuristics/diversity/lns/rins.cu  |  89 ++++-----
 cpp/src/mip_heuristics/diversity/lns/rins.cuh |  37 +---
 .../feasibility_jump/early_cpufj.cu           |  32 ++--
 .../feasibility_jump/early_cpufj.cuh          |   6 +-
 .../feasibility_jump/feasibility_jump.cuh     |   2 -
 .../mip_heuristics/feasibility_jump/fj_cpu.cu | 172 +++++++-----------
 .../feasibility_jump/fj_cpu.cuh               |  23 +--
 .../local_search/local_search.cu              | 172 +++++++++---------
 .../local_search/local_search.cuh             |  19 +-
 .../presolve/bounds_presolve.cuh              |   2 +-
 .../conditional_bound_strengthening.cu        |   3 +-
 .../mip_heuristics/presolve/probing_cache.cu  |  93 ++++++----
 cpp/src/mip_heuristics/solve.cu               |  60 +++++-
 cpp/src/mip_heuristics/solver.cu              |  36 ++--
 .../utilities/cpu_worker_thread.cuh           | 147 ---------------
 17 files changed, 415 insertions(+), 575 deletions(-)
 delete mode 100644 cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index a0a4907374..de13e1ba27 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1750,7 +1750,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         active_workers_per_strategy_[strategy]++;
         launched_any_task = true;
 
-#pragma omp task affinity(worker)
+#pragma omp task affinity(worker) default(none) firstprivate(worker)
         plunge_with(worker);
 
       } else {
@@ -1771,7 +1771,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         active_workers_per_strategy_[strategy]++;
         launched_any_task = true;
 
-#pragma omp task affinity(worker)
+#pragma omp task affinity(worker) default(none) firstprivate(worker)
         dive_with(worker);
       }
     }
@@ -2621,12 +2621,15 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       "|   Gap    |  Time  |\n");
   }
 
-  if (settings_.deterministic) {
-    run_deterministic_coordinator(Arow_);
-  } else if (settings_.num_threads > 1) {
-    run_scheduler();
-  } else {
-    single_threaded_solve();
+#pragma omp taskgroup
+  {
+    if (settings_.deterministic) {
+      run_deterministic_coordinator(Arow_);
+    } else if (settings_.num_threads > 1) {
+      run_scheduler();
+    } else {
+      single_threaded_solve();
+    }
   }
 
   is_running_ = false;
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index 65e527078a..bda91cf04f 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1000,8 +1000,6 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       basis_update_mpf_t<i_t, f_t>& basis_factors,
                       pseudo_costs_t<i_t, f_t>& pc)
 {
-  constexpr bool verbose = false;
-
   pc.resize(original_lp.num_cols);
   pc.strong_branch_down.assign(fractional.size(), 0);
   pc.strong_branch_up.assign(fractional.size(), 0);
@@ -1054,7 +1052,17 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                                           pc);
   } else {
     if (effective_batch_pdlp != 0) {
-#pragma omp task depend(out : pdlp_obj_down) depend(out : pdlp_obj_up) default(shared)
+#pragma omp task shared(settings,          \
+                          concurrent_halt, \
+                          original_lp,     \
+                          new_slacks,      \
+                          root_solution,   \
+                          fractional,      \
+                          pc,              \
+                          sb_view,         \
+                          pdlp_obj_down,   \
+                          pdlp_obj_up)     \
+  firstprivate(effective_batch_pdlp, start_time, root_obj) default(none)
       batch_pdlp_strong_branching_task(settings,
                                        effective_batch_pdlp,
                                        start_time,
@@ -1074,11 +1082,25 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
       i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
 // Here we are creating more tasks than the number of threads
 // such that they can be scheduled dynamically to the threads.
-#pragma omp taskloop num_tasks(n) default(shared)
+#pragma omp taskloop num_tasks(n) default(none) shared(original_lp,                \
+                                                         settings,                 \
+                                                         var_types,                \
+                                                         fractional,               \
+                                                         root_solution,            \
+                                                         root_vstatus,             \
+                                                         edge_norms,               \
+                                                         pc,                       \
+                                                         dual_simplex_obj_down,    \
+                                                         dual_simplex_obj_up,      \
+                                                         dual_simplex_status_down, \
+                                                         dual_simplex_status_up,   \
+                                                         sb_view)                  \
+  firstprivate(start_time, root_obj, upper_bound, simplex_iteration_limit, n)
       for (i_t k = 0; k < n; k++) {
         i_t start = std::floor(k * fractional.size() / n);
         i_t end   = std::floor((k + 1) * fractional.size() / n);
 
+        constexpr bool verbose = false;
         if (verbose) {
           settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n",
                               omp_get_thread_num(),
@@ -1113,12 +1135,13 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
     }
 
     if (effective_batch_pdlp != 0) {
-#pragma omp taskwait depend(in : pdlp_obj_down) depend(in : pdlp_obj_up)
+#pragma omp taskwait
     }
   }
 
   settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time));
 
+  constexpr bool verbose = false;
   if (verbose) {
     // Collect Dual Simplex statistics
     i_t dual_simplex_optimal = 0, dual_simplex_infeasible = 0, dual_simplex_iter_limit = 0;
@@ -1539,7 +1562,18 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   std::atomic<int> concurrent_halt{0};
 
   if (use_pdlp) {
-#pragma omp task default(shared)
+#pragma omp task default(none) shared(log,               \
+                                        concurrent_halt, \
+                                        original_lp,     \
+                                        new_slacks,      \
+                                        leaf_solution,   \
+                                        worker,          \
+                                        candidate_vars,  \
+                                        settings,        \
+                                        sb_view,         \
+                                        pdlp_obj_down,   \
+                                        pdlp_obj_up)     \
+  firstprivate(rb_mode, num_candidates, start_time)
     batch_pdlp_reliability_branching_task(log,
                                           rb_mode,
                                           num_candidates,
@@ -1574,14 +1608,30 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   f_t dual_simplex_start_time = tic();
 
   if (rb_mode != 2) {
-#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \
-  shared(score_mutex,                                                                \
-           sb_view,                                                                  \
-           dual_simplex_obj_down,                                                    \
-           dual_simplex_obj_up,                                                      \
-           dual_simplex_status_down,                                                 \
-           dual_simplex_status_up,                                                   \
-           unreliable_list)
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(none) \
+  shared(log,                                                                                      \
+           unreliable_list,                                                                        \
+           settings,                                                                               \
+           sb_view,                                                                                \
+           worker,                                                                                 \
+           var_types,                                                                              \
+           node_ptr,                                                                               \
+           leaf_solution,                                                                          \
+           dual_simplex_obj_down,                                                                  \
+           dual_simplex_obj_up,                                                                    \
+           dual_simplex_status_down,                                                               \
+           dual_simplex_status_up,                                                                 \
+           score_mutex,                                                                            \
+           max_score,                                                                              \
+           branch_var) firstprivate(num_candidates,                                                \
+                                      start_time,                                                  \
+                                      rb_mode,                                                     \
+                                      reliable_threshold,                                          \
+                                      upper_bound,                                                 \
+                                      iter_limit_per_trial,                                        \
+                                      eps,                                                         \
+                                      pseudo_cost_up_avg,                                          \
+                                      pseudo_cost_down_avg)
     for (i_t i = 0; i < num_candidates; ++i) {
       auto [score, j] = unreliable_list[i];
 
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index c4331343de..c6a1bba7e7 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -36,19 +36,6 @@ rins_t<i_t, f_t>::rins_t(mip_solver_context_t<i_t, f_t>& context_,
   time_limit = context.settings.heuristic_params.rins_time_limit;
 }
 
-template <typename i_t, typename f_t>
-rins_thread_t<i_t, f_t>::~rins_thread_t()
-{
-  this->request_termination();
-}
-
-template <typename i_t, typename f_t>
-void rins_thread_t<i_t, f_t>::run_worker()
-{
-  raft::common::nvtx::range fun_scope("Running RINS");
-  rins_ptr->run_rins();
-}
-
 template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::new_best_incumbent_callback(const std::vector<f_t>& solution)
 {
@@ -59,23 +46,25 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objective)
 {
   if (!enabled) return;
-
   node_count++;
 
   if (node_count - node_count_at_last_improvement < settings.nodes_after_later_improvement) return;
-
   if (node_count - node_count_at_last_rins > settings.node_freq) {
     // opportunistic early test w/ atomic to avoid having to take the lock
-    if (!rins_thread->cpu_thread_done) return;
-    std::lock_guard<std::mutex> lock(rins_mutex);
+    if (!launch_new_task.exchange(false)) return;
+
     bool population_ready = false;
-    if (rins_thread->cpu_thread_done) {
+    {
       std::lock_guard<std::recursive_mutex> pop_lock(dm.population.write_mutex);
       population_ready = dm.population.current_size() > 0 && dm.population.is_feasible();
     }
+
     if (population_ready) {
       lp_optimal_solution = solution;
-      rins_thread->start_cpu_solver();
+#pragma omp task default(none)
+      run_rins();
+    } else {
+      launch_new_task = true;
     }
   }
 }
@@ -83,25 +72,17 @@ void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objec
 template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::enable()
 {
-  rins_thread           = std::make_unique<rins_thread_t<i_t, f_t>>();
-  rins_thread->rins_ptr = this;
-  seed                  = cuopt::seed_generator::get_seed();
+  seed = cuopt::seed_generator::get_seed();
   problem_ptr->handle_ptr->sync_stream();
   problem_copy = std::make_unique<problem_t<i_t, f_t>>(*problem_ptr, &rins_handle);
   enabled      = true;
 }
 
-template <typename i_t, typename f_t>
-void rins_t<i_t, f_t>::stop_rins()
-{
-  enabled = false;
-  if (rins_thread) rins_thread->request_termination();
-  rins_thread.reset();
-}
-
 template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::run_rins()
 {
+  raft::common::nvtx::range fun_scope("Running RINS");
+
   if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
 
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
@@ -134,7 +115,10 @@ void rins_t<i_t, f_t>::run_rins()
   cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch");
 
   cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible");
-  if (!best_sol.get_feasible()) { return; }
+  if (!best_sol.get_feasible()) {
+    launch_new_task = true;
+    return;
+  }
 
   i_t sol_size_before_rins = best_sol.assignment.size();
   auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
@@ -158,6 +142,7 @@ void rins_t<i_t, f_t>::run_rins()
   // abort if the fractional ratio is too low
   if (fractional_ratio < settings.min_fractional_ratio) {
     CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting");
+    launch_new_task = true;
     return;
   }
 
@@ -182,6 +167,7 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (n_to_fix == 0) {
     CUOPT_LOG_DEBUG("RINS no variables to fix");
+    launch_new_task = true;
     return;
   }
 
@@ -229,18 +215,19 @@ void rins_t<i_t, f_t>::run_rins()
   solution_t<i_t, f_t> fj_solution(fixed_problem);
   fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   std::vector<f_t> default_weights(fixed_problem.n_constraints, 1.);
-  cpu_fj_thread_t<i_t, f_t> cpu_fj_thread;
-  cpu_fj_thread.fj_cpu             = fj.create_cpu_climber(fj_solution,
-                                               default_weights,
-                                               default_weights,
-                                               0.,
-                                               context.preempt_heuristic_solver_,
-                                               fj_settings_t{},
-                                               true);
-  cpu_fj_thread.fj_ptr             = &fj;
-  cpu_fj_thread.fj_cpu->log_prefix = "[RINS] ";
-  cpu_fj_thread.time_limit         = time_limit;
-  cpu_fj_thread.start_cpu_solver();
+
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu =
+    fj.create_cpu_climber(fj_solution,
+                          default_weights,
+                          default_weights,
+                          0.,
+                          context.preempt_heuristic_solver_,
+                          fj_settings_t{},
+                          true);
+  fj_cpu->log_prefix = "[RINS] ";
+
+#pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none)
+  cpufj_solve(fj_cpu.get(), time_limit);
 
   f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound()
                                                  : -std::numeric_limits<f_t>::infinity();
@@ -311,13 +298,15 @@ void rins_t<i_t, f_t>::run_rins()
                           static_cast<f_t>(context.settings.heuristic_params.rins_max_time_limit));
   }
 
-  cpu_fj_thread.stop_cpu_solver();
-  bool fj_solution_found = cpu_fj_thread.wait_for_cpu_solver();
-  CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", cpu_fj_thread.fj_cpu->iterations);
-  if (fj_solution_found) {
-    CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e",
-                    cpu_fj_thread.fj_cpu->h_best_objective);
-    rins_solution_queue.push_back(cpu_fj_thread.fj_cpu->h_best_assignment);
+  // Ask the CPU FJ task to halt before waiting on it (the former stop_cpu_solver() did this);
+  // otherwise the taskwait below blocks until FJ reaches its own time limit.
+  fj_cpu->halted = true;
+#pragma omp taskwait
+
+  CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations);
+  if (fj_cpu->feasible_found) {
+    CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", fj_cpu->h_best_objective);
+    rins_solution_queue.push_back(fj_cpu->h_best_assignment);
   }
-  // Thread will be automatically terminated and joined by destructor
+  // The FJ task has completed at this point; fj_cpu is released when it goes out of scope.
 
@@ -354,15 +340,14 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (improvement_found) total_success++;
   CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success);
+  launch_new_task = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT
-template class rins_thread_t<int, float>;
 template class rins_t<int, float>;
 #endif
 
 #if MIP_INSTANTIATE_DOUBLE
-template class rins_thread_t<int, double>;
 template class rins_t<int, double>;
 #endif
 
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cuh b/cpp/src/mip_heuristics/diversity/lns/rins.cuh
index 0a9133f848..b1b62bd1ae 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cuh
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cuh
@@ -17,19 +17,11 @@
 
 #pragma once
 
-#include <mip_heuristics/diversity/population.cuh>
 #include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
-#include <mip_heuristics/utilities/cpu_worker_thread.cuh>
 
-#include <utilities/timer.hpp>
+#include <utilities/omp_helpers.hpp>
 
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include <random>
-#include <string>
-#include <thread>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
@@ -52,18 +44,6 @@ struct rins_settings_t {
 template <typename i_t, typename f_t>
 class rins_t;
 
-template <typename i_t, typename f_t>
-struct rins_thread_t : public cpu_worker_thread_base_t<rins_thread_t<i_t, f_t>> {
-  ~rins_thread_t();
-
-  void run_worker();
-  void on_terminate() {}
-  void on_start() {}
-  bool get_result() { return true; }
-
-  rins_t<i_t, f_t>* rins_ptr{nullptr};
-};
-
 template <typename i_t, typename f_t>
 class rins_t {
  public:
@@ -74,7 +54,6 @@ class rins_t {
   void node_callback(const std::vector<f_t>& solution, f_t objective);
   void new_best_incumbent_callback(const std::vector<f_t>& solution);
   void enable();
-  void stop_rins();
 
   void run_rins();
 
@@ -96,15 +75,13 @@ class rins_t {
   f_t time_limit{10.};
   i_t seed;
 
-  std::atomic<bool> enabled{false};
-  std::atomic<f_t> lower_bound{0.};
-
-  std::atomic<i_t> node_count{0};
-  std::atomic<i_t> node_count_at_last_rins{0};
-  std::atomic<i_t> node_count_at_last_improvement{0};
-  std::mutex rins_mutex;
+  omp_atomic_t<bool> enabled{false};
+  omp_atomic_t<f_t> lower_bound{0.};
 
-  std::unique_ptr<rins_thread_t<i_t, f_t>> rins_thread;
+  omp_atomic_t<i_t> node_count{0};
+  omp_atomic_t<i_t> node_count_at_last_rins{0};
+  omp_atomic_t<i_t> node_count_at_last_improvement{0};
+  omp_atomic_t<bool> launch_new_task{true};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index 8109653e6f..15ff3f4eb7 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -7,9 +7,7 @@
 
 #include "early_cpufj.cuh"
 
-#include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
 #include <mip_heuristics/mip_constants.hpp>
-#include <utilities/logger.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -32,40 +30,38 @@ early_cpufj_t<i_t, f_t>::~early_cpufj_t()
 template <typename i_t, typename f_t>
 void early_cpufj_t<i_t, f_t>::start()
 {
-  if (cpu_fj_thread_) { return; }
+  if (fj_cpu_) { return; }
 
   this->preemption_flag_.store(false);
   this->start_time_ = std::chrono::steady_clock::now();
 
-  cpu_fj_thread_ = std::make_unique<cpu_fj_thread_t<i_t, f_t>>();
-  cpu_fj_thread_->fj_cpu =
-    init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_);
-  cpu_fj_thread_->time_limit = std::numeric_limits<f_t>::infinity();
+  fj_cpu_ = init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_);
 
-  cpu_fj_thread_->fj_cpu->log_prefix = "[Early CPUFJ] ";
+  fj_cpu_->log_prefix = "[Early CPUFJ] ";
 
-  cpu_fj_thread_->fj_cpu->improvement_callback =
-    [this](f_t solver_obj, const std::vector<f_t>& assignment, double) {
-      this->try_update_best(solver_obj, assignment);
-    };
+  fj_cpu_->improvement_callback = [this](f_t solver_obj,
+                                         const std::vector<f_t>& assignment,
+                                         double) { this->try_update_best(solver_obj, assignment); };
 
-  cpu_fj_thread_->start_cpu_solver();
+#pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none)
+  cpufj_solve(fj_cpu_.get());
 }
 
 template <typename i_t, typename f_t>
 void early_cpufj_t<i_t, f_t>::stop()
 {
-  if (!cpu_fj_thread_) { return; }
+  if (!fj_cpu_) { return; }
 
   preemption_flag_.store(true);
-  cpu_fj_thread_->stop_cpu_solver();
-  cpu_fj_thread_->wait_for_cpu_solver();
+
+  fj_cpu_->halted = true;
+#pragma omp taskwait depend(in : *fj_cpu_)
 
   CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
-                  cpu_fj_thread_->fj_cpu ? cpu_fj_thread_->fj_cpu->iterations : 0,
+                  fj_cpu_ ? fj_cpu_->iterations : 0,
                   this->solution_found_);
 
-  cpu_fj_thread_.reset();
+  fj_cpu_.reset();
 }
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
index 911e846551..fd85e4b9f3 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
@@ -8,15 +8,13 @@
 #pragma once
 
 #include <mip_heuristics/early_heuristic.cuh>
+#include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
 
 #include <atomic>
 #include <memory>
 
 namespace cuopt::linear_programming::detail {
 
-template <typename i_t, typename f_t>
-struct cpu_fj_thread_t;
-
 template <typename i_t, typename f_t>
 class early_cpufj_t : public early_heuristic_t<i_t, f_t, early_cpufj_t<i_t, f_t>> {
  public:
@@ -32,7 +30,7 @@ class early_cpufj_t : public early_heuristic_t<i_t, f_t, early_cpufj_t<i_t, f_t>
   void stop();
 
  private:
-  std::unique_ptr<cpu_fj_thread_t<i_t, f_t>> cpu_fj_thread_;
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu_;
   std::atomic<bool> preemption_flag_{false};
 };
 
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
index 50b451a86e..33d1ac527f 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
@@ -216,8 +216,6 @@ class fj_t {
     std::atomic<bool>& preemption_flag,
     fj_settings_t settings = fj_settings_t{},
     bool randomize_params  = false);
-  bool cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
-                 f_t time_limit = +std::numeric_limits<f_t>::infinity());
   i_t alloc_max_climbers(i_t desired_climbers);
   void resize_vectors(const raft::handle_t* handle_ptr);
   void device_init(const rmm::cuda_stream_view& stream);
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
index 4eaa5b6a21..aea6295528 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
@@ -1417,7 +1417,8 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
 }
 
 template <typename i_t, typename f_t>
-static bool cpufj_solve_loop(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
+void cpufj_solve(fj_cpu_climber_t<i_t, f_t>* fj_cpu, f_t in_time_limit)
 {
+  cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null");
   i_t local_mins       = 0;
   auto loop_start      = std::chrono::high_resolution_clock::now();
@@ -1425,37 +1425,37 @@ static bool cpufj_solve_loop(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_lim
   auto loop_time_start = std::chrono::high_resolution_clock::now();
 
   // Initialize feature tracking
-  fj_cpu.last_feature_log_time = loop_start;
-  fj_cpu.prev_best_objective   = fj_cpu.h_best_objective;
-  fj_cpu.iterations_since_best = 0;
+  fj_cpu->last_feature_log_time = loop_start;
+  fj_cpu->prev_best_objective   = fj_cpu->h_best_objective;
+  fj_cpu->iterations_since_best = 0;
 
-  while (!fj_cpu.halted && !fj_cpu.preemption_flag.load()) {
+  while (!fj_cpu->halted && !fj_cpu->preemption_flag.load()) {
     // Check if 5 seconds have passed
     auto now = std::chrono::high_resolution_clock::now();
     if (in_time_limit < std::numeric_limits<f_t>::infinity() &&
         now - loop_time_start > time_limit) {
       CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d",
-                      fj_cpu.log_prefix.c_str(),
+                      fj_cpu->log_prefix.c_str(),
                       time_limit.count() / 1000.f,
-                      fj_cpu.iterations);
+                      fj_cpu->iterations);
       break;
     }
-    if (fj_cpu.iterations >= fj_cpu.settings.iteration_limit) {
+    if (fj_cpu->iterations >= fj_cpu->settings.iteration_limit) {
       CUOPT_LOG_TRACE("%sIteration limit of %d reached, breaking loop at iteration %d",
-                      fj_cpu.log_prefix.c_str(),
-                      fj_cpu.settings.iteration_limit,
-                      fj_cpu.iterations);
+                      fj_cpu->log_prefix.c_str(),
+                      fj_cpu->settings.iteration_limit,
+                      fj_cpu->iterations);
       break;
     }
 
     // periodically recompute the LHS and violation scores
     // to correct any accumulated numerical errors
-    cuopt_assert(fj_cpu.settings.parameters.lhs_refresh_period > 0,
+    cuopt_assert(fj_cpu->settings.parameters.lhs_refresh_period > 0,
                  "lhs_refresh_period should be positive");
-    if (fj_cpu.iterations % fj_cpu.settings.parameters.lhs_refresh_period == 0 ||
-        fj_cpu.trigger_early_lhs_recomputation) {
-      recompute_lhs(fj_cpu);
-      fj_cpu.trigger_early_lhs_recomputation = false;
+    if (fj_cpu->iterations % fj_cpu->settings.parameters.lhs_refresh_period == 0 ||
+        fj_cpu->trigger_early_lhs_recomputation) {
+      recompute_lhs(*fj_cpu);
+      fj_cpu->trigger_early_lhs_recomputation = false;
     }
 
     fj_move_t move          = fj_move_t{-1, 0};
@@ -1465,153 +1465,113 @@ static bool cpufj_solve_loop(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_lim
     bool is_mtm_sat         = false;
 
     // Perform lift moves
-    if (fj_cpu.violated_constraints.empty()) {
-      thrust::tie(move, score) = find_lift_move(fj_cpu);
+    if (fj_cpu->violated_constraints.empty()) {
+      thrust::tie(move, score) = find_lift_move(*fj_cpu);
       if (score > fj_staged_score_t::zero()) is_lift = true;
     }
     // Regular MTM
     if (!(score > fj_staged_score_t::zero())) {
-      thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, fj_cpu.mtm_viol_samples);
+      thrust::tie(move, score) = find_mtm_move_viol(*fj_cpu, fj_cpu->mtm_viol_samples);
       if (score > fj_staged_score_t::zero()) is_mtm_viol = true;
     }
     // try with MTM in satisfied constraints
-    if (fj_cpu.feasible_found && !(score > fj_staged_score_t::zero())) {
-      thrust::tie(move, score) = find_mtm_move_sat(fj_cpu, fj_cpu.mtm_sat_samples);
+    if (fj_cpu->feasible_found && !(score > fj_staged_score_t::zero())) {
+      thrust::tie(move, score) = find_mtm_move_sat(*fj_cpu, fj_cpu->mtm_sat_samples);
       if (score > fj_staged_score_t::zero()) is_mtm_sat = true;
     }
     // if we're in the feasible region but haven't found improvements in the last n iterations,
     // perturb
     bool should_perturb = false;
-    if (fj_cpu.violated_constraints.empty() &&
-        fj_cpu.iterations - fj_cpu.last_feasible_entrance_iter > fj_cpu.perturb_interval) {
-      should_perturb                     = true;
-      fj_cpu.last_feasible_entrance_iter = fj_cpu.iterations;
+    if (fj_cpu->violated_constraints.empty() &&
+        fj_cpu->iterations - fj_cpu->last_feasible_entrance_iter > fj_cpu->perturb_interval) {
+      should_perturb                      = true;
+      fj_cpu->last_feasible_entrance_iter = fj_cpu->iterations;
     }
 
     if (score > fj_staged_score_t::zero() && !should_perturb) {
-      apply_move(fj_cpu, move.var_idx, move.value, false);
+      apply_move(*fj_cpu, move.var_idx, move.value, false);
       // Track move types
-      if (is_lift) fj_cpu.n_lift_moves_window++;
-      if (is_mtm_viol) fj_cpu.n_mtm_viol_moves_window++;
-      if (is_mtm_sat) fj_cpu.n_mtm_sat_moves_window++;
+      if (is_lift) fj_cpu->n_lift_moves_window++;
+      if (is_mtm_viol) fj_cpu->n_mtm_viol_moves_window++;
+      if (is_mtm_sat) fj_cpu->n_mtm_sat_moves_window++;
     } else {
       // Local Min
-      update_weights(fj_cpu);
+      update_weights(*fj_cpu);
       if (should_perturb) {
-        perturb(fj_cpu);
-        for (size_t i = 0; i < fj_cpu.cached_mtm_moves.size(); i++)
-          fj_cpu.cached_mtm_moves[i].first = 0;
+        perturb(*fj_cpu);
+        for (size_t i = 0; i < fj_cpu->cached_mtm_moves.size(); i++)
+          fj_cpu->cached_mtm_moves[i].first = 0;
       }
       thrust::tie(move, score) =
-        find_mtm_move_viol(fj_cpu, 1, true);  // pick a single random violated constraint
+        find_mtm_move_viol(*fj_cpu, 1, true);  // pick a single random violated constraint
       i_t var_idx = move.var_idx >= 0 ? move.var_idx : 0;
       f_t delta   = move.var_idx >= 0 ? move.value : 0;
-      apply_move(fj_cpu, var_idx, delta, true);
+      apply_move(*fj_cpu, var_idx, delta, true);
       ++local_mins;
-      ++fj_cpu.n_local_minima_window;
+      ++fj_cpu->n_local_minima_window;
     }
 
     // number of violated constraints is usually small (<100). recomputing from all LHSs is cheap
     // and more numerically precise than just adding to the accumulator in apply_move
-    fj_cpu.total_violations = 0;
-    for (auto cstr_idx : fj_cpu.violated_constraints) {
-      fj_cpu.total_violations += fj_cpu.view.excess_score(cstr_idx, fj_cpu.h_lhs[cstr_idx]);
+    fj_cpu->total_violations = 0;
+    for (auto cstr_idx : fj_cpu->violated_constraints) {
+      fj_cpu->total_violations += fj_cpu->view.excess_score(cstr_idx, fj_cpu->h_lhs[cstr_idx]);
     }
-    if (fj_cpu.iterations % fj_cpu.log_interval == 0) {
+    if (fj_cpu->iterations % fj_cpu->log_interval == 0) {
       CUOPT_LOG_TRACE(
         "%sCPUFJ iteration: %d/%d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, "
         "maxw %g",
-        fj_cpu.log_prefix.c_str(),
-        fj_cpu.iterations,
-        fj_cpu.settings.iteration_limit != std::numeric_limits<i_t>::max()
-          ? fj_cpu.settings.iteration_limit
+        fj_cpu->log_prefix.c_str(),
+        fj_cpu->iterations,
+        fj_cpu->settings.iteration_limit != std::numeric_limits<i_t>::max()
+          ? fj_cpu->settings.iteration_limit
           : -1,
         local_mins,
-        fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective),
-        fj_cpu.violated_constraints.size(),
-        fj_cpu.h_objective_weight,
-        fj_cpu.max_weight);
+        fj_cpu->pb_ptr->get_user_obj_from_solver_obj(fj_cpu->h_best_objective),
+        fj_cpu->violated_constraints.size(),
+        fj_cpu->h_objective_weight,
+        fj_cpu->max_weight);
     }
     // send current solution to callback every 3000 steps for diversity
-    if (fj_cpu.iterations % fj_cpu.diversity_callback_interval == 0) {
-      if (fj_cpu.diversity_callback) {
-        fj_cpu.diversity_callback(fj_cpu.h_incumbent_objective, fj_cpu.h_assignment);
+    if (fj_cpu->iterations % fj_cpu->diversity_callback_interval == 0) {
+      if (fj_cpu->diversity_callback) {
+        fj_cpu->diversity_callback(fj_cpu->h_incumbent_objective, fj_cpu->h_assignment);
       }
     }
 
     // Print timing statistics every N iterations
 #if CPUFJ_TIMING_TRACE
-    if (fj_cpu.iterations % fj_cpu.timing_stats_interval == 0 && fj_cpu.iterations > 0) {
-      print_timing_stats(fj_cpu);
+    if (fj_cpu->iterations % fj_cpu->timing_stats_interval == 0 && fj_cpu->iterations > 0) {
+      print_timing_stats(*fj_cpu);
     }
 #endif
 
-    if (fj_cpu.iterations % 100 == 0 && fj_cpu.iterations > 0) {
+    if (fj_cpu->iterations % 100 == 0 && fj_cpu->iterations > 0) {
       // Collect memory statistics
-      auto [loads, stores] = fj_cpu.memory_aggregator.collect();
-      double biased_work   = (loads + stores) * fj_cpu.work_unit_bias / 1e10;
-      fj_cpu.work_units_elapsed += biased_work;
+      auto [loads, stores] = fj_cpu->memory_aggregator.collect();
+      double biased_work   = (loads + stores) * fj_cpu->work_unit_bias / 1e10;
+      fj_cpu->work_units_elapsed += biased_work;
 
-      if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); }
+      if (fj_cpu->producer_sync != nullptr) { fj_cpu->producer_sync->notify_progress(); }
     }
 
-    cuopt_func_call(sanity_checks(fj_cpu));
-    fj_cpu.iterations++;
-    fj_cpu.iterations_since_best++;
+    cuopt_func_call(sanity_checks(*fj_cpu));
+    fj_cpu->iterations++;
+    fj_cpu->iterations_since_best++;
   }
   auto loop_end = std::chrono::high_resolution_clock::now();
   double total_time =
     std::chrono::duration_cast<std::chrono::duration<double>>(loop_end - loop_start).count();
-  double avg_time_per_iter = total_time / fj_cpu.iterations;
+  double avg_time_per_iter = fj_cpu->iterations > 0 ? total_time / fj_cpu->iterations : 0;
   CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms",
-                  fj_cpu.log_prefix.c_str(),
+                  fj_cpu->log_prefix.c_str(),
                   avg_time_per_iter * 1000.0);
 
 #if CPUFJ_TIMING_TRACE
   // Print final timing statistics
   CUOPT_LOG_TRACE("=== Final Timing Statistics ===");
-  print_timing_stats(fj_cpu);
+  print_timing_stats(*fj_cpu);
 #endif
-
-  return fj_cpu.feasible_found;
-}
-
-template <typename i_t, typename f_t>
-bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
-{
-  raft::common::nvtx::range scope("fj_cpu");
-  return cpufj_solve_loop(fj_cpu, in_time_limit);
-}
-
-template <typename i_t, typename f_t>
-cpu_fj_thread_t<i_t, f_t>::~cpu_fj_thread_t()
-{
-  this->request_termination();
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::run_worker()
-{
-  cpu_fj_solution_found = cpufj_solve_loop(*fj_cpu, time_limit);
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::on_terminate()
-{
-  if (fj_cpu) fj_cpu->halted = true;
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::on_start()
-{
-  cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null");
-  fj_cpu->halted = false;
-}
-
-template <typename i_t, typename f_t>
-void cpu_fj_thread_t<i_t, f_t>::stop_cpu_solver()
-{
-  fj_cpu->halted = true;
 }
 
 template <typename i_t, typename f_t>
@@ -1635,7 +1595,7 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> init_fj_cpu_standalone(
 
 #if MIP_INSTANTIATE_FLOAT
 template class fj_t<int, float>;
-template class cpu_fj_thread_t<int, float>;
+template void cpufj_solve(fj_cpu_climber_t<int, float>* fj_cpu, float in_time_limit);
 template std::unique_ptr<fj_cpu_climber_t<int, float>> init_fj_cpu_standalone(
   problem_t<int, float>& problem,
   solution_t<int, float>& solution,
@@ -1645,7 +1605,7 @@ template std::unique_ptr<fj_cpu_climber_t<int, float>> init_fj_cpu_standalone(
 
 #if MIP_INSTANTIATE_DOUBLE
 template class fj_t<int, double>;
-template class cpu_fj_thread_t<int, double>;
+template void cpufj_solve(fj_cpu_climber_t<int, double>* fj_cpu, double in_time_limit);
 template std::unique_ptr<fj_cpu_climber_t<int, double>> init_fj_cpu_standalone(
   problem_t<int, double>& problem,
   solution_t<int, double>& solution,
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
index 3263609a2b..76bf158f9e 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
@@ -8,16 +8,12 @@
 #pragma once
 
 #include <atomic>
-#include <condition_variable>
 #include <functional>
 #include <limits>
-#include <mutex>
-#include <thread>
 #include <unordered_set>
 #include <vector>
 
 #include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
-#include <mip_heuristics/utilities/cpu_worker_thread.cuh>
 #include <utilities/memory_instrumentation.hpp>
 #include <utilities/producer_sync.hpp>
 
@@ -126,7 +122,7 @@ struct fj_cpu_climber_t {
 
   // vector<bool> is actually likely beneficial here since we're memory bound
   std::vector<bool> flip_move_computed;
-  ;
+
   // CSR nnz offset -> (delta, score)
   std::vector<std::pair<f_t, fj_staged_score_t>> cached_mtm_moves;
 
@@ -194,21 +190,8 @@ struct fj_cpu_climber_t {
 };
 
 template <typename i_t, typename f_t>
-struct cpu_fj_thread_t : public cpu_worker_thread_base_t<cpu_fj_thread_t<i_t, f_t>> {
-  ~cpu_fj_thread_t();
-
-  void run_worker();
-  void on_terminate();
-  void on_start();
-  bool get_result() { return cpu_fj_solution_found; }
-
-  void stop_cpu_solver();
-
-  std::atomic<bool> cpu_fj_solution_found{false};
-  f_t time_limit{+std::numeric_limits<f_t>::infinity()};
-  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_cpu;
-  fj_t<i_t, f_t>* fj_ptr{nullptr};
-};
+void cpufj_solve(fj_cpu_climber_t<i_t, f_t>* fj_cpu,
+                 f_t in_time_limit = std::numeric_limits<f_t>::infinity());
 
 // Standalone CPUFJ init for running without full fj_t infrastructure (avoids GPU allocations).
 // Used for early CPUFJ during presolve.
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index da29511d70..9ea7743e5e 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -20,10 +20,6 @@
 
 #include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
 
-#include <cuda_profiler_api.h>
-
-#include <future>
-
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
@@ -47,20 +43,11 @@ local_search_t<i_t, f_t>::local_search_t(mip_solver_context_t<i_t, f_t>& context
     problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr)
 {
   const int n_cpufj = context.settings.heuristic_params.num_cpufj_threads;
-  for (int i = 0; i < n_cpufj; ++i) {
-    ls_cpu_fj.push_back(std::make_unique<cpu_fj_thread_t<i_t, f_t>>());
-    ls_cpu_fj.back()->fj_ptr = &fj;
-  }
-  scratch_cpu_fj.push_back(std::make_unique<cpu_fj_thread_t<i_t, f_t>>());
-  scratch_cpu_fj.back()->fj_ptr   = &fj;
-  scratch_cpu_fj_on_lp_opt.fj_ptr = &fj;
-
+  ls_cpu_fj.resize(n_cpufj);
+  scratch_cpu_fj.resize(1);
   fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit;
 }
 
-static double local_search_best_obj       = std::numeric_limits<double>::max();
-static population_t<int, double>* pop_ptr = nullptr;
-
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t>& population)
 {
@@ -75,37 +62,38 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
                0.0);
   solution.clamp_within_bounds();
   i_t counter = 0;
-  for (auto& cpu_fj_ptr : scratch_cpu_fj) {
-    auto& cpu_fj = *cpu_fj_ptr;
+  for (auto& cpu_fj : scratch_cpu_fj) {
     if (counter > 0) solution.assign_random_within_bounds(0.4);
-    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
-                                                      default_weights,
-                                                      default_weights,
-                                                      0.,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      /*randomize=*/counter > 0);
-
-    cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": ";
-    cpu_fj.fj_cpu->improvement_callback =
-      [&population, problem_ptr = context.problem_ptr](
+    cpu_fj = fj.create_cpu_climber(solution,
+                                   default_weights,
+                                   default_weights,
+                                   0.,
+                                   context.preempt_heuristic_solver_,
+                                   fj_settings_t{},
+                                   /*randomize=*/counter > 0);
+
+    cpu_fj->log_prefix = "******* scratch " + std::to_string(counter) + ": ";
+    cpu_fj->improvement_callback =
+      [this, &population, problem_ptr = context.problem_ptr](
         f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
         population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
         (void)problem_ptr;
-        if (obj < local_search_best_obj) {
+        if (obj < this->local_search_best_obj) {
           CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g",
                           problem_ptr->get_user_obj_from_solver_obj(obj),
                           problem_ptr->get_user_obj_from_solver_obj(
                             population.is_feasible() ? population.best_feasible().get_objective()
                                                      : std::numeric_limits<f_t>::max()));
-          local_search_best_obj = obj;
+          this->local_search_best_obj = obj;
         }
       };
     counter++;
   };
 
-  for (auto& cpu_fj_ptr : scratch_cpu_fj) {
-    cpu_fj_ptr->start_cpu_solver();
+  for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
+    auto ptr = scratch_cpu_fj[i].get();
+#pragma omp task firstprivate(ptr) depend(out : *ptr) default(none)
+    cpufj_solve(ptr);
   }
 }
 
@@ -121,34 +109,42 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
   solution_lp.copy_new_assignment(
     host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream()));
   solution_lp.round_random_nearest(500);
-  scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber(
+  scratch_cpu_fj_on_lp_opt = fj.create_cpu_climber(
     solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_);
-  scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: ";
-  scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback =
+  scratch_cpu_fj_on_lp_opt->log_prefix = "******* scratch on LP optimal: ";
+  scratch_cpu_fj_on_lp_opt->improvement_callback =
     [this, &population](f_t obj, const std::vector<f_t>& h_vec, double /*work_units*/) {
       population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ);
-      if (obj < local_search_best_obj) {
+      if (obj < this->local_search_best_obj) {
         CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g",
                         context.problem_ptr->get_user_obj_from_solver_obj(obj),
                         context.problem_ptr->get_user_obj_from_solver_obj(
                           population.is_feasible() ? population.best_feasible().get_objective()
                                                    : std::numeric_limits<f_t>::max()));
-        local_search_best_obj = obj;
+        this->local_search_best_obj = obj;
       }
     };
 
   // default weights
   cudaDeviceSynchronize();
-  scratch_cpu_fj_on_lp_opt.start_cpu_solver();
+
+#pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \
+  depend(out : *scratch_cpu_fj_on_lp_opt)
+  cpufj_solve(scratch_cpu_fj_on_lp_opt.get());
 }
 
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
 {
-  for (auto& cpu_fj_ptr : scratch_cpu_fj) {
-    cpu_fj_ptr->request_termination();
+  for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
+    if (!scratch_cpu_fj[i]) { continue; }  // no climber was created for this slot
+    scratch_cpu_fj[i]->halted = true;
+#pragma omp taskwait depend(in : *scratch_cpu_fj[i])
+  }
+  if (scratch_cpu_fj_on_lp_opt) {
+    scratch_cpu_fj_on_lp_opt->halted = true;
+#pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt)
   }
-  scratch_cpu_fj_on_lp_opt.request_termination();
 }
 
 template <typename i_t, typename f_t>
@@ -164,29 +160,29 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
                0.0);
   solution.clamp_within_bounds();
 
-  deterministic_cpu_fj.fj_ptr = &fj;
-  deterministic_cpu_fj.fj_cpu = fj.create_cpu_climber(solution,
-                                                      default_weights,
-                                                      default_weights,
-                                                      0.,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      /*randomize=*/true);
+  deterministic_cpu_fj = fj.create_cpu_climber(solution,
+                                               default_weights,
+                                               default_weights,
+                                               0.,
+                                               context.preempt_heuristic_solver_,
+                                               fj_settings_t{},
+                                               /*randomize=*/true);
 
-  deterministic_cpu_fj.fj_cpu->log_prefix = "******* deterministic CPUFJ: ";
+  deterministic_cpu_fj->log_prefix = "******* deterministic CPUFJ: ";
 
   // Register with producer_sync for B&B synchronization
-  producer_sync_t& producer_sync             = bb.get_producer_sync();
-  deterministic_cpu_fj.fj_cpu->producer_sync = &producer_sync;
-  producer_sync.register_producer(&deterministic_cpu_fj.fj_cpu->work_units_elapsed);
+  producer_sync_t& producer_sync      = bb.get_producer_sync();
+  deterministic_cpu_fj->producer_sync = &producer_sync;
+  producer_sync.register_producer(&deterministic_cpu_fj->work_units_elapsed);
 
   // Set up callback to send solutions to B&B with work unit timestamps
-  deterministic_cpu_fj.fj_cpu->improvement_callback =
+  deterministic_cpu_fj->improvement_callback =
     [&bb](f_t obj, const std::vector<f_t>& h_vec, double work_units) {
       bb.queue_external_solution_deterministic(h_vec, work_units);
     };
 
-  deterministic_cpu_fj.start_cpu_solver();
+#pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj)
+  cpufj_solve(deterministic_cpu_fj.get());
 
   // Signal that registration is complete - B&B can now wait on producers
   producer_sync.registration_complete();
@@ -195,12 +191,14 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
 template <typename i_t, typename f_t>
 void local_search_t<i_t, f_t>::stop_cpufj_deterministic()
 {
-  if (deterministic_cpu_fj.fj_cpu) {
-    if (deterministic_cpu_fj.fj_cpu->producer_sync) {
-      deterministic_cpu_fj.fj_cpu->producer_sync->deregister_producer(
-        &deterministic_cpu_fj.fj_cpu->work_units_elapsed);
+  if (deterministic_cpu_fj) {
+    if (deterministic_cpu_fj->producer_sync) {
+      deterministic_cpu_fj->producer_sync->deregister_producer(
+        &deterministic_cpu_fj->work_units_elapsed);
     }
-    deterministic_cpu_fj.request_termination();
+
+    deterministic_cpu_fj->halted = true;
+#pragma omp taskwait depend(in : *deterministic_cpu_fj)
   }
 }
 
@@ -233,48 +231,44 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   }
   auto h_weights          = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream());
   auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream());
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    auto& cpu_fj  = *cpu_fj_ptr;
-    cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution,
-                                                      h_weights,
-                                                      h_weights,
-                                                      h_objective_weight,
-                                                      context.preempt_heuristic_solver_,
-                                                      fj_settings_t{},
-                                                      true);
+  for (auto& cpu_fj : ls_cpu_fj) {
+    cpu_fj = fj.create_cpu_climber(solution,
+                                   h_weights,
+                                   h_weights,
+                                   h_objective_weight,
+                                   context.preempt_heuristic_solver_,
+                                   fj_settings_t{},
+                                   true);
   }
 
   auto solution_copy = solution;
 
   // Start CPU solver in background thread
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    cpu_fj_ptr->start_cpu_solver();
-  }
+#pragma omp taskgroup
+  {
+#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup
+    for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+      cpufj_solve(ls_cpu_fj[i].get());
+    }
 
-  // Run GPU solver and measure execution time
-  auto gpu_fj_start         = std::chrono::high_resolution_clock::now();
-  in_fj.settings.time_limit = timer.remaining_time();
-  in_fj.solve(solution);
+    // Run GPU solver
+    in_fj.settings.time_limit = timer.remaining_time();
+    in_fj.solve(solution);
 
-  // Stop CPU solver
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    cpu_fj_ptr->stop_cpu_solver();
+    for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+      ls_cpu_fj[i]->halted = true;
+    }
   }
 
-  auto gpu_fj_end        = std::chrono::high_resolution_clock::now();
-  double gpu_fj_duration = std::chrono::duration<double>(gpu_fj_end - gpu_fj_start).count();
-
   solution_t<i_t, f_t> solution_cpu(*solution.problem_ptr);
-
   f_t best_cpu_obj = std::numeric_limits<f_t>::max();
-  // // Wait for CPU solver to finish
-  for (auto& cpu_fj_ptr : ls_cpu_fj) {
-    bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver();
-    if (cpu_sol_found) {
-      f_t cpu_obj = cpu_fj_ptr->fj_cpu->h_best_objective;
+
+  for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+    if (ls_cpu_fj[i]->feasible_found) {
+      f_t cpu_obj = ls_cpu_fj[i]->h_best_objective;
       if (cpu_obj < best_cpu_obj) {
         best_cpu_obj = cpu_obj;
-        solution_cpu.copy_new_assignment(cpu_fj_ptr->fj_cpu->h_best_assignment);
+        solution_cpu.copy_new_assignment(ls_cpu_fj[i]->h_best_assignment);
         solution_cpu.compute_feasibility();
       }
     }
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh
index 94493ebcb3..9befd34ab5 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cuh
+++ b/cpp/src/mip_heuristics/local_search/local_search.cuh
@@ -11,16 +11,10 @@
 #include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
 #include <mip_heuristics/local_search/feasibility_pump/feasibility_pump.cuh>
 #include <mip_heuristics/local_search/line_segment_search/line_segment_search.cuh>
-#include <mip_heuristics/solution/solution.cuh>
 #include <mip_heuristics/solver.cuh>
+#include <utilities/omp_helpers.hpp>
 #include <utilities/timer.hpp>
 
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-
 namespace cuopt::linear_programming::dual_simplex {
 template <typename i_t, typename f_t>
 class branch_and_bound_t;
@@ -126,12 +120,15 @@ class local_search_t {
   feasibility_pump_t<i_t, f_t> fp;
   std::mt19937 rng;
 
-  std::vector<std::unique_ptr<cpu_fj_thread_t<i_t, f_t>>> ls_cpu_fj;
-  std::vector<std::unique_ptr<cpu_fj_thread_t<i_t, f_t>>> scratch_cpu_fj;
-  cpu_fj_thread_t<i_t, f_t> scratch_cpu_fj_on_lp_opt;
-  cpu_fj_thread_t<i_t, f_t> deterministic_cpu_fj;
+  std::vector<std::unique_ptr<fj_cpu_climber_t<i_t, f_t>>> ls_cpu_fj;
+  std::vector<std::unique_ptr<fj_cpu_climber_t<i_t, f_t>>> scratch_cpu_fj;
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> scratch_cpu_fj_on_lp_opt;
+  std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> deterministic_cpu_fj;
   problem_t<i_t, f_t> problem_with_objective_cut;
   bool cutting_plane_added_for_active_run{false};
+
+  omp_atomic_t<f_t> local_search_best_obj{std::numeric_limits<f_t>::max()};
+  population_t<i_t, f_t>* pop_ptr{nullptr};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
index 8b57cc7019..ed0b91466d 100644
--- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
+++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh
@@ -34,7 +34,7 @@ class bound_presolve_t {
   struct settings_t {
     f_t time_limit{60.0};
     i_t iteration_limit{std::numeric_limits<i_t>::max()};
-    i_t num_threads = -1;
+    i_t num_tasks = -1;
     bool parallel_bounds_update{true};
   };
 
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index 24cac7129f..74e04c89bb 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -246,7 +246,8 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
   std::vector<int2> constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1});
   std::unordered_set<int> cnstr_pair;
 
-#pragma omp parallel for private(cnstr_pair)
+#pragma omp taskloop private(cnstr_pair) default(none) \
+  shared(offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h)
   for (int cnstr = 0; cnstr < problem.n_constraints; ++cnstr) {
     for (int jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) {
       int var = variables[jj];
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index e45f2394ed..0dc2ad6e8e 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -22,6 +22,7 @@
 #include <utilities/timer.hpp>
 
 #include <unordered_set>
+#include <utilities/omp_helpers.hpp>
 
 namespace cuopt::linear_programming::detail {
 
@@ -860,18 +861,17 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   bound_presolve.settings.iteration_limit = 50;
   bound_presolve.settings.time_limit      = timer.remaining_time();
 
-  size_t num_threads = bound_presolve.settings.num_threads < 0
-                         ? 0.2 * omp_get_max_threads()
-                         : bound_presolve.settings.num_threads;
-  num_threads        = std::clamp<size_t>(num_threads, 1, 8);
+  size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 0.2 * omp_get_max_threads()
+                                                           : bound_presolve.settings.num_tasks;
+  num_tasks        = 1;  // std::clamp<size_t>(num_tasks, 1, 8);
 
   // Create a vector of multi_probe_t objects
   std::vector<multi_probe_t<i_t, f_t>> multi_probe_presolve_pool;
-  std::vector<std::vector<std::tuple<f_t, i_t, f_t, f_t>>> modification_vector_pool(num_threads);
-  std::vector<std::vector<substitution_t<i_t, f_t>>> substitution_vector_pool(num_threads);
+  std::vector<std::vector<std::tuple<f_t, i_t, f_t, f_t>>> modification_vector_pool(num_tasks);
+  std::vector<std::vector<substitution_t<i_t, f_t>>> substitution_vector_pool(num_tasks);
 
   // Initialize multi_probe_presolve_pool
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < num_tasks; i++) {
     multi_probe_presolve_pool.emplace_back(bound_presolve.context);
     multi_probe_presolve_pool[i].resize(problem);
     multi_probe_presolve_pool[i].compute_stats = true;
@@ -890,23 +890,39 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   // are visible before any per-thread kernel can reference that memory.
   problem.handle_ptr->sync_stream();
 
-// Main parallel loop
-#pragma omp parallel num_threads(num_threads)
-  {
-    for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) {
-      if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; }
-      size_t step_end = std::min(step_start + step_size, priority_indices.size());
-
-#pragma omp for
-      for (size_t i = step_start; i < step_end; ++i) {
+  // Main parallel loop
+  for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) {
+    if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; }
+    size_t step_end = std::min(step_start + step_size, priority_indices.size());
+
+#pragma omp taskloop num_tasks(num_tasks) default(none) firstprivate(step_start, step_end) \
+  shared(num_tasks,                                                                        \
+           priority_indices,                                                               \
+           timer,                                                                          \
+           multi_probe_presolve_pool,                                                      \
+           bound_presolve,                                                                 \
+           problem,                                                                        \
+           h_var_bounds,                                                                   \
+           h_integer_indices,                                                              \
+           n_of_implied_singletons,                                                        \
+           n_of_cached_probings,                                                           \
+           problem_is_infeasible,                                                          \
+           modification_vector_pool,                                                       \
+           substitution_vector_pool)
+    for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
+      size_t n                   = step_end - step_start;
+      size_t begin               = step_start + n * task_id / num_tasks;  // offset into this step
+      size_t end                 = step_start + n * (task_id + 1) / num_tasks;
+      auto& multi_probe_presolve = multi_probe_presolve_pool[task_id];
+      auto& modification_vector  = modification_vector_pool[task_id];
+      auto& substitution_vector  = substitution_vector_pool[task_id];
+      if (timer.check_time_limit()) { continue; }
+
+      for (size_t i = begin; i < end; ++i) {
         auto var_idx = priority_indices[i];
         if (timer.check_time_limit()) { continue; }
 
-        int thread_idx = omp_get_thread_num();
-        CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, thread_idx);
-
-        auto& multi_probe_presolve = multi_probe_presolve_pool[thread_idx];
-
+        CUOPT_LOG_TRACE("Computing probing cache for var %d on task %zu", var_idx, task_id);
         compute_cache_for_var<i_t, f_t>(var_idx,
                                         bound_presolve,
                                         problem,
@@ -916,30 +932,29 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
                                         n_of_implied_singletons,
                                         n_of_cached_probings,
                                         problem_is_infeasible,
-                                        modification_vector_pool[thread_idx],
-                                        substitution_vector_pool[thread_idx],
+                                        modification_vector,
+                                        substitution_vector,
                                         timer,
                                         problem.handle_ptr->get_device());
       }
     }
-#pragma omp single
-    {
-      // TODO when we have determinism, check current threads work/time counter and filter queue
-      // items that are smaller or equal to that
-      apply_modification_queue_to_problem(modification_vector_pool, problem);
-      // copy host bounds again, because we changed some problem bounds
-      raft::copy(h_var_bounds.data(),
-                 problem.variable_bounds.data(),
-                 h_var_bounds.size(),
-                 problem.handle_ptr->get_stream());
-      problem.handle_ptr->sync_stream();
-      if (n_of_implied_singletons - last_it_implied_singletons <
-          (size_t)std::max(2, (min(100, problem.n_variables / 50)))) {
-        early_exit = true;
-      }
-      last_it_implied_singletons = n_of_implied_singletons;
+
+    // TODO when we have determinism, check current threads work/time counter and filter queue
+    // items that are smaller or equal to that
+    apply_modification_queue_to_problem(modification_vector_pool, problem);
+    // copy host bounds again, because we changed some problem bounds
+    raft::copy(h_var_bounds.data(),
+               problem.variable_bounds.data(),
+               h_var_bounds.size(),
+               problem.handle_ptr->get_stream());
+    problem.handle_ptr->sync_stream();
+    if (n_of_implied_singletons - last_it_implied_singletons <
+        (size_t)std::max(2, (min(100, problem.n_variables / 50)))) {
+      early_exit = true;
     }
+    last_it_implied_singletons = n_of_implied_singletons;
   }  // end of step
+
   apply_substitution_queue_to_problem(substitution_vector_pool, problem);
   CUOPT_LOG_DEBUG("Total number of cached probings %lu number of implied singletons %lu",
                   n_of_cached_probings.load(),
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index 4e9cd6a2a5..0932f0e5a9 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -47,6 +47,7 @@
 #include <rmm/cuda_stream.hpp>
 
 #include <cuda_profiler_api.h>
+#include <omp.h>
 
 namespace cuopt::linear_programming {
 
@@ -80,10 +81,10 @@ static void invoke_solution_callbacks(
 }
 
 template <typename i_t, typename f_t>
-mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
-                                 mip_solver_settings_t<i_t, f_t> const& settings,
-                                 timer_t& timer,
-                                 f_t initial_cutoff = std::numeric_limits<f_t>::infinity())
+mip_solution_t<i_t, f_t> run_mip_solver(detail::problem_t<i_t, f_t>& problem,
+                                        mip_solver_settings_t<i_t, f_t> const& settings,
+                                        timer_t& timer,
+                                        f_t initial_cutoff = std::numeric_limits<f_t>::infinity())
 {
   try {
     raft::common::nvtx::range fun_scope("run_mip");
@@ -228,8 +229,8 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
 }
 
 template <typename i_t, typename f_t>
-mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
-                                   mip_solver_settings_t<i_t, f_t> const& settings_const)
+mip_solution_t<i_t, f_t> solve_mip_helper(optimization_problem_t<i_t, f_t>& op_problem,
+                                          mip_solver_settings_t<i_t, f_t> const& settings_const)
 {
   try {
     mip_solver_settings_t<i_t, f_t> settings(settings_const);
@@ -435,10 +436,9 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
       CUOPT_LOG_INFO("Writing presolved problem to file: %s", settings.presolve_file.c_str());
       presolve_result_opt->reduced_problem.write_to_mps(settings.presolve_file);
     }
-
     // early_best_user_obj is in user-space.
     // run_mip stores it in context.initial_cutoff and converts to target spaces as needed.
-    auto sol = run_mip(problem, settings, timer, early_best_user_obj);
+    auto sol = run_mip_solver(problem, settings, timer, early_best_user_obj);
 
     if (run_presolve) {
       auto status_to_skip = sol.get_termination_status() == mip_termination_status_t::TimeLimit ||
@@ -502,6 +502,50 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
     throw;
   }
 }
+// Entry point of the MIP solve. Opens the OpenMP parallel region in which the solve runs: only
+// the master thread calls solve_mip_helper(), and an exception thrown there is captured and
+// rethrown once the region has ended.
+template <typename i_t, typename f_t>
+mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
+                                   mip_solver_settings_t<i_t, f_t> const& settings_const)
+{
+  // A negative num_cpu_threads selects the OpenMP default; both paths are raised to at least 4.
+  i_t num_threads = settings_const.num_cpu_threads < 0 ? omp_get_max_threads()
+                                                       : settings_const.num_cpu_threads;
+  if (num_threads < 4) {
+    CUOPT_LOG_ERROR(
+      "The MIP solver requires at least 4 CPU threads! Setting the number of threads to 4.");
+    num_threads = 4;
+  }
+
+  // TODO: Remove this after converting deterministic B&B to use tasks. This allows
+  // creating a nested parallel region.
+  omp_set_max_active_levels(2);
+
+  // Placeholder result; replaced by the value returned from solve_mip_helper().
+  mip_solution_t<i_t, f_t> sol(mip_termination_status_t::NoTermination,
+                               solver_stats_t<i_t, f_t>{},
+                               op_problem.get_handle_ptr()->get_stream());
+  std::exception_ptr exception;
+
+#pragma omp parallel num_threads(num_threads) default(none) \
+  shared(sol, op_problem, settings_const, exception)
+  {
+#pragma omp master
+    {
+      try {
+        sol = solve_mip_helper<i_t, f_t>(op_problem, settings_const);
+      } catch (...) {
+        // We cannot throw inside an OpenMP parallel region. So we need to catch and then
+        // re-throw later.
+        exception = std::current_exception();
+      }
+    }
+  }
+
+  if (exception) { std::rethrow_exception(exception); }
+  return sol;
+}
 
 template <typename i_t, typename f_t>
 mip_solution_t<i_t, f_t> solve_mip(
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 737d81201f..e8336a0818 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -309,12 +309,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
   dual_simplex::probing_implied_bound_t<i_t, f_t> probing_implied_bound;
 
-  i_t num_threads = 0;
-  if (context.settings.num_cpu_threads < 0) {
-    num_threads = omp_get_max_threads();
-  } else {
-    num_threads = std::max(1, context.settings.num_cpu_threads);
-  }
+  i_t num_threads = omp_get_num_threads();
 
   if (!context.settings.heuristics_only) {
     // Convert the presolved problem to dual_simplex::user_problem_t
@@ -330,7 +325,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     // Fill in the settings for branch and bound
     branch_and_bound_settings.time_limit           = timer_.get_time_limit();
     branch_and_bound_settings.node_limit           = context.settings.node_limit;
-    branch_and_bound_settings.num_threads          = num_threads - 1;
+    branch_and_bound_settings.num_threads          = std::max(num_threads - 1, 1);
     branch_and_bound_settings.print_presolve_stats = false;
     branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
     branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
@@ -455,33 +450,25 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 
     if (timer_.check_time_limit()) {
       CUOPT_LOG_INFO("Time limit reached during B&B setup");
-      solution_t<i_t, f_t> sol(*context.problem_ptr);
       context.stats.total_solve_time = timer_.elapsed_time();
       context.problem_ptr->post_process_solution(sol);
       return sol;
     }
   }
 
-#pragma omp parallel num_threads(num_threads) default(none) \
-  shared(sol, branch_and_bound, branch_and_bound_status, branch_and_bound_solution, dm, context)
+#pragma omp taskgroup
   {
-#pragma omp master
-    {
-      if (!context.settings.heuristics_only) {
-#pragma omp task
-        {
-          branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution);
-        }
-      }
-
-#pragma omp task
+    if (!context.settings.heuristics_only) {
+#pragma omp task default(none) \
+  shared(branch_and_bound, branch_and_bound_solution, branch_and_bound_status)
       {
-        // Start the primal heuristics
-        context.diversity_manager_ptr = &dm;
-        // Start the primal heuristics
-        sol = dm.run_solver();
+        branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution);
       }
     }
+
+    // Start the primal heuristics
+    context.diversity_manager_ptr = &dm;
+    sol                           = dm.run_solver();
   }
 
   if (!context.settings.heuristics_only) {
@@ -508,7 +495,6 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
   }
   context.stats.total_solve_time = timer_.elapsed_time();
   context.problem_ptr->post_process_solution(sol);
-  dm.rins.stop_rins();
   return sol;
 }
 
diff --git a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh b/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh
deleted file mode 100644
index 2b982e1f47..0000000000
--- a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
- * reserved. SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-#include <utilities/logger.hpp>
-
-namespace cuopt::linear_programming::detail {
-
-template <typename Derived>
-class cpu_worker_thread_base_t {
- public:
-  cpu_worker_thread_base_t();
-  ~cpu_worker_thread_base_t();
-
-  void start_cpu_solver();
-  bool wait_for_cpu_solver();
-
-  // Derived classes MUST call this in their destructor before the base destructor runs.
-  // This ensures on_terminate() is called while the derived object is still fully alive.
-  void request_termination();
-
-  // Internal method for thread management - safe to call during destruction
-  void join_worker();
-  void cpu_worker_thread();
-
-  std::thread cpu_worker;
-  std::mutex cpu_mutex;
-  std::condition_variable cpu_cv;
-  std::atomic<bool> should_stop{false};
-  std::atomic<bool> cpu_thread_should_start{false};
-  std::atomic<bool> cpu_thread_done{true};
-  std::atomic<bool> cpu_thread_terminate{false};
-};
-
-template <typename Derived>
-cpu_worker_thread_base_t<Derived>::cpu_worker_thread_base_t()
-{
-  cpu_worker = std::thread(&cpu_worker_thread_base_t<Derived>::cpu_worker_thread, this);
-}
-
-template <typename Derived>
-cpu_worker_thread_base_t<Derived>::~cpu_worker_thread_base_t()
-{
-  // Note: We don't call on_terminate() here since the derived object is already destroyed.
-  join_worker();
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::cpu_worker_thread()
-{
-  while (!cpu_thread_terminate) {
-    {
-      std::unique_lock<std::mutex> lock(cpu_mutex);
-      cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; });
-
-      if (cpu_thread_terminate) break;
-
-      cpu_thread_done         = false;
-      cpu_thread_should_start = false;
-    }
-
-    static_cast<Derived*>(this)->run_worker();
-
-    {
-      std::lock_guard<std::mutex> lock(cpu_mutex);
-      cpu_thread_done = true;
-    }
-    cpu_cv.notify_all();
-  }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::request_termination()
-{
-  bool should_terminate = false;
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    if (cpu_thread_terminate) return;
-    cpu_thread_terminate = true;
-    should_terminate     = true;
-    static_cast<Derived*>(this)->on_terminate();
-  }
-
-  if (should_terminate) {
-    cpu_cv.notify_one();
-    join_worker();
-  }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::join_worker()
-{
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    if (!cpu_thread_terminate) { cpu_thread_terminate = true; }
-  }
-  cpu_cv.notify_one();
-
-  if (cpu_worker.joinable()) { cpu_worker.join(); }
-}
-
-template <typename Derived>
-void cpu_worker_thread_base_t<Derived>::start_cpu_solver()
-{
-  {
-    std::lock_guard<std::mutex> lock(cpu_mutex);
-    cpu_thread_done         = false;
-    cpu_thread_should_start = true;
-    static_cast<Derived*>(this)->on_start();
-  }
-  cpu_cv.notify_one();
-}
-
-template <typename Derived>
-bool cpu_worker_thread_base_t<Derived>::wait_for_cpu_solver()
-{
-  auto wait_start = std::chrono::high_resolution_clock::now();
-  std::unique_lock<std::mutex> lock(cpu_mutex);
-  cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; });
-  auto wait_end    = std::chrono::high_resolution_clock::now();
-  double wait_time = std::chrono::duration<double>(wait_end - wait_start).count();
-  if (wait_time > 1.0) { CUOPT_LOG_DEBUG("CPU thread wait time: %.2f seconds", wait_time); }
-
-  return static_cast<Derived*>(this)->get_result();
-}
-
-}  // namespace cuopt::linear_programming::detail

From 1bb5d10215c036c2c83b3129966a7adc93eb4f6f Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Tue, 14 Apr 2026 18:50:05 +0200
Subject: [PATCH 03/14] fixed compilation. removed debug code from
 probing_cache.cu

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/diversity/diversity_manager.cu | 3 ---
 cpp/src/mip_heuristics/presolve/probing_cache.cu      | 4 ++--
 cpp/src/mip_heuristics/solve.cu                       | 3 ++-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
index b8dc3d33bf..f53648b0f8 100644
--- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu
+++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu
@@ -603,18 +603,15 @@ solution_t<i_t, f_t> diversity_manager_t<i_t, f_t>::run_solver()
 
   generate_solution(timer.remaining_time(), false);
   if (timer.check_time_limit()) {
-    rins.stop_rins();
     population.add_external_solutions_to_population();
     return population.best_feasible();
   }
   if (check_b_b_preemption()) {
-    rins.stop_rins();
     population.add_external_solutions_to_population();
     return population.best_feasible();
   }
 
   run_fp_alone();
-  rins.stop_rins();
   population.add_external_solutions_to_population();
   return population.best_feasible();
 };
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 0dc2ad6e8e..cfa177eaf5 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -861,9 +861,9 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   bound_presolve.settings.iteration_limit = 50;
   bound_presolve.settings.time_limit      = timer.remaining_time();
 
-  size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 0.2 * omp_get_max_threads()
+  size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 0.2 * omp_get_num_threads()
                                                            : bound_presolve.settings.num_tasks;
-  num_tasks        = 1;  // std::clamp<size_t>(num_tasks, 1, 8);
+  num_tasks        = std::clamp<size_t>(num_tasks, 1, 8);
 
   // Create a vector of multi_probe_t objects
   std::vector<multi_probe_t<i_t, f_t>> multi_probe_presolve_pool;
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index 7207e8acd9..ab6f324175 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -472,7 +472,8 @@ mip_solution_t<i_t, f_t> solve_mip_helper(optimization_problem_t<i_t, f_t>& op_p
     }
     // early_best_user_obj is in user-space.
     // run_mip stores it in context.initial_upper_bound and converts to target spaces as needed.
-    auto sol = run_mip(problem, settings, timer, early_best_user_obj, early_best_user_assignment);
+    auto sol =
+      run_mip_solver(problem, settings, timer, early_best_user_obj, early_best_user_assignment);
     const f_t cuopt_presolve_time = sol.get_stats().presolve_time;
 
     if (run_presolve) {

From a7f2eb6830ea19a29934d57f1a8cff050d63ef9d Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 11:27:46 +0200
Subject: [PATCH 04/14] addressed coderabbit reviews. added some comments and
 logs

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp  |  4 +++-
 cpp/src/branch_and_bound/pseudo_costs.cpp      |  6 ++++++
 cpp/src/mip_heuristics/diversity/lns/rins.cu   |  7 ++++---
 .../feasibility_jump/early_cpufj.cu            |  2 ++
 .../mip_heuristics/feasibility_jump/fj_cpu.cu  |  2 +-
 .../local_search/local_search.cu               | 18 ++++++++++++++++--
 .../conditional_bound_strengthening.cu         | 13 ++++++++-----
 .../mip_heuristics/presolve/probing_cache.cu   |  2 ++
 cpp/src/mip_heuristics/solve.cu                | 17 ++++++++++-------
 cpp/src/mip_heuristics/solver.cu               |  2 +-
 10 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 6fb5955edd..367dfbb416 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1800,6 +1800,8 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
 {
+  raft::common::nvtx::range scope("BB::single_threaded_solve");
+
   branch_and_bound_worker_t<i_t, f_t> worker(0, original_lp_, Arow_, var_types_, settings_);
 
   f_t lower_bound = get_lower_bound();
@@ -2634,7 +2636,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     } else {
       single_threaded_solve();
     }
-  }
+  }  // Implicit barrier for all tasks created within the group
 
   is_running_ = false;
 
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index bda91cf04f..47c460cc7b 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1000,6 +1000,8 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       basis_update_mpf_t<i_t, f_t>& basis_factors,
                       pseudo_costs_t<i_t, f_t>& pc)
 {
+  raft::common::nvtx::range scope("BB::strong_branching");
+
   pc.resize(original_lp.num_cols);
   pc.strong_branch_down.assign(fractional.size(), 0);
   pc.strong_branch_up.assign(fractional.size(), 0);
@@ -1304,6 +1306,8 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
+  raft::common::nvtx::range scope("BB::pseudocost_branching");
+
   i_t branch_var = fractional[0];
   f_t max_score  = -1;
   i_t num_initialized_down;
@@ -1350,6 +1354,8 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   const std::vector<i_t>& new_slacks,
   const lp_problem_t<i_t, f_t>& original_lp)
 {
+  raft::common::nvtx::range scope("BB::reliability_branching");
+
   constexpr f_t eps                      = 1e-6;
   f_t start_time                         = bnb_stats.start_time;
   i_t branch_var                         = fractional[0];
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index c6a1bba7e7..6ddc645e4e 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -61,6 +61,8 @@ void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objec
 
     if (population_ready) {
       lp_optimal_solution = solution;
+
+      CUOPT_LOG_INFO("Launching RINS task");
 #pragma omp task default(none)
       run_rins();
     } else {
@@ -82,9 +84,7 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::run_rins()
 {
   raft::common::nvtx::range fun_scope("Running RINS");
-
-  if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
-
+  RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
   // Do not make assertions based on problem_ptr. The original problem may have been modified within
@@ -226,6 +226,7 @@ void rins_t<i_t, f_t>::run_rins()
                           true);
   fj_cpu->log_prefix = "[RINS] ";
 
+  CUOPT_LOG_INFO("Launching CPUFJ (RINS) task");
 #pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none)
   cpufj_solve(fj_cpu.get(), time_limit);
 
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index 15ff3f4eb7..86d91c0539 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -43,6 +43,7 @@ void early_cpufj_t<i_t, f_t>::start()
                                          const std::vector<f_t>& assignment,
                                          double) { this->try_update_best(solver_obj, assignment); };
 
+  CUOPT_LOG_INFO("Launching early CPUFJ task");
 #pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none)
   cpufj_solve(fj_cpu_.get());
 }
@@ -56,6 +57,7 @@ void early_cpufj_t<i_t, f_t>::stop()
 
   fj_cpu_->halted = true;
 #pragma omp taskwait depend(in : *fj_cpu_)
+  CUOPT_LOG_INFO("Early CPUFJ task was stopped");
 
   CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
                   fj_cpu_ ? fj_cpu_->iterations : 0,
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
index aea6295528..ee8ff9a53c 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
@@ -1421,7 +1421,7 @@ void cpufj_solve(fj_cpu_climber_t<i_t, f_t>* fj_cpu, f_t in_time_limit)
 {
   i_t local_mins       = 0;
   auto loop_start      = std::chrono::high_resolution_clock::now();
-  auto time_limit      = std::chrono::milliseconds((int)(in_time_limit * 1000));
+  auto time_limit      = std::chrono::milliseconds(std::floor(in_time_limit * 1000.0));
   auto loop_time_start = std::chrono::high_resolution_clock::now();
 
   // Initialize feature tracking
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index 9ea7743e5e..d5160862e5 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -90,6 +90,8 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
     counter++;
   };
 
+  CUOPT_LOG_INFO("Launching %d scratch CPUFJ tasks", scratch_cpu_fj.size());
+
   for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
     auto ptr = scratch_cpu_fj[i].get();
 #pragma omp task firstprivate(ptr) depend(out : *ptr) default(none)
@@ -128,6 +130,8 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
   // default weights
   cudaDeviceSynchronize();
 
+  CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task");
+
 #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \
   depend(out : *scratch_cpu_fj_on_lp_opt)
   cpufj_solve(scratch_cpu_fj_on_lp_opt.get());
@@ -144,6 +148,8 @@ void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
   if (scratch_cpu_fj_on_lp_opt) {
     scratch_cpu_fj_on_lp_opt->halted = true;
 #pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt)
+
+    CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped");
   }
 }
 
@@ -181,6 +187,7 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
       bb.queue_external_solution_deterministic(h_vec, work_units);
     };
 
+  CUOPT_LOG_INFO("Launching deterministic CPUFJ task");
 #pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj)
   cpufj_solve(deterministic_cpu_fj.get());
 
@@ -199,6 +206,7 @@ void local_search_t<i_t, f_t>::stop_cpufj_deterministic()
 
     deterministic_cpu_fj->halted = true;
 #pragma omp taskwait depend(in : *deterministic_cpu_fj)
+    CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped");
   }
 }
 
@@ -246,9 +254,13 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
   // Start CPU solver in background thread
 #pragma omp taskgroup
   {
+    if (ls_cpu_fj.size() > 0) {
+      CUOPT_LOG_INFO("Launching %d CPUFJ tasks", ls_cpu_fj.size());
+
 #pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup
-    for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
-      cpufj_solve(ls_cpu_fj[i].get());
+      for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
+        cpufj_solve(ls_cpu_fj[i].get());
+      }
     }
 
     // Run GPU solver
@@ -260,6 +272,8 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
     }
   }
 
+  CUOPT_LOG_INFO("All CPUFJ tasks were stopped");
+
   solution_t<i_t, f_t> solution_cpu(*solution.problem_ptr);
   f_t best_cpu_obj = std::numeric_limits<f_t>::max();
 
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index 74e04c89bb..846e22358a 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -246,12 +246,15 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
   std::vector<int2> constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1});
   std::unordered_set<int> cnstr_pair;
 
-#pragma omp taskloop private(cnstr_pair) default(none) \
-  shared(offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h)
-  for (int cnstr = 0; cnstr < problem.n_constraints; ++cnstr) {
-    for (int jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) {
+  i_t num_tasks = omp_get_num_threads() - 4;
+
+  CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks);
+#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(none) \
+  shared(problem, offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h)
+  for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) {
+    for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) {
       int var = variables[jj];
-      for (int kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) {
+      for (i_t kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) {
         if (reverse_constraints[kk] != cnstr) { cnstr_pair.insert(reverse_constraints[kk]); }
         if (cnstr_pair.size() == max_pair_per_row) { break; }
       }
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index cfa177eaf5..501e940a71 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -890,6 +890,8 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   // are visible before any per-thread kernel can reference that memory.
   problem.handle_ptr->sync_stream();
 
+  CUOPT_LOG_INFO("Running probing cache with %d tasks", num_tasks);
+
   // Main parallel loop
   for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) {
     if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; }
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index ab6f324175..6bf6ae9ce7 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -577,23 +577,26 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
   if (settings_const.num_cpu_threads < 0) {
     num_threads = omp_get_max_threads();
   } else {
-    if (settings_const.num_cpu_threads < 4) {
-      CUOPT_LOG_ERROR(
-        "The MIP solver requires at least 4 CPU threads! Setting the number of threads to 4.");
-    }
+    num_threads = settings_const.num_cpu_threads;
+  }
 
-    num_threads = std::max(4, settings_const.num_cpu_threads);
+  if (num_threads < 4) {
+    CUOPT_LOG_ERROR("The MIP solver requires at least 4 CPU threads!");
+    return mip_solution_t<i_t, f_t>{
+      cuopt::logic_error("The number of CPU threads is below than expected.",
+                         cuopt::error_type_t::RuntimeError),
+      op_problem.get_handle_ptr()->get_stream()};
   }
 
   // TODO: Remove this after converting deterministic B&B to use tasks. This allows
   // creating a nested parallel region.
   omp_set_max_active_levels(2);
 
-  //
   mip_solution_t<i_t, f_t> sol(mip_termination_status_t::NoTermination,
                                solver_stats_t<i_t, f_t>{},
                                op_problem.get_handle_ptr()->get_stream());
 
+  // Creates the OpenMP thread pool. It will be shared across the entire MIP solver.
 #pragma omp parallel num_threads(num_threads) default(none) \
   shared(sol, op_problem, settings_const, exception)
   {
@@ -607,7 +610,7 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
         exception = std::current_exception();
       }
     }
-  }
+  }  // Implicit barrier
 
   if (exception) { std::rethrow_exception(exception); }
   return sol;
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 694fb7fa8d..1415c68922 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -462,7 +462,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     // Start the primal heuristics
     context.diversity_manager_ptr = &dm;
     sol                           = dm.run_solver();
-  }
+  }  // implicit barrier for all tasks created within the taskgroup
 
   if (!context.settings.heuristics_only) {
     if (branch_and_bound_solution.lower_bound > -std::numeric_limits<f_t>::infinity()) {

From 50fa3d6129c27d00dc6b1f217162eb12d973e451 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 12:07:01 +0200
Subject: [PATCH 05/14] fixed incorrect starting bounds for probing cache.
 added depend clause for root relaxation.

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp    | 12 ++++++------
 cpp/src/mip_heuristics/presolve/probing_cache.cu |  8 ++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 367dfbb416..6c6ddad83f 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1887,7 +1887,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
                           basic_list,    \
                           nonbasic_list, \
                           root_vstatus_, \
-                          edge_norms_) default(none)
+                          edge_norms_) default(none) depend(out : root_status)
   {
     root_status = solve_linear_program_with_advanced_basis(original_lp_,
                                                            exploration_stats_.start_time,
@@ -1942,9 +1942,9 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
     // Check if crossover was stopped by dual simplex
     if (crossover_status == crossover_status_t::OPTIMAL) {
-      set_root_concurrent_halt(1);  // Stop dual simplex
-#pragma omp taskwait                // Wait for dual simplex to finish
-      set_root_concurrent_halt(0);  // Clear the concurrent halt flag
+      set_root_concurrent_halt(1);             // Stop dual simplex
+#pragma omp taskwait depend(in : root_status)  // Wait for dual simplex to finish
+      set_root_concurrent_halt(0);             // Clear the concurrent halt flag
       // Override the root relaxation solution with the crossover solution
       root_relax_soln = root_crossover_soln_;
       root_vstatus    = crossover_vstatus_;
@@ -1994,14 +1994,14 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       solver_name    = method_to_string(root_relax_solved_by);
 
     } else {
-#pragma omp taskwait
+#pragma omp taskwait depend(in : root_status)
       user_objective       = root_relax_soln_.user_objective;
       iter                 = root_relax_soln_.iterations;
       root_relax_solved_by = DualSimplex;
       solver_name          = "Dual Simplex";
     }
   } else {
-#pragma omp taskwait
+#pragma omp taskwait depend(in : root_status)
     user_objective       = root_relax_soln_.user_objective;
     iter                 = root_relax_soln_.iterations;
     root_relax_solved_by = DualSimplex;
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 501e940a71..25562e20a9 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -912,9 +912,9 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
            modification_vector_pool,                                                       \
            substitution_vector_pool)
     for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
-      size_t n                   = step_end - step_start;
-      size_t begin               = std::floor(static_cast<f_t>(n) * task_id / num_tasks);
-      size_t end                 = std::floor(static_cast<f_t>(n) * (task_id + 1) / num_tasks);
+      size_t n     = step_end - step_start;
+      size_t begin = step_start + std::floor(static_cast<f_t>(n) * task_id / num_tasks);
+      size_t end   = std::floor(static_cast<f_t>(n) * (task_id + 1) / num_tasks);
       auto& multi_probe_presolve = multi_probe_presolve_pool[task_id];
       auto& modification_vector  = modification_vector_pool[task_id];
       auto& substitution_vector  = substitution_vector_pool[task_id];
@@ -924,7 +924,7 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
         auto var_idx = priority_indices[i];
         if (timer.check_time_limit()) { continue; }
 
-        CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, id);
+        CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, task_id);
         compute_cache_for_var<i_t, f_t>(var_idx,
                                         bound_presolve,
                                         problem,

From 180fbd908f82e6f4e15a9b833d32d39b02b3021f Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 12:15:07 +0200
Subject: [PATCH 06/14] reduced the verbosity of the shared clause

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp |  9 +--
 cpp/src/branch_and_bound/pseudo_costs.cpp     | 65 ++-----------------
 .../conditional_bound_strengthening.cu        |  3 +-
 .../mip_heuristics/presolve/probing_cache.cu  | 15 +----
 cpp/src/mip_heuristics/solver.cu              |  3 +-
 5 files changed, 8 insertions(+), 87 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 6c6ddad83f..b2a170b9d1 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1880,14 +1880,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
 
   // Note that we need to explicitly declared `root_status` as a shared variable here since
   // it is local to the thread that are executing the enclosing task.
-#pragma omp task shared(root_status,     \
-                          original_lp_,  \
-                          lp_settings,   \
-                          basis_update,  \
-                          basic_list,    \
-                          nonbasic_list, \
-                          root_vstatus_, \
-                          edge_norms_) default(none) depend(out : root_status)
+#pragma omp task default(shared) depend(out : root_status)
   {
     root_status = solve_linear_program_with_advanced_basis(original_lp_,
                                                            exploration_stats_.start_time,
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index 47c460cc7b..8d56e3a068 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1054,17 +1054,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                                           pc);
   } else {
     if (effective_batch_pdlp != 0) {
-#pragma omp task shared(settings,          \
-                          concurrent_halt, \
-                          original_lp,     \
-                          new_slacks,      \
-                          root_solution,   \
-                          fractional,      \
-                          pc,              \
-                          sb_view,         \
-                          pdlp_obj_down,   \
-                          pdlp_obj_up)     \
-  firstprivate(effective_batch_pdlp, start_time, root_obj) default(none)
+#pragma omp task default(shared)
       batch_pdlp_strong_branching_task(settings,
                                        effective_batch_pdlp,
                                        start_time,
@@ -1084,20 +1074,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
       i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
 // Here we are creating more tasks than the number of threads
 // such that they can be scheduled dynamically to the threads.
-#pragma omp taskloop num_tasks(n) default(none) shared(original_lp,                \
-                                                         settings,                 \
-                                                         var_types,                \
-                                                         fractional,               \
-                                                         root_solution,            \
-                                                         root_vstatus,             \
-                                                         edge_norms,               \
-                                                         pc,                       \
-                                                         dual_simplex_obj_down,    \
-                                                         dual_simplex_obj_up,      \
-                                                         dual_simplex_status_down, \
-                                                         dual_simplex_status_up,   \
-                                                         sb_view)                  \
-  firstprivate(start_time, root_obj, upper_bound, simplex_iteration_limit, n)
+#pragma omp taskloop num_tasks(n) default(shared)
       for (i_t k = 0; k < n; k++) {
         i_t start = std::floor(k * fractional.size() / n);
         i_t end   = std::floor((k + 1) * fractional.size() / n);
@@ -1568,18 +1545,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   std::atomic<int> concurrent_halt{0};
 
   if (use_pdlp) {
-#pragma omp task default(none) shared(log,               \
-                                        concurrent_halt, \
-                                        original_lp,     \
-                                        new_slacks,      \
-                                        leaf_solution,   \
-                                        worker,          \
-                                        candidate_vars,  \
-                                        settings,        \
-                                        sb_view,         \
-                                        pdlp_obj_down,   \
-                                        pdlp_obj_up)     \
-  firstprivate(rb_mode, num_candidates, start_time)
+#pragma omp task default(shared)
     batch_pdlp_reliability_branching_task(log,
                                           rb_mode,
                                           num_candidates,
@@ -1614,30 +1580,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   f_t dual_simplex_start_time = tic();
 
   if (rb_mode != 2) {
-#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(none) \
-  shared(log,                                                                                      \
-           unreliable_list,                                                                        \
-           settings,                                                                               \
-           sb_view,                                                                                \
-           worker,                                                                                 \
-           var_types,                                                                              \
-           node_ptr,                                                                               \
-           leaf_solution,                                                                          \
-           dual_simplex_obj_down,                                                                  \
-           dual_simplex_obj_up,                                                                    \
-           dual_simplex_status_down,                                                               \
-           dual_simplex_status_up,                                                                 \
-           score_mutex,                                                                            \
-           max_score,                                                                              \
-           branch_var) firstprivate(num_candidates,                                                \
-                                      start_time,                                                  \
-                                      rb_mode,                                                     \
-                                      reliable_threshold,                                          \
-                                      upper_bound,                                                 \
-                                      iter_limit_per_trial,                                        \
-                                      eps,                                                         \
-                                      pseudo_cost_up_avg,                                          \
-                                      pseudo_cost_down_avg)
+#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(shared)
     for (i_t i = 0; i < num_candidates; ++i) {
       auto [score, j] = unreliable_list[i];
 
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index 846e22358a..e6c403aacc 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -249,8 +249,7 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
   i_t num_tasks = omp_get_num_threads() - 4;
 
   CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks);
-#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(none) \
-  shared(problem, offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h)
+#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared)
   for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) {
     for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) {
       int var = variables[jj];
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 25562e20a9..43097c6724 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -897,20 +897,7 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
     if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; }
     size_t step_end = std::min(step_start + step_size, priority_indices.size());
 
-#pragma omp taskloop num_tasks(num_tasks) default(none) firstprivate(step_start, step_end) \
-  shared(num_tasks,                                                                        \
-           priority_indices,                                                               \
-           timer,                                                                          \
-           multi_probe_presolve_pool,                                                      \
-           bound_presolve,                                                                 \
-           problem,                                                                        \
-           h_var_bounds,                                                                   \
-           h_integer_indices,                                                              \
-           n_of_implied_singletons,                                                        \
-           n_of_cached_probings,                                                           \
-           problem_is_infeasible,                                                          \
-           modification_vector_pool,                                                       \
-           substitution_vector_pool)
+#pragma omp taskloop num_tasks(num_tasks) default(shared)
     for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
       size_t n     = step_end - step_start;
       size_t begin = step_start + std::floor(static_cast<f_t>(n) * task_id / num_tasks);
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 1415c68922..660e8a6f61 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -452,8 +452,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 #pragma omp taskgroup
   {
     if (!context.settings.heuristics_only) {
-#pragma omp task default(none) \
-  shared(branch_and_bound, branch_and_bound_solution, branch_and_bound_status)
+#pragma omp task default(shared)
       {
         branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution);
       }

From b4efd6715d72cbab3a4a897c118582163133ec1d Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 12:20:36 +0200
Subject: [PATCH 07/14] added missing step_start offset to the task end index in probing cache

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/presolve/probing_cache.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 43097c6724..1171e03a53 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -901,7 +901,7 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
     for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
       size_t n     = step_end - step_start;
       size_t begin = step_start + std::floor(static_cast<f_t>(n) * task_id / num_tasks);
-      size_t end   = std::floor(static_cast<f_t>(n) * (task_id + 1) / num_tasks);
+      size_t end   = step_start + std::floor(static_cast<f_t>(n) * (task_id + 1) / num_tasks);
       auto& multi_probe_presolve = multi_probe_presolve_pool[task_id];
       auto& modification_vector  = modification_vector_pool[task_id];
       auto& substitution_vector  = substitution_vector_pool[task_id];

From 80b14ab2b2227dae3c79cdc09ecd9990552da3ac Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 13:46:09 +0200
Subject: [PATCH 08/14] fixed missing type conversion

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
index ee8ff9a53c..ae0057ba5f 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
@@ -1419,9 +1419,9 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
 template <typename i_t, typename f_t>
 void cpufj_solve(fj_cpu_climber_t<i_t, f_t>* fj_cpu, f_t in_time_limit)
 {
-  i_t local_mins       = 0;
-  auto loop_start      = std::chrono::high_resolution_clock::now();
-  auto time_limit      = std::chrono::milliseconds(std::floor(in_time_limit * 1000.0));
+  i_t local_mins  = 0;
+  auto loop_start = std::chrono::high_resolution_clock::now();
+  auto time_limit = std::chrono::milliseconds(static_cast<i_t>(std::floor(in_time_limit * 1000.0)));
   auto loop_time_start = std::chrono::high_resolution_clock::now();
 
   // Initialize feature tracking

From 588b401d4b7b041ba7618435fa91e83800e50e6f Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 14:43:34 +0200
Subject: [PATCH 09/14] converted GPUFJ to omp task

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 .../feasibility_jump/early_cpufj.cu           |  7 +++---
 .../feasibility_jump/early_gpufj.cu           | 23 ++++++++-----------
 .../feasibility_jump/early_gpufj.cuh          |  4 ----
 3 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index 86d91c0539..667edf8455 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -57,11 +57,10 @@ void early_cpufj_t<i_t, f_t>::stop()
 
   fj_cpu_->halted = true;
 #pragma omp taskwait depend(in : *fj_cpu_)
-  CUOPT_LOG_INFO("Early CPUFJ task was stopped");
 
-  CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
-                  fj_cpu_ ? fj_cpu_->iterations : 0,
-                  this->solution_found_);
+  CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
+                 fj_cpu_ ? fj_cpu_->iterations : 0,
+                 this->solution_found_);
 
   fj_cpu_.reset();
 }
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
index 3f77427d87..a69ff48aa7 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -38,7 +38,7 @@ early_gpufj_t<i_t, f_t>::~early_gpufj_t()
 template <typename i_t, typename f_t>
 void early_gpufj_t<i_t, f_t>::start()
 {
-  if (worker_thread_) { return; }
+  if (fj_ptr_) { return; }
 
   this->start_time_ = std::chrono::steady_clock::now();
 
@@ -57,29 +57,26 @@ void early_gpufj_t<i_t, f_t>::start()
     this->try_update_best(solver_obj, h_assignment);
   };
 
-  worker_thread_ = std::make_unique<std::thread>(&early_gpufj_t::run_worker, this);
-}
+  CUOPT_LOG_INFO("Launching early GPUFJ task");
 
-template <typename i_t, typename f_t>
-void early_gpufj_t<i_t, f_t>::run_worker()
-{
-  RAFT_CUDA_TRY(cudaSetDevice(this->device_id_));
-  fj_ptr_->solve(*this->solution_ptr_);
+#pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_)
+  {
+    RAFT_CUDA_TRY(cudaSetDevice(this->device_id_));
+    fj_ptr_->solve(*this->solution_ptr_);
+  }
 }
 
 template <typename i_t, typename f_t>
 void early_gpufj_t<i_t, f_t>::stop()
 {
-  if (!worker_thread_) { return; }
+  if (!fj_ptr_) { return; }
 
   context_ptr_->preempt_heuristic_solver_.store(true);
+#pragma omp taskwait depend(in : *fj_ptr_)
 
-  if (worker_thread_->joinable()) { worker_thread_->join(); }
-
-  CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
+  CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
 
   fj_ptr_.reset();
-  worker_thread_.reset();
 }
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
index 4a7769143e..e5ceaaeb61 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
@@ -10,7 +10,6 @@
 #include <mip_heuristics/early_heuristic.cuh>
 
 #include <memory>
-#include <thread>
 
 namespace cuopt::linear_programming::detail {
 
@@ -35,11 +34,8 @@ class early_gpufj_t : public early_heuristic_t<i_t, f_t, early_gpufj_t<i_t, f_t>
   void stop();
 
  private:
-  void run_worker();
-
   std::unique_ptr<mip_solver_context_t<i_t, f_t>> context_ptr_;
   std::unique_ptr<fj_t<i_t, f_t>> fj_ptr_;
-  std::unique_ptr<std::thread> worker_thread_;
 };
 
 }  // namespace cuopt::linear_programming::detail

From 3c569e61ec190b43a31901210d1ee6522f4e4336 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 16:03:30 +0200
Subject: [PATCH 10/14] added more comments about the implicit barriers.
 removed cudaDeviceSynchronize in CPU FJ. use scope_guard in RINS.

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/branch_and_bound.cpp       |  6 +++---
 cpp/src/branch_and_bound/pseudo_costs.cpp           |  6 +++---
 cpp/src/mip_heuristics/diversity/lns/rins.cu        | 13 +++++--------
 .../mip_heuristics/feasibility_jump/early_cpufj.cu  |  2 +-
 .../mip_heuristics/feasibility_jump/early_gpufj.cu  |  2 +-
 cpp/src/mip_heuristics/local_search/local_search.cu | 13 ++++++-------
 .../presolve/conditional_bound_strengthening.cu     |  2 +-
 cpp/src/mip_heuristics/presolve/probing_cache.cu    |  2 +-
 cpp/src/mip_heuristics/solver.cu                    |  2 +-
 9 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index b2a170b9d1..db8b9910dd 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -1987,14 +1987,14 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
       solver_name    = method_to_string(root_relax_solved_by);
 
     } else {
-#pragma omp taskwait depend(in : root_status)
+#pragma omp taskwait depend(in : root_status)  // Wait for the dual simplex to finish
       user_objective       = root_relax_soln_.user_objective;
       iter                 = root_relax_soln_.iterations;
       root_relax_solved_by = DualSimplex;
       solver_name          = "Dual Simplex";
     }
   } else {
-#pragma omp taskwait depend(in : root_status)
+#pragma omp taskwait depend(in : root_status)  // Wait for the dual simplex to finish
     user_objective       = root_relax_soln_.user_objective;
     iter                 = root_relax_soln_.iterations;
     root_relax_solved_by = DualSimplex;
@@ -2629,7 +2629,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     } else {
       single_threaded_solve();
     }
-  }  // Implicit barrier for all tasks created within the group
+  }  // Implicit barrier for all tasks created within the group (RINS, B&B workers)
 
   is_running_ = false;
 
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index 8d56e3a068..91824d5cac 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1114,7 +1114,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
     }
 
     if (effective_batch_pdlp != 0) {
-#pragma omp taskwait
+#pragma omp taskwait  // Wait for the batch PDLP task to finish
     }
   }
 
@@ -1567,7 +1567,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
     log.printf("Time limit reached\n");
     if (use_pdlp) {
       concurrent_halt.store(1);
-#pragma omp taskwait
+#pragma omp taskwait  // Wait for the batch PDLP task to finish
     }
     return branch_var;
   }
@@ -1706,7 +1706,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   //}
 
   if (use_pdlp) {
-#pragma omp taskwait
+#pragma omp taskwait  // Wait for the batch PDLP task to finish
 
     i_t pdlp_applied = 0;
     i_t pdlp_optimal = 0;
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 6ddc645e4e..10430043a4 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -24,6 +24,7 @@
 
 #include <branch_and_bound/branch_and_bound.hpp>
 #include <dual_simplex/tic_toc.hpp>
+#include <utilities/scope_guard.hpp>
 
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
@@ -84,6 +85,8 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::run_rins()
 {
   raft::common::nvtx::range fun_scope("Running RINS");
+  scope_guard scope_guard([this]() { this->launch_new_task = true; });
+
   RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
@@ -115,10 +118,7 @@ void rins_t<i_t, f_t>::run_rins()
   cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch");
 
   cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible");
-  if (!best_sol.get_feasible()) {
-    launch_new_task = true;
-    return;
-  }
+  if (!best_sol.get_feasible()) { return; }
 
   i_t sol_size_before_rins = best_sol.assignment.size();
   auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
@@ -142,7 +142,6 @@ void rins_t<i_t, f_t>::run_rins()
   // abort if the fractional ratio is too low
   if (fractional_ratio < settings.min_fractional_ratio) {
     CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting");
-    launch_new_task = true;
     return;
   }
 
@@ -167,7 +166,6 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (n_to_fix == 0) {
     CUOPT_LOG_DEBUG("RINS no variables to fix");
-    launch_new_task = true;
     return;
   }
 
@@ -299,7 +297,7 @@ void rins_t<i_t, f_t>::run_rins()
                           static_cast<f_t>(context.settings.heuristic_params.rins_max_time_limit));
   }
 
-#pragma omp taskwait
+#pragma omp taskwait  // Wait for the CPU FJ (RINS) to finish
 
   CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations);
   if (fj_cpu->feasible_found) {
@@ -341,7 +339,6 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (improvement_found) total_success++;
   CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success);
-  launch_new_task = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index 667edf8455..e198ac20f1 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -56,7 +56,7 @@ void early_cpufj_t<i_t, f_t>::stop()
   preemption_flag_.store(true);
 
   fj_cpu_->halted = true;
-#pragma omp taskwait depend(in : *fj_cpu_)
+#pragma omp taskwait depend(in : *fj_cpu_)  // Wait for the early CPUFJ task to finish
 
   CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
                  fj_cpu_ ? fj_cpu_->iterations : 0,
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
index a69ff48aa7..96c88daf16 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -72,7 +72,7 @@ void early_gpufj_t<i_t, f_t>::stop()
   if (!fj_ptr_) { return; }
 
   context_ptr_->preempt_heuristic_solver_.store(true);
-#pragma omp taskwait depend(in : *fj_ptr_)
+#pragma omp taskwait depend(in : *fj_ptr_)  // Wait for the early GPU FJ task to finish
 
   CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
 
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index d5160862e5..7844617f3f 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -127,9 +127,6 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
       }
     };
 
-  // default weights
-  cudaDeviceSynchronize();
-
   CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task");
 
 #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \
@@ -142,12 +139,13 @@ void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
 {
   for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
     scratch_cpu_fj[i]->halted = true;
-#pragma omp taskwait depend(in : *scratch_cpu_fj[i])
+#pragma omp taskwait depend(in : *scratch_cpu_fj[i])  // Wait for each scratch CPU FJ task to finish
   }
 
   if (scratch_cpu_fj_on_lp_opt) {
     scratch_cpu_fj_on_lp_opt->halted = true;
-#pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt)
+#pragma omp taskwait depend( \
+    in : *scratch_cpu_fj_on_lp_opt)  // Wait for the scratch CPU FJ (LP optimal) task to finish
 
     CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped");
   }
@@ -205,7 +203,8 @@ void local_search_t<i_t, f_t>::stop_cpufj_deterministic()
     }
 
     deterministic_cpu_fj->halted = true;
-#pragma omp taskwait depend(in : *deterministic_cpu_fj)
+#pragma omp taskwait depend( \
+    in : *deterministic_cpu_fj)  // Wait for deterministic CPU FJ task to finish
     CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped");
   }
 }
@@ -270,7 +269,7 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
     for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
       ls_cpu_fj[i]->halted = true;
     }
-  }
+  }  // implicit barrier that waits all CPU FJ tasks to finish
 
   CUOPT_LOG_INFO("All CPUFJ tasks were stopped");
 
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index e6c403aacc..52333b1c14 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -266,7 +266,7 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
       constraint_pairs_h[cnstr * max_pair_per_row + counter++] = {cnstr, temp};
     }
     cnstr_pair.clear();
-  }
+  }  // implicit barrier that waits for all iterations to finish before proceeding
 
   constraint_pairs = cuopt::device_copy(constraint_pairs_h, problem.handle_ptr->get_stream());
 
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 1171e03a53..9d3fb65a36 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -926,7 +926,7 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
                                         timer,
                                         problem.handle_ptr->get_device());
       }
-    }
+    }  // implicit barrier that waits for all iterations to finish before proceeding
 
     // TODO when we have determinism, check current threads work/time counter and filter queue
     // items that are smaller or equal to that
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 660e8a6f61..0229a8f27f 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -461,7 +461,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     // Start the primal heuristics
     context.diversity_manager_ptr = &dm;
     sol                           = dm.run_solver();
-  }  // implicit barrier for all tasks created within the taskgroup
+  }  // implicit barrier for all tasks created in B&B and heuristics
 
   if (!context.settings.heuristics_only) {
     if (branch_and_bound_solution.lower_bound > -std::numeric_limits<f_t>::infinity()) {

From d7d3dad7878b277922b6a05da2ade20d43fca580 Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Wed, 15 Apr 2026 17:50:50 +0200
Subject: [PATCH 11/14] decreased verbosity for logging the task launches

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/branch_and_bound/pseudo_costs.cpp          |  2 +-
 cpp/src/mip_heuristics/diversity/lns/rins.cu       |  2 +-
 .../mip_heuristics/feasibility_jump/early_cpufj.cu |  8 ++++----
 .../mip_heuristics/feasibility_jump/early_gpufj.cu |  4 ++--
 .../mip_heuristics/local_search/local_search.cu    | 14 +++++++-------
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index 91824d5cac..a9297de571 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1030,7 +1030,7 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
   shared_strong_branching_context_t<i_t, f_t> shared_ctx(2 * fractional.size());
   shared_strong_branching_context_view_t<i_t, f_t> sb_view(shared_ctx.solved);
 
-  std::atomic concurrent_halt{0};
+  std::atomic<int> concurrent_halt{0};
 
   std::vector<f_t> pdlp_obj_down(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
   std::vector<f_t> pdlp_obj_up(fractional.size(), std::numeric_limits<f_t>::quiet_NaN());
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 10430043a4..fd631d573d 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -63,7 +63,7 @@ void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objec
     if (population_ready) {
       lp_optimal_solution = solution;
 
-      CUOPT_LOG_INFO("Launching RINS task");
+      CUOPT_LOG_DEBUG("Launching RINS task");
 #pragma omp task default(none)
       run_rins();
     } else {
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index e198ac20f1..de905f7592 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -43,7 +43,7 @@ void early_cpufj_t<i_t, f_t>::start()
                                          const std::vector<f_t>& assignment,
                                          double) { this->try_update_best(solver_obj, assignment); };
 
-  CUOPT_LOG_INFO("Launching early CPUFJ task");
+  CUOPT_LOG_DEBUG("Launching early CPUFJ task");
 #pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none)
   cpufj_solve(fj_cpu_.get());
 }
@@ -58,9 +58,9 @@ void early_cpufj_t<i_t, f_t>::stop()
   fj_cpu_->halted = true;
 #pragma omp taskwait depend(in : *fj_cpu_)  // Wait for the early CPUFJ task to finish
 
-  CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
-                 fj_cpu_ ? fj_cpu_->iterations : 0,
-                 this->solution_found_);
+  CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
+                  fj_cpu_ ? fj_cpu_->iterations : 0,
+                  this->solution_found_);
 
   fj_cpu_.reset();
 }
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
index 96c88daf16..4615fa8dc3 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -57,7 +57,7 @@ void early_gpufj_t<i_t, f_t>::start()
     this->try_update_best(solver_obj, h_assignment);
   };
 
-  CUOPT_LOG_INFO("Launching early GPUFJ task");
+  CUOPT_LOG_DEBUG("Launching early GPUFJ task");
 
 #pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_)
   {
@@ -74,7 +74,7 @@ void early_gpufj_t<i_t, f_t>::stop()
   context_ptr_->preempt_heuristic_solver_.store(true);
 #pragma omp taskwait depend(in : *fj_ptr_)  // Wait for the early GPU FJ task to finish
 
-  CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
+  CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
 
   fj_ptr_.reset();
 }
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index 7844617f3f..58918148ba 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -90,7 +90,7 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
     counter++;
   };
 
-  CUOPT_LOG_INFO("Launching %d scratch CPUFJ tasks", scratch_cpu_fj.size());
+  CUOPT_LOG_DEBUG("Launching %d scratch CPUFJ tasks", scratch_cpu_fj.size());
 
   for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
     auto ptr = scratch_cpu_fj[i].get();
@@ -127,7 +127,7 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
       }
     };
 
-  CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task");
+  CUOPT_LOG_DEBUG("Launching scratch CPUFJ (on LP optimal) task");
 
 #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \
   depend(out : *scratch_cpu_fj_on_lp_opt)
@@ -147,7 +147,7 @@ void local_search_t<i_t, f_t>::stop_cpufj_scratch_threads()
 #pragma omp taskwait depend( \
     in : *scratch_cpu_fj_on_lp_opt)  // Wait for the scratch CPU FJ (LP optimal) task to finish
 
-    CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped");
+    CUOPT_LOG_DEBUG("All scratch CPUFJ tasks were stopped");
   }
 }
 
@@ -185,7 +185,7 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
       bb.queue_external_solution_deterministic(h_vec, work_units);
     };
 
-  CUOPT_LOG_INFO("Launching deterministic CPUFJ task");
+  CUOPT_LOG_DEBUG("Launching deterministic CPUFJ task");
 #pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj)
   cpufj_solve(deterministic_cpu_fj.get());
 
@@ -205,7 +205,7 @@ void local_search_t<i_t, f_t>::stop_cpufj_deterministic()
     deterministic_cpu_fj->halted = true;
 #pragma omp taskwait depend( \
     in : *deterministic_cpu_fj)  // Wait for deterministic CPU FJ task to finish
-    CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped");
+    CUOPT_LOG_DEBUG("Deterministic CPUFJ task was stopped");
   }
 }
 
@@ -254,7 +254,7 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
 #pragma omp taskgroup
   {
     if (ls_cpu_fj.size() > 0) {
-      CUOPT_LOG_INFO("Launching %d CPUFJ tasks", ls_cpu_fj.size());
+      CUOPT_LOG_DEBUG("Launching %d CPUFJ tasks", ls_cpu_fj.size());
 
 #pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup
       for (size_t i = 0; i < ls_cpu_fj.size(); ++i) {
@@ -271,7 +271,7 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
     }
   }  // implicit barrier that waits all CPU FJ tasks to finish
 
-  CUOPT_LOG_INFO("All CPUFJ tasks were stopped");
+  CUOPT_LOG_DEBUG("All CPUFJ tasks were stopped");
 
   solution_t<i_t, f_t> solution_cpu(*solution.problem_ptr);
   f_t best_cpu_obj = std::numeric_limits<f_t>::max();

From edcec044ab69b9872554489d7545483fb2a1ebff Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Thu, 16 Apr 2026 12:09:10 +0200
Subject: [PATCH 12/14] revert scope guard as it was causing the compilation to
 fail with CUDA 12.9

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/diversity/lns/rins.cu | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index fd631d573d..2e3c616335 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -24,7 +24,6 @@
 
 #include <branch_and_bound/branch_and_bound.hpp>
 #include <dual_simplex/tic_toc.hpp>
-#include <utilities/scope_guard.hpp>
 
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
@@ -85,8 +84,6 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::run_rins()
 {
   raft::common::nvtx::range fun_scope("Running RINS");
-  scope_guard scope_guard([this]() { this->launch_new_task = true; });
-
   RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
@@ -118,7 +115,10 @@ void rins_t<i_t, f_t>::run_rins()
   cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch");
 
   cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible");
-  if (!best_sol.get_feasible()) { return; }
+  if (!best_sol.get_feasible()) {
+    launch_new_task = true;
+    return;
+  }
 
   i_t sol_size_before_rins = best_sol.assignment.size();
   auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
@@ -142,6 +142,7 @@ void rins_t<i_t, f_t>::run_rins()
   // abort if the fractional ratio is too low
   if (fractional_ratio < settings.min_fractional_ratio) {
     CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting");
+    launch_new_task = true;
     return;
   }
 
@@ -166,6 +167,7 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (n_to_fix == 0) {
     CUOPT_LOG_DEBUG("RINS no variables to fix");
+    launch_new_task = true;
     return;
   }
 
@@ -339,6 +341,7 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (improvement_found) total_success++;
   CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success);
+  launch_new_task = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT

From ff75e043c71c27d37e93fd86982be234214fbb7e Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Thu, 16 Apr 2026 12:15:16 +0200
Subject: [PATCH 13/14] re-enabled scope_guard in RINS (local variable renamed to guard)

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/diversity/lns/rins.cu | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 2e3c616335..d9f130720d 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -24,6 +24,7 @@
 
 #include <branch_and_bound/branch_and_bound.hpp>
 #include <dual_simplex/tic_toc.hpp>
+#include <utilities/scope_guard.hpp>
 
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
@@ -84,6 +85,8 @@ template <typename i_t, typename f_t>
 void rins_t<i_t, f_t>::run_rins()
 {
   raft::common::nvtx::range fun_scope("Running RINS");
+  scope_guard guard([this]() { this->launch_new_task = true; });
+
   RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device()));
   cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch");
   cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch");
@@ -115,10 +118,7 @@ void rins_t<i_t, f_t>::run_rins()
   cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch");
 
   cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible");
-  if (!best_sol.get_feasible()) {
-    launch_new_task = true;
-    return;
-  }
+  if (!best_sol.get_feasible()) { return; }
 
   i_t sol_size_before_rins = best_sol.assignment.size();
   auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream());
@@ -142,7 +142,6 @@ void rins_t<i_t, f_t>::run_rins()
   // abort if the fractional ratio is too low
   if (fractional_ratio < settings.min_fractional_ratio) {
     CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting");
-    launch_new_task = true;
     return;
   }
 
@@ -167,7 +166,6 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (n_to_fix == 0) {
     CUOPT_LOG_DEBUG("RINS no variables to fix");
-    launch_new_task = true;
     return;
   }
 
@@ -341,7 +339,6 @@ void rins_t<i_t, f_t>::run_rins()
 
   if (improvement_found) total_success++;
   CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success);
-  launch_new_task = true;
 }
 
 #if MIP_INSTANTIATE_FLOAT

From 11b92ed495fbbbe5656ca0153af23318a2eb25bd Mon Sep 17 00:00:00 2001
From: "Nicolas L. Guidotti" <nguidotti@nvidia.com>
Date: Thu, 16 Apr 2026 12:18:25 +0200
Subject: [PATCH 14/14] lowered the CPUFJ (RINS) launch log to debug and added a stop log

Signed-off-by: Nicolas L. Guidotti <nguidotti@nvidia.com>
---
 cpp/src/mip_heuristics/diversity/lns/rins.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index d9f130720d..9396d7158a 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -224,7 +224,7 @@ void rins_t<i_t, f_t>::run_rins()
                           true);
   fj_cpu->log_prefix = "[RINS] ";
 
-  CUOPT_LOG_INFO("Launching CPUFJ (RINS) task");
+  CUOPT_LOG_DEBUG("Launching CPUFJ (RINS) task");
 #pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none)
   cpufj_solve(fj_cpu.get(), time_limit);
 
@@ -298,6 +298,7 @@ void rins_t<i_t, f_t>::run_rins()
   }
 
 #pragma omp taskwait  // Wait for the CPU FJ (RINS) to finish
+  CUOPT_LOG_DEBUG("CPUFJ (RINS) task was stopped");
 
   CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations);
   if (fj_cpu->feasible_found) {