From bee22227030dfb61c714df6cedbbd14b03da4ee2 Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Mon, 13 Apr 2026 13:36:18 +0200 Subject: [PATCH 01/14] B&B and heuristics now shares a single omp parallel region Signed-off-by: Nicolas L. Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 53 ++++---- cpp/src/branch_and_bound/pseudo_costs.cpp | 124 +++++++++--------- cpp/src/mip_heuristics/solver.cu | 74 ++++++----- 3 files changed, 135 insertions(+), 116 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 1526baa367..a0a4907374 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1871,19 +1871,30 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Root node path lp_status_t root_status; - std::future root_status_future; - root_status_future = std::async(std::launch::async, - &solve_linear_program_with_advanced_basis, - std::ref(original_lp_), - exploration_stats_.start_time, - std::ref(lp_settings), - std::ref(root_relax_soln), - std::ref(basis_update), - std::ref(basic_list), - std::ref(nonbasic_list), - std::ref(root_vstatus), - std::ref(edge_norms), - nullptr); + + // Note that we need to explicitly declare `root_status` as a shared variable here since + // it is local to the thread that is executing the enclosing task. 
+#pragma omp task shared(root_status, \ + original_lp_, \ + lp_settings, \ + basis_update, \ + basic_list, \ + nonbasic_list, \ + root_vstatus_, \ + edge_norms_) default(none) + { + root_status = solve_linear_program_with_advanced_basis(original_lp_, + exploration_stats_.start_time, + lp_settings, + root_relax_soln_, + basis_update, + basic_list, + nonbasic_list, + root_vstatus_, + edge_norms_, + nullptr); + } + // Wait for the root relaxation solution to be sent by the diversity manager or dual simplex // to finish while (!root_crossover_solution_set_.load(std::memory_order_acquire) && @@ -1925,9 +1936,9 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Check if crossover was stopped by dual simplex if (crossover_status == crossover_status_t::OPTIMAL) { - set_root_concurrent_halt(1); // Stop dual simplex - root_status = root_status_future.get(); // Wait for dual simplex to finish - set_root_concurrent_halt(0); // Clear the concurrent halt flag + set_root_concurrent_halt(1); // Stop dual simplex +#pragma omp taskwait // Wait for dual simplex to finish + set_root_concurrent_halt(0); // Clear the concurrent halt flag // Override the root relaxation solution with the crossover solution root_relax_soln = root_crossover_soln_; root_vstatus = crossover_vstatus_; @@ -1977,14 +1988,14 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name = method_to_string(root_relax_solved_by); } else { - root_status = root_status_future.get(); +#pragma omp taskwait user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; solver_name = "Dual Simplex"; } } else { - root_status = root_status_future.get(); +#pragma omp taskwait user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; @@ -2613,11 +2624,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.deterministic) { run_deterministic_coordinator(Arow_); } 
else if (settings_.num_threads > 1) { -#pragma omp parallel num_threads(settings_.num_threads) - { -#pragma omp master - run_scheduler(); - } + run_scheduler(); } else { single_threaded_solve(); } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c38e98e27d..65e527078a 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -756,14 +756,15 @@ static void batch_pdlp_strong_branching_task( ws_settings.inside_mip = true; if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - auto start_time = std::chrono::high_resolution_clock::now(); + auto pdlp_start_time = std::chrono::high_resolution_clock::now(); auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); if (verbose) { - auto end_time = std::chrono::high_resolution_clock::now(); + auto pdlp_end_time = std::chrono::high_resolution_clock::now(); auto duration = - std::chrono::duration_cast(end_time - start_time).count(); + std::chrono::duration_cast(pdlp_end_time - pdlp_start_time) + .count(); settings.log.printf( "Original problem solved in %d milliseconds" " and iterations: %d\n", @@ -1029,7 +1030,7 @@ void strong_branching(const lp_problem_t& original_lp, shared_strong_branching_context_t shared_ctx(2 * fractional.size()); shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - std::atomic concurrent_halt{0}; + std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); @@ -1052,70 +1053,67 @@ void strong_branching(const lp_problem_t& original_lp, basis_factors, pc); } else { -#pragma omp parallel num_threads(settings.num_threads) - { -#pragma omp single nowait - { - if (effective_batch_pdlp != 0) { -#pragma omp task - batch_pdlp_strong_branching_task(settings, - effective_batch_pdlp, - start_time, - 
concurrent_halt, - original_lp, - new_slacks, - root_solution.x, - fractional, - root_obj, - pc, - sb_view, - pdlp_obj_down, - pdlp_obj_up); - } + if (effective_batch_pdlp != 0) { +#pragma omp task depend(out : pdlp_obj_down) depend(out : pdlp_obj_up) default(shared) + batch_pdlp_strong_branching_task(settings, + effective_batch_pdlp, + start_time, + concurrent_halt, + original_lp, + new_slacks, + root_solution.x, + fractional, + root_obj, + pc, + sb_view, + pdlp_obj_down, + pdlp_obj_up); + } - if (effective_batch_pdlp != 2) { - i_t n = std::min(4 * settings.num_threads, fractional.size()); + if (effective_batch_pdlp != 2) { + i_t n = std::min(4 * settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) - for (i_t k = 0; k < n; k++) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); - - constexpr bool verbose = false; - if (verbose) { - settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", - omp_get_thread_num(), - k, - start, - end, - end - start); - } - - strong_branch_helper(start, - end, - start_time, - original_lp, - settings, - var_types, - fractional, - root_solution.x, - root_vstatus, - edge_norms, - root_obj, - upper_bound, - simplex_iteration_limit, - pc, - dual_simplex_obj_down, - dual_simplex_obj_up, - dual_simplex_status_down, - dual_simplex_status_up, - sb_view); - } - // DS done: signal PDLP to stop (time-limit or all work done) and wait - if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } +#pragma omp taskloop num_tasks(n) default(shared) + for (i_t k = 0; k < n; k++) { + i_t start = std::floor(k * fractional.size() / n); + i_t end = std::floor((k + 1) * fractional.size() / n); + + if (verbose) { + settings.log.printf("Thread id %d task id %d start %d end %d. 
size %d\n", + omp_get_thread_num(), + k, + start, + end, + end - start); } + + strong_branch_helper(start, + end, + start_time, + original_lp, + settings, + var_types, + fractional, + root_solution.x, + root_vstatus, + edge_norms, + root_obj, + upper_bound, + simplex_iteration_limit, + pc, + dual_simplex_obj_down, + dual_simplex_obj_up, + dual_simplex_status_down, + dual_simplex_status_up, + sb_view); } + // DS done: signal PDLP to stop (time-limit or all work done) and wait + if (effective_batch_pdlp == 1) { concurrent_halt.store(1); } + } + + if (effective_batch_pdlp != 0) { +#pragma omp taskwait depend(in : pdlp_obj_down) depend(in : pdlp_obj_up) } } diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 0bbf48d95e..737d81201f 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -181,6 +181,8 @@ void extract_probing_implied_bounds( template solution_t mip_solver_t::run_solver() { + solution_t sol(*context.problem_ptr); + // we need to keep original problem const cuopt_assert(context.problem_ptr != nullptr, "invalid problem pointer"); context.problem_ptr->tolerances = context.settings.get_tolerances(); @@ -234,14 +236,12 @@ solution_t mip_solver_t::run_solver() if (!presolve_success) { CUOPT_LOG_INFO("Problem proven infeasible in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); context.problem_ptr->post_process_solution(sol); return sol; } if (run_presolve && context.problem_ptr->empty) { CUOPT_LOG_INFO("Problem full reduced in presolve"); - solution_t sol(*context.problem_ptr); sol.set_problem_fully_reduced(); for (auto callback : context.settings.get_mip_callbacks()) { if (callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) { @@ -293,8 +293,8 @@ solution_t mip_solver_t::run_solver() } context.work_unit_scheduler_.register_context(context.gpu_heur_loop); - namespace dual_simplex = cuopt::linear_programming::dual_simplex; - std::future 
branch_and_bound_status_future; + namespace dual_simplex = cuopt::linear_programming::dual_simplex; + dual_simplex::mip_status_t branch_and_bound_status = dual_simplex::mip_status_t::UNSET; dual_simplex::user_problem_t branch_and_bound_problem(context.problem_ptr->handle_ptr); context.problem_ptr->recompute_objective_integrality(); if (context.problem_ptr->is_objective_integral()) { @@ -309,8 +309,14 @@ solution_t mip_solver_t::run_solver() dual_simplex::probing_implied_bound_t probing_implied_bound; - bool run_bb = !context.settings.heuristics_only; - if (run_bb) { + i_t num_threads = 0; + if (context.settings.num_cpu_threads < 0) { + num_threads = omp_get_max_threads(); + } else { + num_threads = std::max(1, context.settings.num_cpu_threads); + } + + if (!context.settings.heuristics_only) { // Convert the presolved problem to dual_simplex::user_problem_t op_problem_.get_host_user_problem(branch_and_bound_problem); // Resize the solution now that we know the number of columns/variables @@ -324,6 +330,7 @@ solution_t mip_solver_t::run_solver() // Fill in the settings for branch and bound branch_and_bound_settings.time_limit = timer_.get_time_limit(); branch_and_bound_settings.node_limit = context.settings.node_limit; + branch_and_bound_settings.num_threads = num_threads - 1; branch_and_bound_settings.print_presolve_stats = false; branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; @@ -363,21 +370,18 @@ solution_t mip_solver_t::run_solver() ? 
2 : context.settings.reduced_cost_strengthening; - if (context.settings.num_cpu_threads < 0) { - branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); - } else { - branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads); - } - // Set the branch and bound -> primal heuristics callback branch_and_bound_settings.solution_callback = std::bind(&branch_and_bound_solution_helper_t::solution_callback, &solution_helper, std::placeholders::_1, std::placeholders::_2); - // heuristic_preemption_callback is needed in both modes to properly stop the heuristic thread + + // heuristic_preemption_callback is needed in both modes to properly stop the heuristic + // thread branch_and_bound_settings.heuristic_preemption_callback = std::bind( &branch_and_bound_solution_helper_t::preempt_heuristic_solver, &solution_helper); + if (context.settings.determinism_mode == CUOPT_MODE_OPPORTUNISTIC) { branch_and_bound_settings.set_simplex_solution_callback = std::bind(&branch_and_bound_solution_helper_t::set_simplex_solution, @@ -403,8 +407,8 @@ solution_t mip_solver_t::run_solver() context.branch_and_bound_ptr = branch_and_bound.get(); // Convert initial_cutoff from user-space to B&B's internal objective space. - // context.problem_ptr is the post-trivial-presolve problem, whose get_solver_obj_from_user_obj - // produces values in the same space as B&B node lower bounds. + // context.problem_ptr is the post-trivial-presolve problem, whose + // get_solver_obj_from_user_obj produces values in the same space as B&B node lower bounds. 
if (std::isfinite(context.initial_cutoff)) { f_t bb_cutoff = context.problem_ptr->get_solver_obj_from_user_obj(context.initial_cutoff); branch_and_bound->set_initial_cutoff(bb_cutoff); @@ -456,28 +460,38 @@ solution_t mip_solver_t::run_solver() context.problem_ptr->post_process_solution(sol); return sol; } + } + +#pragma omp parallel num_threads(num_threads) default(none) \ + shared(sol, branch_and_bound, branch_and_bound_status, branch_and_bound_solution, dm, context) + { +#pragma omp master + { + if (!context.settings.heuristics_only) { +#pragma omp task + { + branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); + } + } - // Fork a thread for branch and bound - // std::async and std::future allow us to get the return value of bb::solve() - // without having to manually manage the thread - // std::future.get() performs a join() operation to wait until the return status is available - branch_and_bound_status_future = std::async(std::launch::async, - &dual_simplex::branch_and_bound_t::solve, - branch_and_bound.get(), - std::ref(branch_and_bound_solution)); +#pragma omp task + { + // Start the primal heuristics + context.diversity_manager_ptr = &dm; + // Start the primal heuristics + sol = dm.run_solver(); + } + } } - // Start the primal heuristics - context.diversity_manager_ptr = &dm; - auto sol = dm.run_solver(); - if (run_bb) { - // Wait for the branch and bound to finish - auto bb_status = branch_and_bound_status_future.get(); + if (!context.settings.heuristics_only) { if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { context.stats.set_solution_bound( context.problem_ptr->get_user_obj_from_solver_obj(branch_and_bound_solution.lower_bound)); } - if (bb_status == dual_simplex::mip_status_t::INFEASIBLE) { sol.set_problem_fully_reduced(); } + if (branch_and_bound_status == dual_simplex::mip_status_t::INFEASIBLE) { + sol.set_problem_fully_reduced(); + } context.stats.num_nodes = 
branch_and_bound_solution.nodes_explored; context.stats.num_simplex_iterations = branch_and_bound_solution.simplex_iterations; } From a50ff744e2f61aea91f61825f7c62764ba334f27 Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Tue, 14 Apr 2026 18:09:57 +0200 Subject: [PATCH 02/14] migrated heuristics and presolve to use OpenMP. now the entire solver shares the same thread pool. Signed-off-by: Nicolas L. Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 19 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 78 ++++++-- cpp/src/mip_heuristics/diversity/lns/rins.cu | 89 ++++----- cpp/src/mip_heuristics/diversity/lns/rins.cuh | 37 +--- .../feasibility_jump/early_cpufj.cu | 32 ++-- .../feasibility_jump/early_cpufj.cuh | 6 +- .../feasibility_jump/feasibility_jump.cuh | 2 - .../mip_heuristics/feasibility_jump/fj_cpu.cu | 172 +++++++----------- .../feasibility_jump/fj_cpu.cuh | 23 +-- .../local_search/local_search.cu | 172 +++++++++--------- .../local_search/local_search.cuh | 19 +- .../presolve/bounds_presolve.cuh | 2 +- .../conditional_bound_strengthening.cu | 3 +- .../mip_heuristics/presolve/probing_cache.cu | 93 ++++++---- cpp/src/mip_heuristics/solve.cu | 60 +++++- cpp/src/mip_heuristics/solver.cu | 36 ++-- .../utilities/cpu_worker_thread.cuh | 147 --------------- 17 files changed, 415 insertions(+), 575 deletions(-) delete mode 100644 cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index a0a4907374..de13e1ba27 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1750,7 +1750,7 @@ void branch_and_bound_t::run_scheduler() active_workers_per_strategy_[strategy]++; launched_any_task = true; -#pragma omp task affinity(worker) +#pragma omp task affinity(worker) default(none) firstprivate(worker) plunge_with(worker); } else { @@ -1771,7 +1771,7 @@ void branch_and_bound_t::run_scheduler() 
active_workers_per_strategy_[strategy]++; launched_any_task = true; -#pragma omp task affinity(worker) +#pragma omp task affinity(worker) default(none) firstprivate(worker) dive_with(worker); } } @@ -2621,12 +2621,15 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut "| Gap | Time |\n"); } - if (settings_.deterministic) { - run_deterministic_coordinator(Arow_); - } else if (settings_.num_threads > 1) { - run_scheduler(); - } else { - single_threaded_solve(); +#pragma omp taskgroup + { + if (settings_.deterministic) { + run_deterministic_coordinator(Arow_); + } else if (settings_.num_threads > 1) { + run_scheduler(); + } else { + single_threaded_solve(); + } } is_running_ = false; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 65e527078a..bda91cf04f 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1000,8 +1000,6 @@ void strong_branching(const lp_problem_t& original_lp, basis_update_mpf_t& basis_factors, pseudo_costs_t& pc) { - constexpr bool verbose = false; - pc.resize(original_lp.num_cols); pc.strong_branch_down.assign(fractional.size(), 0); pc.strong_branch_up.assign(fractional.size(), 0); @@ -1054,7 +1052,17 @@ void strong_branching(const lp_problem_t& original_lp, pc); } else { if (effective_batch_pdlp != 0) { -#pragma omp task depend(out : pdlp_obj_down) depend(out : pdlp_obj_up) default(shared) +#pragma omp task shared(settings, \ + concurrent_halt, \ + original_lp, \ + new_slacks, \ + root_solution, \ + fractional, \ + pc, \ + sb_view, \ + pdlp_obj_down, \ + pdlp_obj_up) \ + firstprivate(effective_batch_pdlp, start_time, root_obj) default(none) batch_pdlp_strong_branching_task(settings, effective_batch_pdlp, start_time, @@ -1074,11 +1082,25 @@ void strong_branching(const lp_problem_t& original_lp, i_t n = std::min(4 * settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such 
that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) default(shared) +#pragma omp taskloop num_tasks(n) default(none) shared(original_lp, \ + settings, \ + var_types, \ + fractional, \ + root_solution, \ + root_vstatus, \ + edge_norms, \ + pc, \ + dual_simplex_obj_down, \ + dual_simplex_obj_up, \ + dual_simplex_status_down, \ + dual_simplex_status_up, \ + sb_view) \ + firstprivate(start_time, root_obj, upper_bound, simplex_iteration_limit, n) for (i_t k = 0; k < n; k++) { i_t start = std::floor(k * fractional.size() / n); i_t end = std::floor((k + 1) * fractional.size() / n); + constexpr bool verbose = false; if (verbose) { settings.log.printf("Thread id %d task id %d start %d end %d. size %d\n", omp_get_thread_num(), @@ -1113,12 +1135,13 @@ void strong_branching(const lp_problem_t& original_lp, } if (effective_batch_pdlp != 0) { -#pragma omp taskwait depend(in : pdlp_obj_down) depend(in : pdlp_obj_up) +#pragma omp taskwait } } settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); + constexpr bool verbose = false; if (verbose) { // Collect Dual Simplex statistics i_t dual_simplex_optimal = 0, dual_simplex_infeasible = 0, dual_simplex_iter_limit = 0; @@ -1539,7 +1562,18 @@ i_t pseudo_costs_t::reliable_variable_selection( std::atomic concurrent_halt{0}; if (use_pdlp) { -#pragma omp task default(shared) +#pragma omp task default(none) shared(log, \ + concurrent_halt, \ + original_lp, \ + new_slacks, \ + leaf_solution, \ + worker, \ + candidate_vars, \ + settings, \ + sb_view, \ + pdlp_obj_down, \ + pdlp_obj_up) \ + firstprivate(rb_mode, num_candidates, start_time) batch_pdlp_reliability_branching_task(log, rb_mode, num_candidates, @@ -1574,14 +1608,30 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_start_time = tic(); if (rb_mode != 2) { -#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex, \ - sb_view, \ - 
dual_simplex_obj_down, \ - dual_simplex_obj_up, \ - dual_simplex_status_down, \ - dual_simplex_status_up, \ - unreliable_list) +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(none) \ + shared(log, \ + unreliable_list, \ + settings, \ + sb_view, \ + worker, \ + var_types, \ + node_ptr, \ + leaf_solution, \ + dual_simplex_obj_down, \ + dual_simplex_obj_up, \ + dual_simplex_status_down, \ + dual_simplex_status_up, \ + score_mutex, \ + max_score, \ + branch_var) firstprivate(num_candidates, \ + start_time, \ + rb_mode, \ + reliable_threshold, \ + upper_bound, \ + iter_limit_per_trial, \ + eps, \ + pseudo_cost_up_avg, \ + pseudo_cost_down_avg) for (i_t i = 0; i < num_candidates; ++i) { auto [score, j] = unreliable_list[i]; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c4331343de..c6a1bba7e7 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -36,19 +36,6 @@ rins_t::rins_t(mip_solver_context_t& context_, time_limit = context.settings.heuristic_params.rins_time_limit; } -template -rins_thread_t::~rins_thread_t() -{ - this->request_termination(); -} - -template -void rins_thread_t::run_worker() -{ - raft::common::nvtx::range fun_scope("Running RINS"); - rins_ptr->run_rins(); -} - template void rins_t::new_best_incumbent_callback(const std::vector& solution) { @@ -59,23 +46,25 @@ template void rins_t::node_callback(const std::vector& solution, f_t objective) { if (!enabled) return; - node_count++; if (node_count - node_count_at_last_improvement < settings.nodes_after_later_improvement) return; - if (node_count - node_count_at_last_rins > settings.node_freq) { // opportunistic early test w/ atomic to avoid having to take the lock - if (!rins_thread->cpu_thread_done) return; - std::lock_guard lock(rins_mutex); + if (!launch_new_task.exchange(false)) return; + bool population_ready = false; - if 
(rins_thread->cpu_thread_done) { + { std::lock_guard pop_lock(dm.population.write_mutex); population_ready = dm.population.current_size() > 0 && dm.population.is_feasible(); } + if (population_ready) { lp_optimal_solution = solution; - rins_thread->start_cpu_solver(); +#pragma omp task default(none) + run_rins(); + } else { + launch_new_task = true; } } } @@ -83,25 +72,17 @@ void rins_t::node_callback(const std::vector& solution, f_t objec template void rins_t::enable() { - rins_thread = std::make_unique>(); - rins_thread->rins_ptr = this; - seed = cuopt::seed_generator::get_seed(); + seed = cuopt::seed_generator::get_seed(); problem_ptr->handle_ptr->sync_stream(); problem_copy = std::make_unique>(*problem_ptr, &rins_handle); enabled = true; } -template -void rins_t::stop_rins() -{ - enabled = false; - if (rins_thread) rins_thread->request_termination(); - rins_thread.reset(); -} - template void rins_t::run_rins() { + raft::common::nvtx::range fun_scope("Running RINS"); + if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); @@ -134,7 +115,10 @@ void rins_t::run_rins() cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch"); cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible"); - if (!best_sol.get_feasible()) { return; } + if (!best_sol.get_feasible()) { + launch_new_task = true; + return; + } i_t sol_size_before_rins = best_sol.assignment.size(); auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream()); @@ -158,6 +142,7 @@ void rins_t::run_rins() // abort if the fractional ratio is too low if (fractional_ratio < settings.min_fractional_ratio) { CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting"); + launch_new_task = true; return; } @@ -182,6 +167,7 @@ void rins_t::run_rins() if (n_to_fix == 0) { CUOPT_LOG_DEBUG("RINS no variables to fix"); + launch_new_task = 
true; return; } @@ -229,18 +215,19 @@ void rins_t::run_rins() solution_t fj_solution(fixed_problem); fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream())); std::vector default_weights(fixed_problem.n_constraints, 1.); - cpu_fj_thread_t cpu_fj_thread; - cpu_fj_thread.fj_cpu = fj.create_cpu_climber(fj_solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); - cpu_fj_thread.fj_ptr = &fj; - cpu_fj_thread.fj_cpu->log_prefix = "[RINS] "; - cpu_fj_thread.time_limit = time_limit; - cpu_fj_thread.start_cpu_solver(); + + std::unique_ptr> fj_cpu = + fj.create_cpu_climber(fj_solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + true); + fj_cpu->log_prefix = "[RINS] "; + +#pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none) + cpufj_solve(fj_cpu.get(), time_limit); f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound() : -std::numeric_limits::infinity(); @@ -311,13 +298,12 @@ void rins_t::run_rins() static_cast(context.settings.heuristic_params.rins_max_time_limit)); } - cpu_fj_thread.stop_cpu_solver(); - bool fj_solution_found = cpu_fj_thread.wait_for_cpu_solver(); - CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", cpu_fj_thread.fj_cpu->iterations); - if (fj_solution_found) { - CUOPT_LOG_DEBUG("RINS FJ solution found. Objective %.16e", - cpu_fj_thread.fj_cpu->h_best_objective); - rins_solution_queue.push_back(cpu_fj_thread.fj_cpu->h_best_assignment); +#pragma omp taskwait + + CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations); + if (fj_cpu->feasible_found) { + CUOPT_LOG_DEBUG("RINS FJ solution found. 
Objective %.16e", fj_cpu->h_best_objective); + rins_solution_queue.push_back(fj_cpu->h_best_assignment); } // Thread will be automatically terminated and joined by destructor @@ -354,15 +340,14 @@ void rins_t::run_rins() if (improvement_found) total_success++; CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success); + launch_new_task = true; } #if MIP_INSTANTIATE_FLOAT -template class rins_thread_t; template class rins_t; #endif #if MIP_INSTANTIATE_DOUBLE -template class rins_thread_t; template class rins_t; #endif diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cuh b/cpp/src/mip_heuristics/diversity/lns/rins.cuh index 0a9133f848..b1b62bd1ae 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cuh +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cuh @@ -17,19 +17,11 @@ #pragma once -#include #include #include -#include -#include +#include -#include -#include -#include -#include -#include -#include #include namespace cuopt::linear_programming::detail { @@ -52,18 +44,6 @@ struct rins_settings_t { template class rins_t; -template -struct rins_thread_t : public cpu_worker_thread_base_t> { - ~rins_thread_t(); - - void run_worker(); - void on_terminate() {} - void on_start() {} - bool get_result() { return true; } - - rins_t* rins_ptr{nullptr}; -}; - template class rins_t { public: @@ -74,7 +54,6 @@ class rins_t { void node_callback(const std::vector& solution, f_t objective); void new_best_incumbent_callback(const std::vector& solution); void enable(); - void stop_rins(); void run_rins(); @@ -96,15 +75,13 @@ class rins_t { f_t time_limit{10.}; i_t seed; - std::atomic enabled{false}; - std::atomic lower_bound{0.}; - - std::atomic node_count{0}; - std::atomic node_count_at_last_rins{0}; - std::atomic node_count_at_last_improvement{0}; - std::mutex rins_mutex; + omp_atomic_t enabled{false}; + omp_atomic_t lower_bound{0.}; - std::unique_ptr> rins_thread; + omp_atomic_t node_count{0}; + omp_atomic_t node_count_at_last_rins{0}; + omp_atomic_t 
node_count_at_last_improvement{0}; + omp_atomic_t launch_new_task{true}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 8109653e6f..15ff3f4eb7 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -7,9 +7,7 @@ #include "early_cpufj.cuh" -#include #include -#include namespace cuopt::linear_programming::detail { @@ -32,40 +30,38 @@ early_cpufj_t::~early_cpufj_t() template void early_cpufj_t::start() { - if (cpu_fj_thread_) { return; } + if (fj_cpu_) { return; } this->preemption_flag_.store(false); this->start_time_ = std::chrono::steady_clock::now(); - cpu_fj_thread_ = std::make_unique>(); - cpu_fj_thread_->fj_cpu = - init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_); - cpu_fj_thread_->time_limit = std::numeric_limits::infinity(); + fj_cpu_ = init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_); - cpu_fj_thread_->fj_cpu->log_prefix = "[Early CPUFJ] "; + fj_cpu_->log_prefix = "[Early CPUFJ] "; - cpu_fj_thread_->fj_cpu->improvement_callback = - [this](f_t solver_obj, const std::vector& assignment, double) { - this->try_update_best(solver_obj, assignment); - }; + fj_cpu_->improvement_callback = [this](f_t solver_obj, + const std::vector& assignment, + double) { this->try_update_best(solver_obj, assignment); }; - cpu_fj_thread_->start_cpu_solver(); +#pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none) + cpufj_solve(fj_cpu_.get()); } template void early_cpufj_t::stop() { - if (!cpu_fj_thread_) { return; } + if (!fj_cpu_) { return; } preemption_flag_.store(true); - cpu_fj_thread_->stop_cpu_solver(); - cpu_fj_thread_->wait_for_cpu_solver(); + + fj_cpu_->halted = true; +#pragma omp taskwait depend(in : *fj_cpu_) CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, 
solution_found=%d", - cpu_fj_thread_->fj_cpu ? cpu_fj_thread_->fj_cpu->iterations : 0, + fj_cpu_ ? fj_cpu_->iterations : 0, this->solution_found_); - cpu_fj_thread_.reset(); + fj_cpu_.reset(); } #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh index 911e846551..fd85e4b9f3 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh @@ -8,15 +8,13 @@ #pragma once #include +#include #include #include namespace cuopt::linear_programming::detail { -template -struct cpu_fj_thread_t; - template class early_cpufj_t : public early_heuristic_t> { public: @@ -32,7 +30,7 @@ class early_cpufj_t : public early_heuristic_t void stop(); private: - std::unique_ptr> cpu_fj_thread_; + std::unique_ptr> fj_cpu_; std::atomic preemption_flag_{false}; }; diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh index 50b451a86e..33d1ac527f 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh @@ -216,8 +216,6 @@ class fj_t { std::atomic& preemption_flag, fj_settings_t settings = fj_settings_t{}, bool randomize_params = false); - bool cpu_solve(fj_cpu_climber_t& fj_cpu, - f_t time_limit = +std::numeric_limits::infinity()); i_t alloc_max_climbers(i_t desired_climbers); void resize_vectors(const raft::handle_t* handle_ptr); void device_init(const rmm::cuda_stream_view& stream); diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index 4eaa5b6a21..aea6295528 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -1417,7 +1417,7 @@ std::unique_ptr> fj_t::create_cpu_climber( } template -static bool cpufj_solve_loop(fj_cpu_climber_t& 
fj_cpu, f_t in_time_limit) +void cpufj_solve(fj_cpu_climber_t* fj_cpu, f_t in_time_limit) { i_t local_mins = 0; auto loop_start = std::chrono::high_resolution_clock::now(); @@ -1425,37 +1425,37 @@ static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_lim auto loop_time_start = std::chrono::high_resolution_clock::now(); // Initialize feature tracking - fj_cpu.last_feature_log_time = loop_start; - fj_cpu.prev_best_objective = fj_cpu.h_best_objective; - fj_cpu.iterations_since_best = 0; + fj_cpu->last_feature_log_time = loop_start; + fj_cpu->prev_best_objective = fj_cpu->h_best_objective; + fj_cpu->iterations_since_best = 0; - while (!fj_cpu.halted && !fj_cpu.preemption_flag.load()) { + while (!fj_cpu->halted && !fj_cpu->preemption_flag.load()) { // Check if 5 seconds have passed auto now = std::chrono::high_resolution_clock::now(); if (in_time_limit < std::numeric_limits::infinity() && now - loop_time_start > time_limit) { CUOPT_LOG_TRACE("%sTime limit of %.4f seconds reached, breaking loop at iteration %d", - fj_cpu.log_prefix.c_str(), + fj_cpu->log_prefix.c_str(), time_limit.count() / 1000.f, - fj_cpu.iterations); + fj_cpu->iterations); break; } - if (fj_cpu.iterations >= fj_cpu.settings.iteration_limit) { + if (fj_cpu->iterations >= fj_cpu->settings.iteration_limit) { CUOPT_LOG_TRACE("%sIteration limit of %d reached, breaking loop at iteration %d", - fj_cpu.log_prefix.c_str(), - fj_cpu.settings.iteration_limit, - fj_cpu.iterations); + fj_cpu->log_prefix.c_str(), + fj_cpu->settings.iteration_limit, + fj_cpu->iterations); break; } // periodically recompute the LHS and violation scores // to correct any accumulated numerical errors - cuopt_assert(fj_cpu.settings.parameters.lhs_refresh_period > 0, + cuopt_assert(fj_cpu->settings.parameters.lhs_refresh_period > 0, "lhs_refresh_period should be positive"); - if (fj_cpu.iterations % fj_cpu.settings.parameters.lhs_refresh_period == 0 || - fj_cpu.trigger_early_lhs_recomputation) { - recompute_lhs(fj_cpu); - 
fj_cpu.trigger_early_lhs_recomputation = false; + if (fj_cpu->iterations % fj_cpu->settings.parameters.lhs_refresh_period == 0 || + fj_cpu->trigger_early_lhs_recomputation) { + recompute_lhs(*fj_cpu); + fj_cpu->trigger_early_lhs_recomputation = false; } fj_move_t move = fj_move_t{-1, 0}; @@ -1465,153 +1465,113 @@ static bool cpufj_solve_loop(fj_cpu_climber_t& fj_cpu, f_t in_time_lim bool is_mtm_sat = false; // Perform lift moves - if (fj_cpu.violated_constraints.empty()) { - thrust::tie(move, score) = find_lift_move(fj_cpu); + if (fj_cpu->violated_constraints.empty()) { + thrust::tie(move, score) = find_lift_move(*fj_cpu); if (score > fj_staged_score_t::zero()) is_lift = true; } // Regular MTM if (!(score > fj_staged_score_t::zero())) { - thrust::tie(move, score) = find_mtm_move_viol(fj_cpu, fj_cpu.mtm_viol_samples); + thrust::tie(move, score) = find_mtm_move_viol(*fj_cpu, fj_cpu->mtm_viol_samples); if (score > fj_staged_score_t::zero()) is_mtm_viol = true; } // try with MTM in satisfied constraints - if (fj_cpu.feasible_found && !(score > fj_staged_score_t::zero())) { - thrust::tie(move, score) = find_mtm_move_sat(fj_cpu, fj_cpu.mtm_sat_samples); + if (fj_cpu->feasible_found && !(score > fj_staged_score_t::zero())) { + thrust::tie(move, score) = find_mtm_move_sat(*fj_cpu, fj_cpu->mtm_sat_samples); if (score > fj_staged_score_t::zero()) is_mtm_sat = true; } // if we're in the feasible region but haven't found improvements in the last n iterations, // perturb bool should_perturb = false; - if (fj_cpu.violated_constraints.empty() && - fj_cpu.iterations - fj_cpu.last_feasible_entrance_iter > fj_cpu.perturb_interval) { - should_perturb = true; - fj_cpu.last_feasible_entrance_iter = fj_cpu.iterations; + if (fj_cpu->violated_constraints.empty() && + fj_cpu->iterations - fj_cpu->last_feasible_entrance_iter > fj_cpu->perturb_interval) { + should_perturb = true; + fj_cpu->last_feasible_entrance_iter = fj_cpu->iterations; } if (score > fj_staged_score_t::zero() && 
!should_perturb) { - apply_move(fj_cpu, move.var_idx, move.value, false); + apply_move(*fj_cpu, move.var_idx, move.value, false); // Track move types - if (is_lift) fj_cpu.n_lift_moves_window++; - if (is_mtm_viol) fj_cpu.n_mtm_viol_moves_window++; - if (is_mtm_sat) fj_cpu.n_mtm_sat_moves_window++; + if (is_lift) fj_cpu->n_lift_moves_window++; + if (is_mtm_viol) fj_cpu->n_mtm_viol_moves_window++; + if (is_mtm_sat) fj_cpu->n_mtm_sat_moves_window++; } else { // Local Min - update_weights(fj_cpu); + update_weights(*fj_cpu); if (should_perturb) { - perturb(fj_cpu); - for (size_t i = 0; i < fj_cpu.cached_mtm_moves.size(); i++) - fj_cpu.cached_mtm_moves[i].first = 0; + perturb(*fj_cpu); + for (size_t i = 0; i < fj_cpu->cached_mtm_moves.size(); i++) + fj_cpu->cached_mtm_moves[i].first = 0; } thrust::tie(move, score) = - find_mtm_move_viol(fj_cpu, 1, true); // pick a single random violated constraint + find_mtm_move_viol(*fj_cpu, 1, true); // pick a single random violated constraint i_t var_idx = move.var_idx >= 0 ? move.var_idx : 0; f_t delta = move.var_idx >= 0 ? move.value : 0; - apply_move(fj_cpu, var_idx, delta, true); + apply_move(*fj_cpu, var_idx, delta, true); ++local_mins; - ++fj_cpu.n_local_minima_window; + ++fj_cpu->n_local_minima_window; } // number of violated constraints is usually small (<100). 
recomputing from all LHSs is cheap // and more numerically precise than just adding to the accumulator in apply_move - fj_cpu.total_violations = 0; - for (auto cstr_idx : fj_cpu.violated_constraints) { - fj_cpu.total_violations += fj_cpu.view.excess_score(cstr_idx, fj_cpu.h_lhs[cstr_idx]); + fj_cpu->total_violations = 0; + for (auto cstr_idx : fj_cpu->violated_constraints) { + fj_cpu->total_violations += fj_cpu->view.excess_score(cstr_idx, fj_cpu->h_lhs[cstr_idx]); } - if (fj_cpu.iterations % fj_cpu.log_interval == 0) { + if (fj_cpu->iterations % fj_cpu->log_interval == 0) { CUOPT_LOG_TRACE( "%sCPUFJ iteration: %d/%d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, " "maxw %g", - fj_cpu.log_prefix.c_str(), - fj_cpu.iterations, - fj_cpu.settings.iteration_limit != std::numeric_limits::max() - ? fj_cpu.settings.iteration_limit + fj_cpu->log_prefix.c_str(), + fj_cpu->iterations, + fj_cpu->settings.iteration_limit != std::numeric_limits::max() + ? fj_cpu->settings.iteration_limit : -1, local_mins, - fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective), - fj_cpu.violated_constraints.size(), - fj_cpu.h_objective_weight, - fj_cpu.max_weight); + fj_cpu->pb_ptr->get_user_obj_from_solver_obj(fj_cpu->h_best_objective), + fj_cpu->violated_constraints.size(), + fj_cpu->h_objective_weight, + fj_cpu->max_weight); } // send current solution to callback every 3000 steps for diversity - if (fj_cpu.iterations % fj_cpu.diversity_callback_interval == 0) { - if (fj_cpu.diversity_callback) { - fj_cpu.diversity_callback(fj_cpu.h_incumbent_objective, fj_cpu.h_assignment); + if (fj_cpu->iterations % fj_cpu->diversity_callback_interval == 0) { + if (fj_cpu->diversity_callback) { + fj_cpu->diversity_callback(fj_cpu->h_incumbent_objective, fj_cpu->h_assignment); } } // Print timing statistics every N iterations #if CPUFJ_TIMING_TRACE - if (fj_cpu.iterations % fj_cpu.timing_stats_interval == 0 && fj_cpu.iterations > 0) { - print_timing_stats(fj_cpu); + if 
(fj_cpu->iterations % fj_cpu->timing_stats_interval == 0 && fj_cpu->iterations > 0) { + print_timing_stats(*fj_cpu); } #endif - if (fj_cpu.iterations % 100 == 0 && fj_cpu.iterations > 0) { + if (fj_cpu->iterations % 100 == 0 && fj_cpu->iterations > 0) { // Collect memory statistics - auto [loads, stores] = fj_cpu.memory_aggregator.collect(); - double biased_work = (loads + stores) * fj_cpu.work_unit_bias / 1e10; - fj_cpu.work_units_elapsed += biased_work; + auto [loads, stores] = fj_cpu->memory_aggregator.collect(); + double biased_work = (loads + stores) * fj_cpu->work_unit_bias / 1e10; + fj_cpu->work_units_elapsed += biased_work; - if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); } + if (fj_cpu->producer_sync != nullptr) { fj_cpu->producer_sync->notify_progress(); } } - cuopt_func_call(sanity_checks(fj_cpu)); - fj_cpu.iterations++; - fj_cpu.iterations_since_best++; + cuopt_func_call(sanity_checks(*fj_cpu)); + fj_cpu->iterations++; + fj_cpu->iterations_since_best++; } auto loop_end = std::chrono::high_resolution_clock::now(); double total_time = std::chrono::duration_cast>(loop_end - loop_start).count(); - double avg_time_per_iter = total_time / fj_cpu.iterations; + double avg_time_per_iter = fj_cpu->iterations > 0 ? 
total_time / fj_cpu->iterations : 0; CUOPT_LOG_TRACE("%sCPUFJ Average time per iteration: %.8fms", - fj_cpu.log_prefix.c_str(), + fj_cpu->log_prefix.c_str(), avg_time_per_iter * 1000.0); #if CPUFJ_TIMING_TRACE // Print final timing statistics CUOPT_LOG_TRACE("=== Final Timing Statistics ==="); - print_timing_stats(fj_cpu); + print_timing_stats(*fj_cpu); #endif - - return fj_cpu.feasible_found; -} - -template -bool fj_t::cpu_solve(fj_cpu_climber_t& fj_cpu, f_t in_time_limit) -{ - raft::common::nvtx::range scope("fj_cpu"); - return cpufj_solve_loop(fj_cpu, in_time_limit); -} - -template -cpu_fj_thread_t::~cpu_fj_thread_t() -{ - this->request_termination(); -} - -template -void cpu_fj_thread_t::run_worker() -{ - cpu_fj_solution_found = cpufj_solve_loop(*fj_cpu, time_limit); -} - -template -void cpu_fj_thread_t::on_terminate() -{ - if (fj_cpu) fj_cpu->halted = true; -} - -template -void cpu_fj_thread_t::on_start() -{ - cuopt_assert(fj_cpu != nullptr, "fj_cpu must not be null"); - fj_cpu->halted = false; -} - -template -void cpu_fj_thread_t::stop_cpu_solver() -{ - fj_cpu->halted = true; } template @@ -1635,7 +1595,7 @@ std::unique_ptr> init_fj_cpu_standalone( #if MIP_INSTANTIATE_FLOAT template class fj_t; -template class cpu_fj_thread_t; +template void cpufj_solve(fj_cpu_climber_t* fj_cpu, float in_time_limit); template std::unique_ptr> init_fj_cpu_standalone( problem_t& problem, solution_t& solution, @@ -1645,7 +1605,7 @@ template std::unique_ptr> init_fj_cpu_standalone( #if MIP_INSTANTIATE_DOUBLE template class fj_t; -template class cpu_fj_thread_t; +template void cpufj_solve(fj_cpu_climber_t* fj_cpu, double in_time_limit); template std::unique_ptr> init_fj_cpu_standalone( problem_t& problem, solution_t& solution, diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh index 3263609a2b..76bf158f9e 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh +++ 
b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh @@ -8,16 +8,12 @@ #pragma once #include -#include #include #include -#include -#include #include #include #include -#include #include #include @@ -126,7 +122,7 @@ struct fj_cpu_climber_t { // vector is actually likely beneficial here since we're memory bound std::vector flip_move_computed; - ; + // CSR nnz offset -> (delta, score) std::vector> cached_mtm_moves; @@ -194,21 +190,8 @@ struct fj_cpu_climber_t { }; template -struct cpu_fj_thread_t : public cpu_worker_thread_base_t> { - ~cpu_fj_thread_t(); - - void run_worker(); - void on_terminate(); - void on_start(); - bool get_result() { return cpu_fj_solution_found; } - - void stop_cpu_solver(); - - std::atomic cpu_fj_solution_found{false}; - f_t time_limit{+std::numeric_limits::infinity()}; - std::unique_ptr> fj_cpu; - fj_t* fj_ptr{nullptr}; -}; +void cpufj_solve(fj_cpu_climber_t* fj_cpu, + f_t in_time_limit = std::numeric_limits::infinity()); // Standalone CPUFJ init for running without full fj_t infrastructure (avoids GPU allocations). // Used for early CPUFJ during presolve. 
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index da29511d70..9ea7743e5e 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -20,10 +20,6 @@ #include -#include - -#include - namespace cuopt::linear_programming::detail { template @@ -47,20 +43,11 @@ local_search_t::local_search_t(mip_solver_context_t& context problem_with_objective_cut(*context.problem_ptr, context.problem_ptr->handle_ptr) { const int n_cpufj = context.settings.heuristic_params.num_cpufj_threads; - for (int i = 0; i < n_cpufj; ++i) { - ls_cpu_fj.push_back(std::make_unique>()); - ls_cpu_fj.back()->fj_ptr = &fj; - } - scratch_cpu_fj.push_back(std::make_unique>()); - scratch_cpu_fj.back()->fj_ptr = &fj; - scratch_cpu_fj_on_lp_opt.fj_ptr = &fj; - + ls_cpu_fj.resize(n_cpufj); + scratch_cpu_fj.resize(1); fj.settings.n_of_minimums_for_exit = context.settings.heuristic_params.n_of_minimums_for_exit; } -static double local_search_best_obj = std::numeric_limits::max(); -static population_t* pop_ptr = nullptr; - template void local_search_t::start_cpufj_scratch_threads(population_t& population) { @@ -75,37 +62,38 @@ void local_search_t::start_cpufj_scratch_threads(population_t 0) solution.assign_random_within_bounds(0.4); - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - /*randomize=*/counter > 0); - - cpu_fj.fj_cpu->log_prefix = "******* scratch " + std::to_string(counter) + ": "; - cpu_fj.fj_cpu->improvement_callback = - [&population, problem_ptr = context.problem_ptr]( + cpu_fj = fj.create_cpu_climber(solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + /*randomize=*/counter > 0); + + cpu_fj->log_prefix = "******* scratch " + std::to_string(counter) + ": "; + cpu_fj->improvement_callback = + 
[this, &population, problem_ptr = context.problem_ptr]( f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); (void)problem_ptr; - if (obj < local_search_best_obj) { + if (obj < this->local_search_best_obj) { CUOPT_LOG_TRACE("******* New local search best obj %g, best overall %g", problem_ptr->get_user_obj_from_solver_obj(obj), problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? population.best_feasible().get_objective() : std::numeric_limits::max())); - local_search_best_obj = obj; + this->local_search_best_obj = obj; } }; counter++; }; - for (auto& cpu_fj_ptr : scratch_cpu_fj) { - cpu_fj_ptr->start_cpu_solver(); + for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) { + auto ptr = scratch_cpu_fj[i].get(); +#pragma omp task firstprivate(ptr) depend(out : *ptr) default(none) + cpufj_solve(ptr); } } @@ -121,34 +109,42 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( solution_lp.copy_new_assignment( host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream())); solution_lp.round_random_nearest(500); - scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber( + scratch_cpu_fj_on_lp_opt = fj.create_cpu_climber( solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_); - scratch_cpu_fj_on_lp_opt.fj_cpu->log_prefix = "******* scratch on LP optimal: "; - scratch_cpu_fj_on_lp_opt.fj_cpu->improvement_callback = + scratch_cpu_fj_on_lp_opt->log_prefix = "******* scratch on LP optimal: "; + scratch_cpu_fj_on_lp_opt->improvement_callback = [this, &population](f_t obj, const std::vector& h_vec, double /*work_units*/) { population.add_external_solution(h_vec, obj, solution_origin_t::CPUFJ); - if (obj < local_search_best_obj) { + if (obj < this->local_search_best_obj) { CUOPT_LOG_DEBUG("******* New local search best obj %g, best overall %g", context.problem_ptr->get_user_obj_from_solver_obj(obj), 
context.problem_ptr->get_user_obj_from_solver_obj( population.is_feasible() ? population.best_feasible().get_objective() : std::numeric_limits::max())); - local_search_best_obj = obj; + this->local_search_best_obj = obj; } }; // default weights cudaDeviceSynchronize(); - scratch_cpu_fj_on_lp_opt.start_cpu_solver(); + +#pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ + depend(out : *scratch_cpu_fj_on_lp_opt) + cpufj_solve(scratch_cpu_fj_on_lp_opt.get()); } template void local_search_t::stop_cpufj_scratch_threads() { - for (auto& cpu_fj_ptr : scratch_cpu_fj) { - cpu_fj_ptr->request_termination(); + for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) { + if (!scratch_cpu_fj[i]) { continue; } // slot stays null until the scratch climbers are started + scratch_cpu_fj[i]->halted = true; +#pragma omp taskwait depend(in : *scratch_cpu_fj[i]) + } + if (scratch_cpu_fj_on_lp_opt) { + scratch_cpu_fj_on_lp_opt->halted = true; +#pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt) } - scratch_cpu_fj_on_lp_opt.request_termination(); } template @@ -164,29 +160,29 @@ void local_search_t::start_cpufj_deterministic( 0.0); solution.clamp_within_bounds(); - deterministic_cpu_fj.fj_ptr = &fj; - deterministic_cpu_fj.fj_cpu = fj.create_cpu_climber(solution, - default_weights, - default_weights, - 0., - context.preempt_heuristic_solver_, - fj_settings_t{}, - /*randomize=*/true); + deterministic_cpu_fj = fj.create_cpu_climber(solution, + default_weights, + default_weights, + 0., + context.preempt_heuristic_solver_, + fj_settings_t{}, + /*randomize=*/true); - deterministic_cpu_fj.fj_cpu->log_prefix = "******* deterministic CPUFJ: "; + deterministic_cpu_fj->log_prefix = "******* deterministic CPUFJ: "; // Register with producer_sync for B&B synchronization - producer_sync_t& producer_sync = bb.get_producer_sync(); - deterministic_cpu_fj.fj_cpu->producer_sync = &producer_sync; - producer_sync.register_producer(&deterministic_cpu_fj.fj_cpu->work_units_elapsed); + producer_sync_t& producer_sync = bb.get_producer_sync(); + deterministic_cpu_fj->producer_sync = 
&producer_sync; + producer_sync.register_producer(&deterministic_cpu_fj->work_units_elapsed); // Set up callback to send solutions to B&B with work unit timestamps - deterministic_cpu_fj.fj_cpu->improvement_callback = + deterministic_cpu_fj->improvement_callback = [&bb](f_t obj, const std::vector& h_vec, double work_units) { bb.queue_external_solution_deterministic(h_vec, work_units); }; - deterministic_cpu_fj.start_cpu_solver(); +#pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj) + cpufj_solve(deterministic_cpu_fj.get()); // Signal that registration is complete - B&B can now wait on producers producer_sync.registration_complete(); @@ -195,12 +191,14 @@ void local_search_t::start_cpufj_deterministic( template void local_search_t::stop_cpufj_deterministic() { - if (deterministic_cpu_fj.fj_cpu) { - if (deterministic_cpu_fj.fj_cpu->producer_sync) { - deterministic_cpu_fj.fj_cpu->producer_sync->deregister_producer( - &deterministic_cpu_fj.fj_cpu->work_units_elapsed); + if (deterministic_cpu_fj) { + if (deterministic_cpu_fj->producer_sync) { + deterministic_cpu_fj->producer_sync->deregister_producer( + &deterministic_cpu_fj->work_units_elapsed); } - deterministic_cpu_fj.request_termination(); + + deterministic_cpu_fj->halted = true; +#pragma omp taskwait depend(in : *deterministic_cpu_fj) } } @@ -233,48 +231,44 @@ bool local_search_t::do_fj_solve(solution_t& solution, } auto h_weights = cuopt::host_copy(in_fj.cstr_weights, solution.handle_ptr->get_stream()); auto h_objective_weight = in_fj.objective_weight.value(solution.handle_ptr->get_stream()); - for (auto& cpu_fj_ptr : ls_cpu_fj) { - auto& cpu_fj = *cpu_fj_ptr; - cpu_fj.fj_cpu = cpu_fj.fj_ptr->create_cpu_climber(solution, - h_weights, - h_weights, - h_objective_weight, - context.preempt_heuristic_solver_, - fj_settings_t{}, - true); + for (auto& cpu_fj : ls_cpu_fj) { + cpu_fj = fj.create_cpu_climber(solution, + h_weights, + h_weights, + h_objective_weight, + 
context.preempt_heuristic_solver_, + fj_settings_t{}, + true); } auto solution_copy = solution; // Start CPU solver in background thread - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->start_cpu_solver(); - } +#pragma omp taskgroup + { +#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + cpufj_solve(ls_cpu_fj[i].get()); + } - // Run GPU solver and measure execution time - auto gpu_fj_start = std::chrono::high_resolution_clock::now(); - in_fj.settings.time_limit = timer.remaining_time(); - in_fj.solve(solution); + // Run GPU solver + in_fj.settings.time_limit = timer.remaining_time(); + in_fj.solve(solution); - // Stop CPU solver - for (auto& cpu_fj_ptr : ls_cpu_fj) { - cpu_fj_ptr->stop_cpu_solver(); + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + ls_cpu_fj[i]->halted = true; + } } - auto gpu_fj_end = std::chrono::high_resolution_clock::now(); - double gpu_fj_duration = std::chrono::duration(gpu_fj_end - gpu_fj_start).count(); - solution_t solution_cpu(*solution.problem_ptr); - f_t best_cpu_obj = std::numeric_limits::max(); - // // Wait for CPU solver to finish - for (auto& cpu_fj_ptr : ls_cpu_fj) { - bool cpu_sol_found = cpu_fj_ptr->wait_for_cpu_solver(); - if (cpu_sol_found) { - f_t cpu_obj = cpu_fj_ptr->fj_cpu->h_best_objective; + + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + if (ls_cpu_fj[i]->feasible_found) { + f_t cpu_obj = ls_cpu_fj[i]->h_best_objective; if (cpu_obj < best_cpu_obj) { best_cpu_obj = cpu_obj; - solution_cpu.copy_new_assignment(cpu_fj_ptr->fj_cpu->h_best_assignment); + solution_cpu.copy_new_assignment(ls_cpu_fj[i]->h_best_assignment); solution_cpu.compute_feasibility(); } } diff --git a/cpp/src/mip_heuristics/local_search/local_search.cuh b/cpp/src/mip_heuristics/local_search/local_search.cuh index 94493ebcb3..9befd34ab5 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cuh +++ b/cpp/src/mip_heuristics/local_search/local_search.cuh 
@@ -11,16 +11,10 @@ #include #include #include -#include #include +#include #include -#include -#include -#include -#include -#include - namespace cuopt::linear_programming::dual_simplex { template class branch_and_bound_t; @@ -126,12 +120,15 @@ class local_search_t { feasibility_pump_t fp; std::mt19937 rng; - std::vector>> ls_cpu_fj; - std::vector>> scratch_cpu_fj; - cpu_fj_thread_t scratch_cpu_fj_on_lp_opt; - cpu_fj_thread_t deterministic_cpu_fj; + std::vector>> ls_cpu_fj; + std::vector>> scratch_cpu_fj; + std::unique_ptr> scratch_cpu_fj_on_lp_opt; + std::unique_ptr> deterministic_cpu_fj; problem_t problem_with_objective_cut; bool cutting_plane_added_for_active_run{false}; + + omp_atomic_t local_search_best_obj{std::numeric_limits::max()}; + population_t* pop_ptr{nullptr}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh index 8b57cc7019..ed0b91466d 100644 --- a/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh +++ b/cpp/src/mip_heuristics/presolve/bounds_presolve.cuh @@ -34,7 +34,7 @@ class bound_presolve_t { struct settings_t { f_t time_limit{60.0}; i_t iteration_limit{std::numeric_limits::max()}; - i_t num_threads = -1; + i_t num_tasks = -1; bool parallel_bounds_update{true}; }; diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 24cac7129f..74e04c89bb 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -246,7 +246,8 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( std::vector constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1}); std::unordered_set cnstr_pair; -#pragma omp parallel for private(cnstr_pair) +#pragma omp taskloop private(cnstr_pair) default(none) \ + shared(offsets, variables, 
reverse_offsets, reverse_constraints, constraint_pairs_h) for (int cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { for (int jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { int var = variables[jj]; diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index e45f2394ed..0dc2ad6e8e 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -22,6 +22,7 @@ #include #include +#include namespace cuopt::linear_programming::detail { @@ -860,18 +861,17 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, bound_presolve.settings.iteration_limit = 50; bound_presolve.settings.time_limit = timer.remaining_time(); - size_t num_threads = bound_presolve.settings.num_threads < 0 - ? 0.2 * omp_get_max_threads() - : bound_presolve.settings.num_threads; - num_threads = std::clamp(num_threads, 1, 8); + size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 0.2 * omp_get_max_threads() + : bound_presolve.settings.num_tasks; + num_tasks = 1; // std::clamp(num_tasks, 1, 8); // Create a vector of multi_probe_t objects std::vector> multi_probe_presolve_pool; - std::vector>> modification_vector_pool(num_threads); - std::vector>> substitution_vector_pool(num_threads); + std::vector>> modification_vector_pool(num_tasks); + std::vector>> substitution_vector_pool(num_tasks); // Initialize multi_probe_presolve_pool - for (size_t i = 0; i < num_threads; i++) { + for (size_t i = 0; i < num_tasks; i++) { multi_probe_presolve_pool.emplace_back(bound_presolve.context); multi_probe_presolve_pool[i].resize(problem); multi_probe_presolve_pool[i].compute_stats = true; @@ -890,23 +890,39 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, // are visible before any per-thread kernel can reference that memory. 
problem.handle_ptr->sync_stream(); -// Main parallel loop -#pragma omp parallel num_threads(num_threads) - { - for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) { - if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } - size_t step_end = std::min(step_start + step_size, priority_indices.size()); - -#pragma omp for - for (size_t i = step_start; i < step_end; ++i) { + // Main parallel loop + for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) { + if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } + size_t step_end = std::min(step_start + step_size, priority_indices.size()); + +#pragma omp taskloop num_tasks(num_tasks) default(none) firstprivate(step_start, step_end) \ + shared(num_tasks, \ + priority_indices, \ + timer, \ + multi_probe_presolve_pool, \ + bound_presolve, \ + problem, \ + h_var_bounds, \ + h_integer_indices, \ + n_of_implied_singletons, \ + n_of_cached_probings, \ + problem_is_infeasible, \ + modification_vector_pool, \ + substitution_vector_pool) + for (size_t task_id = 0; task_id < num_tasks; ++task_id) { + size_t n = step_end - step_start; // size of this step's slice of priority_indices + size_t begin = step_start + std::floor(static_cast(n) * task_id / num_tasks); + size_t end = step_start + std::floor(static_cast(n) * (task_id + 1) / num_tasks); + auto& multi_probe_presolve = multi_probe_presolve_pool[task_id]; + auto& modification_vector = modification_vector_pool[task_id]; + auto& substitution_vector = substitution_vector_pool[task_id]; + if (timer.check_time_limit()) { continue; } + + for (size_t i = begin; i < end; ++i) { auto var_idx = priority_indices[i]; if (timer.check_time_limit()) { continue; } - int thread_idx = omp_get_thread_num(); - CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, thread_idx); - - auto& multi_probe_presolve = multi_probe_presolve_pool[thread_idx]; - + CUOPT_LOG_TRACE("Computing probing cache for var %d on thread 
%d", var_idx, static_cast<int>(task_id)); compute_cache_for_var(var_idx, bound_presolve, problem, @@ -916,30 +932,29 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, n_of_implied_singletons, n_of_cached_probings, problem_is_infeasible, - modification_vector_pool[thread_idx], - substitution_vector_pool[thread_idx], + modification_vector, + substitution_vector, timer, problem.handle_ptr->get_device()); } } -#pragma omp single - { - // TODO when we have determinism, check current threads work/time counter and filter queue - // items that are smaller or equal to that - apply_modification_queue_to_problem(modification_vector_pool, problem); - // copy host bounds again, because we changed some problem bounds - raft::copy(h_var_bounds.data(), - problem.variable_bounds.data(), - h_var_bounds.size(), - problem.handle_ptr->get_stream()); - problem.handle_ptr->sync_stream(); - if (n_of_implied_singletons - last_it_implied_singletons < - (size_t)std::max(2, (min(100, problem.n_variables / 50)))) { - early_exit = true; - } - last_it_implied_singletons = n_of_implied_singletons; + + // TODO when we have determinism, check current threads work/time counter and filter queue + // items that are smaller or equal to that + apply_modification_queue_to_problem(modification_vector_pool, problem); + // copy host bounds again, because we changed some problem bounds + raft::copy(h_var_bounds.data(), + problem.variable_bounds.data(), + h_var_bounds.size(), + problem.handle_ptr->get_stream()); + problem.handle_ptr->sync_stream(); + if (n_of_implied_singletons - last_it_implied_singletons < + (size_t)std::max(2, (min(100, problem.n_variables / 50)))) { + early_exit = true; } + last_it_implied_singletons = n_of_implied_singletons; } // end of step + apply_substitution_queue_to_problem(substitution_vector_pool, problem); CUOPT_LOG_DEBUG("Total number of cached probings %lu number of implied singletons %lu", n_of_cached_probings.load(), diff --git a/cpp/src/mip_heuristics/solve.cu 
b/cpp/src/mip_heuristics/solve.cu index 4e9cd6a2a5..0932f0e5a9 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -47,6 +47,7 @@ #include #include +#include namespace cuopt::linear_programming { @@ -80,10 +81,10 @@ static void invoke_solution_callbacks( } template -mip_solution_t run_mip(detail::problem_t& problem, - mip_solver_settings_t const& settings, - timer_t& timer, - f_t initial_cutoff = std::numeric_limits::infinity()) +mip_solution_t run_mip_solver(detail::problem_t& problem, + mip_solver_settings_t const& settings, + timer_t& timer, + f_t initial_cutoff = std::numeric_limits::infinity()) { try { raft::common::nvtx::range fun_scope("run_mip"); @@ -228,8 +229,8 @@ mip_solution_t run_mip(detail::problem_t& problem, } template -mip_solution_t solve_mip(optimization_problem_t& op_problem, - mip_solver_settings_t const& settings_const) +mip_solution_t solve_mip_helper(optimization_problem_t& op_problem, + mip_solver_settings_t const& settings_const) { try { mip_solver_settings_t settings(settings_const); @@ -435,10 +436,9 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, CUOPT_LOG_INFO("Writing presolved problem to file: %s", settings.presolve_file.c_str()); presolve_result_opt->reduced_problem.write_to_mps(settings.presolve_file); } - // early_best_user_obj is in user-space. // run_mip stores it in context.initial_cutoff and converts to target spaces as needed. 
- auto sol = run_mip(problem, settings, timer, early_best_user_obj); + auto sol = run_mip_solver(problem, settings, timer, early_best_user_obj); if (run_presolve) { auto status_to_skip = sol.get_termination_status() == mip_termination_status_t::TimeLimit || @@ -502,6 +502,50 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, throw; } } +template +mip_solution_t solve_mip(optimization_problem_t& op_problem, + mip_solver_settings_t const& settings_const) +{ + std::exception_ptr exception; + i_t num_threads = 0; + if (settings_const.num_cpu_threads < 0) { + num_threads = omp_get_max_threads(); + } else { + if (settings_const.num_cpu_threads < 4) { + CUOPT_LOG_ERROR( + "The MIP solver requires at least 4 CPU threads! Setting the number of threads to 4."); + } + + num_threads = std::max(4, settings_const.num_cpu_threads); + } + + // TODO: Remove this after converting deterministic B&B to use tasks. This allows + // creating a nested parallel region. + omp_set_max_active_levels(2); + + // Placeholder result; replaced by the output of solve_mip_helper in the parallel region below. + mip_solution_t sol(mip_termination_status_t::NoTermination, + solver_stats_t{}, + op_problem.get_handle_ptr()->get_stream()); + +#pragma omp parallel num_threads(num_threads) default(none) \ + shared(sol, op_problem, settings_const, exception) + { +#pragma omp master + { + try { + sol = solve_mip_helper(op_problem, settings_const); + } catch (...) { + // We cannot throw inside an OpenMP parallel region. So we need to catch and then + // re-throw later. 
+ exception = std::current_exception(); + } + } + } + + if (exception) { std::rethrow_exception(exception); } + return sol; +} template mip_solution_t solve_mip( diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 737d81201f..e8336a0818 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -309,12 +309,7 @@ solution_t mip_solver_t::run_solver() dual_simplex::probing_implied_bound_t probing_implied_bound; - i_t num_threads = 0; - if (context.settings.num_cpu_threads < 0) { - num_threads = omp_get_max_threads(); - } else { - num_threads = std::max(1, context.settings.num_cpu_threads); - } + i_t num_threads = omp_get_num_threads(); if (!context.settings.heuristics_only) { // Convert the presolved problem to dual_simplex::user_problem_t @@ -330,7 +325,7 @@ solution_t mip_solver_t::run_solver() // Fill in the settings for branch and bound branch_and_bound_settings.time_limit = timer_.get_time_limit(); branch_and_bound_settings.node_limit = context.settings.node_limit; - branch_and_bound_settings.num_threads = num_threads - 1; + branch_and_bound_settings.num_threads = std::max(num_threads - 1, 1); branch_and_bound_settings.print_presolve_stats = false; branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap; branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap; @@ -455,33 +450,25 @@ solution_t mip_solver_t::run_solver() if (timer_.check_time_limit()) { CUOPT_LOG_INFO("Time limit reached during B&B setup"); - solution_t sol(*context.problem_ptr); context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); return sol; } } -#pragma omp parallel num_threads(num_threads) default(none) \ - shared(sol, branch_and_bound, branch_and_bound_status, branch_and_bound_solution, dm, context) +#pragma omp taskgroup { -#pragma omp master - { - if (!context.settings.heuristics_only) { -#pragma omp 
task - { - branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); - } - } - -#pragma omp task + if (!context.settings.heuristics_only) { +#pragma omp task default(none) \ + shared(branch_and_bound, branch_and_bound_solution, branch_and_bound_status) { - // Start the primal heuristics - context.diversity_manager_ptr = &dm; - // Start the primal heuristics - sol = dm.run_solver(); + branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); } } + + // Start the primal heuristics + context.diversity_manager_ptr = &dm; + sol = dm.run_solver(); } if (!context.settings.heuristics_only) { @@ -508,7 +495,6 @@ solution_t mip_solver_t::run_solver() } context.stats.total_solve_time = timer_.elapsed_time(); context.problem_ptr->post_process_solution(sol); - dm.rins.stop_rins(); return sol; } diff --git a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh b/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh deleted file mode 100644 index 2b982e1f47..0000000000 --- a/cpp/src/mip_heuristics/utilities/cpu_worker_thread.cuh +++ /dev/null @@ -1,147 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights - * reserved. SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace cuopt::linear_programming::detail { - -template -class cpu_worker_thread_base_t { - public: - cpu_worker_thread_base_t(); - ~cpu_worker_thread_base_t(); - - void start_cpu_solver(); - bool wait_for_cpu_solver(); - - // Derived classes MUST call this in their destructor before the base destructor runs. - // This ensures on_terminate() is called while the derived object is still fully alive. - void request_termination(); - - // Internal method for thread management - safe to call during destruction - void join_worker(); - void cpu_worker_thread(); - - std::thread cpu_worker; - std::mutex cpu_mutex; - std::condition_variable cpu_cv; - std::atomic should_stop{false}; - std::atomic cpu_thread_should_start{false}; - std::atomic cpu_thread_done{true}; - std::atomic cpu_thread_terminate{false}; -}; - -template -cpu_worker_thread_base_t::cpu_worker_thread_base_t() -{ - cpu_worker = std::thread(&cpu_worker_thread_base_t::cpu_worker_thread, this); -} - -template -cpu_worker_thread_base_t::~cpu_worker_thread_base_t() -{ - // Note: We don't call on_terminate() here since the derived object is already destroyed. 
- join_worker(); -} - -template -void cpu_worker_thread_base_t::cpu_worker_thread() -{ - while (!cpu_thread_terminate) { - { - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_should_start || cpu_thread_terminate; }); - - if (cpu_thread_terminate) break; - - cpu_thread_done = false; - cpu_thread_should_start = false; - } - - static_cast(this)->run_worker(); - - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = true; - } - cpu_cv.notify_all(); - } -} - -template -void cpu_worker_thread_base_t::request_termination() -{ - bool should_terminate = false; - { - std::lock_guard lock(cpu_mutex); - if (cpu_thread_terminate) return; - cpu_thread_terminate = true; - should_terminate = true; - static_cast(this)->on_terminate(); - } - - if (should_terminate) { - cpu_cv.notify_one(); - join_worker(); - } -} - -template -void cpu_worker_thread_base_t::join_worker() -{ - { - std::lock_guard lock(cpu_mutex); - if (!cpu_thread_terminate) { cpu_thread_terminate = true; } - } - cpu_cv.notify_one(); - - if (cpu_worker.joinable()) { cpu_worker.join(); } -} - -template -void cpu_worker_thread_base_t::start_cpu_solver() -{ - { - std::lock_guard lock(cpu_mutex); - cpu_thread_done = false; - cpu_thread_should_start = true; - static_cast(this)->on_start(); - } - cpu_cv.notify_one(); -} - -template -bool cpu_worker_thread_base_t::wait_for_cpu_solver() -{ - auto wait_start = std::chrono::high_resolution_clock::now(); - std::unique_lock lock(cpu_mutex); - cpu_cv.wait(lock, [this] { return cpu_thread_done || cpu_thread_terminate; }); - auto wait_end = std::chrono::high_resolution_clock::now(); - double wait_time = std::chrono::duration(wait_end - wait_start).count(); - if (wait_time > 1.0) { CUOPT_LOG_DEBUG("CPU thread wait time: %.2f seconds", wait_time); } - - return static_cast(this)->get_result(); -} - -} // namespace cuopt::linear_programming::detail From 1bb5d10215c036c2c83b3129966a7adc93eb4f6f Mon Sep 17 00:00:00 2001 From: "Nicolas L. 
Guidotti" Date: Tue, 14 Apr 2026 18:50:05 +0200 Subject: [PATCH 03/14] fixed compilation. removed debug code from probing_cache.cu Signed-off-by: Nicolas L. Guidotti --- cpp/src/mip_heuristics/diversity/diversity_manager.cu | 3 --- cpp/src/mip_heuristics/presolve/probing_cache.cu | 4 ++-- cpp/src/mip_heuristics/solve.cu | 3 ++- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/diversity_manager.cu b/cpp/src/mip_heuristics/diversity/diversity_manager.cu index b8dc3d33bf..f53648b0f8 100644 --- a/cpp/src/mip_heuristics/diversity/diversity_manager.cu +++ b/cpp/src/mip_heuristics/diversity/diversity_manager.cu @@ -603,18 +603,15 @@ solution_t diversity_manager_t::run_solver() generate_solution(timer.remaining_time(), false); if (timer.check_time_limit()) { - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); } if (check_b_b_preemption()) { - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); } run_fp_alone(); - rins.stop_rins(); population.add_external_solutions_to_population(); return population.best_feasible(); }; diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 0dc2ad6e8e..cfa177eaf5 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -861,9 +861,9 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, bound_presolve.settings.iteration_limit = 50; bound_presolve.settings.time_limit = timer.remaining_time(); - size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 0.2 * omp_get_max_threads() + size_t num_tasks = bound_presolve.settings.num_tasks < 0 ? 
0.2 * omp_get_num_threads() : bound_presolve.settings.num_tasks; - num_tasks = 1; // std::clamp(num_tasks, 1, 8); + num_tasks = std::clamp(num_tasks, 1, 8); // Create a vector of multi_probe_t objects std::vector> multi_probe_presolve_pool; diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index 7207e8acd9..ab6f324175 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -472,7 +472,8 @@ mip_solution_t solve_mip_helper(optimization_problem_t& op_p } // early_best_user_obj is in user-space. // run_mip stores it in context.initial_upper_bound and converts to target spaces as needed. - auto sol = run_mip(problem, settings, timer, early_best_user_obj, early_best_user_assignment); + auto sol = + run_mip_solver(problem, settings, timer, early_best_user_obj, early_best_user_assignment); const f_t cuopt_presolve_time = sol.get_stats().presolve_time; if (run_presolve) { From a7f2eb6830ea19a29934d57f1a8cff050d63ef9d Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 11:27:46 +0200 Subject: [PATCH 04/14] addressed coderabbit reviews. added some comments and logs Signed-off-by: Nicolas L. 
Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +++- cpp/src/branch_and_bound/pseudo_costs.cpp | 6 ++++++ cpp/src/mip_heuristics/diversity/lns/rins.cu | 7 ++++--- .../feasibility_jump/early_cpufj.cu | 2 ++ .../mip_heuristics/feasibility_jump/fj_cpu.cu | 2 +- .../local_search/local_search.cu | 18 ++++++++++++++++-- .../conditional_bound_strengthening.cu | 13 ++++++++----- .../mip_heuristics/presolve/probing_cache.cu | 2 ++ cpp/src/mip_heuristics/solve.cu | 17 ++++++++++------- cpp/src/mip_heuristics/solver.cu | 2 +- 10 files changed, 53 insertions(+), 20 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 6fb5955edd..367dfbb416 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1800,6 +1800,8 @@ void branch_and_bound_t::run_scheduler() template void branch_and_bound_t::single_threaded_solve() { + raft::common::nvtx::range scope("BB::single_threaded_solve"); + branch_and_bound_worker_t worker(0, original_lp_, Arow_, var_types_, settings_); f_t lower_bound = get_lower_bound(); @@ -2634,7 +2636,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } else { single_threaded_solve(); } - } + } // Implicit barrier for all tasks created within the group is_running_ = false; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index bda91cf04f..47c460cc7b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1000,6 +1000,8 @@ void strong_branching(const lp_problem_t& original_lp, basis_update_mpf_t& basis_factors, pseudo_costs_t& pc) { + raft::common::nvtx::range scope("BB::strong_branching"); + pc.resize(original_lp.num_cols); pc.strong_branch_down.assign(fractional.size(), 0); pc.strong_branch_up.assign(fractional.size(), 0); @@ -1304,6 +1306,8 @@ i_t pseudo_costs_t::variable_selection(const std::vector& fractio 
const std::vector& solution, logger_t& log) { + raft::common::nvtx::range scope("BB::pseudocost_branching"); + i_t branch_var = fractional[0]; f_t max_score = -1; i_t num_initialized_down; @@ -1350,6 +1354,8 @@ i_t pseudo_costs_t::reliable_variable_selection( const std::vector& new_slacks, const lp_problem_t& original_lp) { + raft::common::nvtx::range scope("BB::reliability_branching"); + constexpr f_t eps = 1e-6; f_t start_time = bnb_stats.start_time; i_t branch_var = fractional[0]; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index c6a1bba7e7..6ddc645e4e 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -61,6 +61,8 @@ void rins_t::node_callback(const std::vector& solution, f_t objec if (population_ready) { lp_optimal_solution = solution; + + CUOPT_LOG_INFO("Launching RINS task"); #pragma omp task default(none) run_rins(); } else { @@ -82,9 +84,7 @@ template void rins_t::run_rins() { raft::common::nvtx::range fun_scope("Running RINS"); - - if (total_calls == 0) RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); - + RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); // Do not make assertions based on problem_ptr. 
The original problem may have been modified within @@ -226,6 +226,7 @@ void rins_t::run_rins() true); fj_cpu->log_prefix = "[RINS] "; + CUOPT_LOG_INFO("Launching CPUFJ (RINS) task"); #pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none) cpufj_solve(fj_cpu.get(), time_limit); diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 15ff3f4eb7..86d91c0539 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -43,6 +43,7 @@ void early_cpufj_t::start() const std::vector& assignment, double) { this->try_update_best(solver_obj, assignment); }; + CUOPT_LOG_INFO("Launching early CPUFJ task"); #pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none) cpufj_solve(fj_cpu_.get()); } @@ -56,6 +57,7 @@ void early_cpufj_t::stop() fj_cpu_->halted = true; #pragma omp taskwait depend(in : *fj_cpu_) + CUOPT_LOG_INFO("Early CPUFJ task was stopped"); CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", fj_cpu_ ? 
fj_cpu_->iterations : 0, diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index aea6295528..ee8ff9a53c 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -1421,7 +1421,7 @@ void cpufj_solve(fj_cpu_climber_t* fj_cpu, f_t in_time_limit) { i_t local_mins = 0; auto loop_start = std::chrono::high_resolution_clock::now(); - auto time_limit = std::chrono::milliseconds((int)(in_time_limit * 1000)); + auto time_limit = std::chrono::milliseconds(std::floor(in_time_limit * 1000.0)); auto loop_time_start = std::chrono::high_resolution_clock::now(); // Initialize feature tracking diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index 9ea7743e5e..d5160862e5 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -90,6 +90,8 @@ void local_search_t::start_cpufj_scratch_threads(population_t::start_cpufj_lptopt_scratch_threads( // default weights cudaDeviceSynchronize(); + CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task"); + #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ depend(out : *scratch_cpu_fj_on_lp_opt) cpufj_solve(scratch_cpu_fj_on_lp_opt.get()); @@ -144,6 +148,8 @@ void local_search_t::stop_cpufj_scratch_threads() if (scratch_cpu_fj_on_lp_opt) { scratch_cpu_fj_on_lp_opt->halted = true; #pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt) + + CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped"); } } @@ -181,6 +187,7 @@ void local_search_t::start_cpufj_deterministic( bb.queue_external_solution_deterministic(h_vec, work_units); }; + CUOPT_LOG_INFO("Launching deterministic CPUFJ task"); #pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj) cpufj_solve(deterministic_cpu_fj.get()); @@ -199,6 +206,7 @@ void 
local_search_t::stop_cpufj_deterministic() deterministic_cpu_fj->halted = true; #pragma omp taskwait depend(in : *deterministic_cpu_fj) + CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped"); } } @@ -246,9 +254,13 @@ bool local_search_t::do_fj_solve(solution_t& solution, // Start CPU solver in background thread #pragma omp taskgroup { + if (ls_cpu_fj.size() > 0) { + CUOPT_LOG_INFO("Launching %d CPUFJ tasks", ls_cpu_fj.size()); + #pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup - for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { - cpufj_solve(ls_cpu_fj[i].get()); + for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { + cpufj_solve(ls_cpu_fj[i].get()); + } } // Run GPU solver @@ -260,6 +272,8 @@ bool local_search_t::do_fj_solve(solution_t& solution, } } + CUOPT_LOG_INFO("All CPUFJ tasks were stopped"); + solution_t solution_cpu(*solution.problem_ptr); f_t best_cpu_obj = std::numeric_limits::max(); diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 74e04c89bb..846e22358a 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -246,12 +246,15 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( std::vector constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1}); std::unordered_set cnstr_pair; -#pragma omp taskloop private(cnstr_pair) default(none) \ - shared(offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h) - for (int cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { - for (int jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { + i_t num_tasks = omp_get_num_threads() - 4; + + CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks); +#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(none) \ + shared(problem, offsets, variables, 
reverse_offsets, reverse_constraints, constraint_pairs_h) + for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { + for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { int var = variables[jj]; - for (int kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) { + for (i_t kk = reverse_offsets[var]; kk < reverse_offsets[var + 1]; ++kk) { if (reverse_constraints[kk] != cnstr) { cnstr_pair.insert(reverse_constraints[kk]); } if (cnstr_pair.size() == max_pair_per_row) { break; } } diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index cfa177eaf5..501e940a71 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -890,6 +890,8 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, // are visible before any per-thread kernel can reference that memory. problem.handle_ptr->sync_stream(); + CUOPT_LOG_INFO("Running probing cache with %d tasks", num_tasks); + // Main parallel loop for (size_t step_start = 0; step_start < priority_indices.size(); step_start += step_size) { if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index ab6f324175..6bf6ae9ce7 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -577,23 +577,26 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, if (settings_const.num_cpu_threads < 0) { num_threads = omp_get_max_threads(); } else { - if (settings_const.num_cpu_threads < 4) { - CUOPT_LOG_ERROR( - "The MIP solver requires at least 4 CPU threads! 
Setting the number of threads to 4."); - } + num_threads = settings_const.num_cpu_threads; + } - num_threads = std::max(4, settings_const.num_cpu_threads); + if (num_threads < 4) { + CUOPT_LOG_ERROR("The MIP solver requires at least 4 CPU threads!"); + return mip_solution_t{ + cuopt::logic_error("The number of CPU threads is below than expected.", + cuopt::error_type_t::RuntimeError), + op_problem.get_handle_ptr()->get_stream()}; } // TODO: Remove this after converting deterministic B&B to use tasks. This allows // creating a nested parallel region. omp_set_max_active_levels(2); - // mip_solution_t sol(mip_termination_status_t::NoTermination, solver_stats_t{}, op_problem.get_handle_ptr()->get_stream()); + // Creates the OpenMP thread pool. It will be shared across the entire MIP solver. #pragma omp parallel num_threads(num_threads) default(none) \ shared(sol, op_problem, settings_const, exception) { @@ -607,7 +610,7 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, exception = std::current_exception(); } } - } + } // Implicit barrier if (exception) { std::rethrow_exception(exception); } return sol; diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 694fb7fa8d..1415c68922 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -462,7 +462,7 @@ solution_t mip_solver_t::run_solver() // Start the primal heuristics context.diversity_manager_ptr = &dm; sol = dm.run_solver(); - } + } // implicit barrier for all tasks created within the taskgroup if (!context.settings.heuristics_only) { if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { From 50fa3d6129c27d00dc6b1f217162eb12d973e451 Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 12:07:01 +0200 Subject: [PATCH 05/14] fixed incorrect starting bounds for probing cache. added depend clause for root relaxation. Signed-off-by: Nicolas L. 
Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 12 ++++++------ cpp/src/mip_heuristics/presolve/probing_cache.cu | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 367dfbb416..6c6ddad83f 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1887,7 +1887,7 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( basic_list, \ nonbasic_list, \ root_vstatus_, \ - edge_norms_) default(none) + edge_norms_) default(none) depend(out : root_status) { root_status = solve_linear_program_with_advanced_basis(original_lp_, exploration_stats_.start_time, @@ -1942,9 +1942,9 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Check if crossover was stopped by dual simplex if (crossover_status == crossover_status_t::OPTIMAL) { - set_root_concurrent_halt(1); // Stop dual simplex -#pragma omp taskwait // Wait for dual simplex to finish - set_root_concurrent_halt(0); // Clear the concurrent halt flag + set_root_concurrent_halt(1); // Stop dual simplex +#pragma omp taskwait depend(in : root_status) // Wait for dual simplex to finish + set_root_concurrent_halt(0); // Clear the concurrent halt flag // Override the root relaxation solution with the crossover solution root_relax_soln = root_crossover_soln_; root_vstatus = crossover_vstatus_; @@ -1994,14 +1994,14 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name = method_to_string(root_relax_solved_by); } else { -#pragma omp taskwait +#pragma omp taskwait depend(in : root_status) user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; solver_name = "Dual Simplex"; } } else { -#pragma omp taskwait +#pragma omp taskwait depend(in : root_status) user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = 
DualSimplex; diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 501e940a71..25562e20a9 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -912,9 +912,9 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, modification_vector_pool, \ substitution_vector_pool) for (size_t task_id = 0; task_id < num_tasks; ++task_id) { - size_t n = step_end - step_start; - size_t begin = std::floor(static_cast(n) * task_id / num_tasks); - size_t end = std::floor(static_cast(n) * (task_id + 1) / num_tasks); + size_t n = step_end - step_start; + size_t begin = step_start + std::floor(static_cast(n) * task_id / num_tasks); + size_t end = std::floor(static_cast(n) * (task_id + 1) / num_tasks); auto& multi_probe_presolve = multi_probe_presolve_pool[task_id]; auto& modification_vector = modification_vector_pool[task_id]; auto& substitution_vector = substitution_vector_pool[task_id]; @@ -924,7 +924,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, auto var_idx = priority_indices[i]; if (timer.check_time_limit()) { continue; } - CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, id); + CUOPT_LOG_TRACE("Computing probing cache for var %d on thread %d", var_idx, task_id); compute_cache_for_var(var_idx, bound_presolve, problem, From 180fbd908f82e6f4e15a9b833d32d39b02b3021f Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 12:15:07 +0200 Subject: [PATCH 06/14] reduced the verbosity of the shared clause Signed-off-by: Nicolas L. 
Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 9 +-- cpp/src/branch_and_bound/pseudo_costs.cpp | 65 ++----------------- .../conditional_bound_strengthening.cu | 3 +- .../mip_heuristics/presolve/probing_cache.cu | 15 +---- cpp/src/mip_heuristics/solver.cu | 3 +- 5 files changed, 8 insertions(+), 87 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 6c6ddad83f..b2a170b9d1 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1880,14 +1880,7 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( // Note that we need to explicitly declared `root_status` as a shared variable here since // it is local to the thread that are executing the enclosing task. -#pragma omp task shared(root_status, \ - original_lp_, \ - lp_settings, \ - basis_update, \ - basic_list, \ - nonbasic_list, \ - root_vstatus_, \ - edge_norms_) default(none) depend(out : root_status) +#pragma omp task default(shared) depend(out : root_status) { root_status = solve_linear_program_with_advanced_basis(original_lp_, exploration_stats_.start_time, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 47c460cc7b..8d56e3a068 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1054,17 +1054,7 @@ void strong_branching(const lp_problem_t& original_lp, pc); } else { if (effective_batch_pdlp != 0) { -#pragma omp task shared(settings, \ - concurrent_halt, \ - original_lp, \ - new_slacks, \ - root_solution, \ - fractional, \ - pc, \ - sb_view, \ - pdlp_obj_down, \ - pdlp_obj_up) \ - firstprivate(effective_batch_pdlp, start_time, root_obj) default(none) +#pragma omp task default(shared) batch_pdlp_strong_branching_task(settings, effective_batch_pdlp, start_time, @@ -1084,20 +1074,7 @@ void strong_branching(const lp_problem_t& original_lp, i_t n = std::min(4 * 
settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) default(none) shared(original_lp, \ - settings, \ - var_types, \ - fractional, \ - root_solution, \ - root_vstatus, \ - edge_norms, \ - pc, \ - dual_simplex_obj_down, \ - dual_simplex_obj_up, \ - dual_simplex_status_down, \ - dual_simplex_status_up, \ - sb_view) \ - firstprivate(start_time, root_obj, upper_bound, simplex_iteration_limit, n) +#pragma omp taskloop num_tasks(n) default(shared) for (i_t k = 0; k < n; k++) { i_t start = std::floor(k * fractional.size() / n); i_t end = std::floor((k + 1) * fractional.size() / n); @@ -1568,18 +1545,7 @@ i_t pseudo_costs_t::reliable_variable_selection( std::atomic concurrent_halt{0}; if (use_pdlp) { -#pragma omp task default(none) shared(log, \ - concurrent_halt, \ - original_lp, \ - new_slacks, \ - leaf_solution, \ - worker, \ - candidate_vars, \ - settings, \ - sb_view, \ - pdlp_obj_down, \ - pdlp_obj_up) \ - firstprivate(rb_mode, num_candidates, start_time) +#pragma omp task default(shared) batch_pdlp_reliability_branching_task(log, rb_mode, num_candidates, @@ -1614,30 +1580,7 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_start_time = tic(); if (rb_mode != 2) { -#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(none) \ - shared(log, \ - unreliable_list, \ - settings, \ - sb_view, \ - worker, \ - var_types, \ - node_ptr, \ - leaf_solution, \ - dual_simplex_obj_down, \ - dual_simplex_obj_up, \ - dual_simplex_status_down, \ - dual_simplex_status_up, \ - score_mutex, \ - max_score, \ - branch_var) firstprivate(num_candidates, \ - start_time, \ - rb_mode, \ - reliable_threshold, \ - upper_bound, \ - iter_limit_per_trial, \ - eps, \ - pseudo_cost_up_avg, \ - pseudo_cost_down_avg) +#pragma omp taskloop if (num_tasks > 1) priority(task_priority) 
num_tasks(num_tasks) default(shared) for (i_t i = 0; i < num_candidates; ++i) { auto [score, j] = unreliable_list[i]; diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 846e22358a..e6c403aacc 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -249,8 +249,7 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( i_t num_tasks = omp_get_num_threads() - 4; CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks); -#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(none) \ - shared(problem, offsets, variables, reverse_offsets, reverse_constraints, constraint_pairs_h) +#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared) for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { int var = variables[jj]; diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 25562e20a9..43097c6724 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -897,20 +897,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } size_t step_end = std::min(step_start + step_size, priority_indices.size()); -#pragma omp taskloop num_tasks(num_tasks) default(none) firstprivate(step_start, step_end) \ - shared(num_tasks, \ - priority_indices, \ - timer, \ - multi_probe_presolve_pool, \ - bound_presolve, \ - problem, \ - h_var_bounds, \ - h_integer_indices, \ - n_of_implied_singletons, \ - n_of_cached_probings, \ - problem_is_infeasible, \ - modification_vector_pool, \ - substitution_vector_pool) +#pragma omp taskloop num_tasks(num_tasks) 
default(shared) for (size_t task_id = 0; task_id < num_tasks; ++task_id) { size_t n = step_end - step_start; size_t begin = step_start + std::floor(static_cast(n) * task_id / num_tasks); diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 1415c68922..660e8a6f61 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -452,8 +452,7 @@ solution_t mip_solver_t::run_solver() #pragma omp taskgroup { if (!context.settings.heuristics_only) { -#pragma omp task default(none) \ - shared(branch_and_bound, branch_and_bound_solution, branch_and_bound_status) +#pragma omp task default(shared) { branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); } From b4efd6715d72cbab3a4a897c118582163133ec1d Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 12:20:36 +0200 Subject: [PATCH 07/14] added missing offset for the end in probing cache Signed-off-by: Nicolas L. Guidotti --- cpp/src/mip_heuristics/presolve/probing_cache.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 43097c6724..1171e03a53 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -901,7 +901,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, for (size_t task_id = 0; task_id < num_tasks; ++task_id) { size_t n = step_end - step_start; size_t begin = step_start + std::floor(static_cast(n) * task_id / num_tasks); - size_t end = std::floor(static_cast(n) * (task_id + 1) / num_tasks); + size_t end = step_start + std::floor(static_cast(n) * (task_id + 1) / num_tasks); auto& multi_probe_presolve = multi_probe_presolve_pool[task_id]; auto& modification_vector = modification_vector_pool[task_id]; auto& substitution_vector = substitution_vector_pool[task_id]; From 80b14ab2b2227dae3c79cdc09ecd9990552da3ac Mon Sep 17 
00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 13:46:09 +0200 Subject: [PATCH 08/14] fixed missing type conversion Signed-off-by: Nicolas L. Guidotti --- cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu index ee8ff9a53c..ae0057ba5f 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu @@ -1419,9 +1419,9 @@ std::unique_ptr> fj_t::create_cpu_climber( template void cpufj_solve(fj_cpu_climber_t* fj_cpu, f_t in_time_limit) { - i_t local_mins = 0; - auto loop_start = std::chrono::high_resolution_clock::now(); - auto time_limit = std::chrono::milliseconds(std::floor(in_time_limit * 1000.0)); + i_t local_mins = 0; + auto loop_start = std::chrono::high_resolution_clock::now(); + auto time_limit = std::chrono::milliseconds(static_cast(std::floor(in_time_limit * 1000.0))); auto loop_time_start = std::chrono::high_resolution_clock::now(); // Initialize feature tracking From 588b401d4b7b041ba7618435fa91e83800e50e6f Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 14:43:34 +0200 Subject: [PATCH 09/14] converted GPUFJ to omp task Signed-off-by: Nicolas L. 
Guidotti --- .../feasibility_jump/early_cpufj.cu | 7 +++--- .../feasibility_jump/early_gpufj.cu | 23 ++++++++----------- .../feasibility_jump/early_gpufj.cuh | 4 ---- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 86d91c0539..667edf8455 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -57,11 +57,10 @@ void early_cpufj_t::stop() fj_cpu_->halted = true; #pragma omp taskwait depend(in : *fj_cpu_) - CUOPT_LOG_INFO("Early CPUFJ task was stopped"); - CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", - fj_cpu_ ? fj_cpu_->iterations : 0, - this->solution_found_); + CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", + fj_cpu_ ? fj_cpu_->iterations : 0, + this->solution_found_); fj_cpu_.reset(); } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 3f77427d87..a69ff48aa7 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -38,7 +38,7 @@ early_gpufj_t::~early_gpufj_t() template void early_gpufj_t::start() { - if (worker_thread_) { return; } + if (fj_ptr_) { return; } this->start_time_ = std::chrono::steady_clock::now(); @@ -57,29 +57,26 @@ void early_gpufj_t::start() this->try_update_best(solver_obj, h_assignment); }; - worker_thread_ = std::make_unique(&early_gpufj_t::run_worker, this); -} + CUOPT_LOG_INFO("Launching early GPUFJ task"); -template -void early_gpufj_t::run_worker() -{ - RAFT_CUDA_TRY(cudaSetDevice(this->device_id_)); - fj_ptr_->solve(*this->solution_ptr_); +#pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_) + { + RAFT_CUDA_TRY(cudaSetDevice(this->device_id_)); + fj_ptr_->solve(*this->solution_ptr_); + } } template 
void early_gpufj_t::stop() { - if (!worker_thread_) { return; } + if (!fj_ptr_) { return; } context_ptr_->preempt_heuristic_solver_.store(true); +#pragma omp taskwait depend(in : *fj_ptr_) - if (worker_thread_->joinable()) { worker_thread_->join(); } - - CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); + CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); fj_ptr_.reset(); - worker_thread_.reset(); } #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh index 4a7769143e..e5ceaaeb61 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh @@ -10,7 +10,6 @@ #include #include -#include namespace cuopt::linear_programming::detail { @@ -35,11 +34,8 @@ class early_gpufj_t : public early_heuristic_t void stop(); private: - void run_worker(); - std::unique_ptr> context_ptr_; std::unique_ptr> fj_ptr_; - std::unique_ptr worker_thread_; }; } // namespace cuopt::linear_programming::detail From 3c569e61ec190b43a31901210d1ee6522f4e4336 Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 16:03:30 +0200 Subject: [PATCH 10/14] added more comments about the implicit barriers. removed cudaDeviceSync in CPU FJ. use scope_guard in RINS. Signed-off-by: Nicolas L. 
Guidotti --- cpp/src/branch_and_bound/branch_and_bound.cpp | 6 +++--- cpp/src/branch_and_bound/pseudo_costs.cpp | 6 +++--- cpp/src/mip_heuristics/diversity/lns/rins.cu | 13 +++++-------- .../mip_heuristics/feasibility_jump/early_cpufj.cu | 2 +- .../mip_heuristics/feasibility_jump/early_gpufj.cu | 2 +- cpp/src/mip_heuristics/local_search/local_search.cu | 13 ++++++------- .../presolve/conditional_bound_strengthening.cu | 2 +- cpp/src/mip_heuristics/presolve/probing_cache.cu | 2 +- cpp/src/mip_heuristics/solver.cu | 2 +- 9 files changed, 22 insertions(+), 26 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index b2a170b9d1..db8b9910dd 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -1987,14 +1987,14 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( solver_name = method_to_string(root_relax_solved_by); } else { -#pragma omp taskwait depend(in : root_status) +#pragma omp taskwait depend(in : root_status) // Wait for the dual simplex to finish user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; solver_name = "Dual Simplex"; } } else { -#pragma omp taskwait depend(in : root_status) +#pragma omp taskwait depend(in : root_status) // Wait for the dual simplex to finish user_objective = root_relax_soln_.user_objective; iter = root_relax_soln_.iterations; root_relax_solved_by = DualSimplex; @@ -2629,7 +2629,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut } else { single_threaded_solve(); } - } // Implicit barrier for all tasks created within the group + } // Implicit barrier for all tasks created within the group (RINS, B&B workers) is_running_ = false; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 8d56e3a068..91824d5cac 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ 
b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1114,7 +1114,7 @@ void strong_branching(const lp_problem_t& original_lp, } if (effective_batch_pdlp != 0) { -#pragma omp taskwait +#pragma omp taskwait // Wait for the batch PDLP task to finish } } @@ -1567,7 +1567,7 @@ i_t pseudo_costs_t::reliable_variable_selection( log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); -#pragma omp taskwait +#pragma omp taskwait // Wait for the batch PDLP task to finish } return branch_var; } @@ -1706,7 +1706,7 @@ i_t pseudo_costs_t::reliable_variable_selection( //} if (use_pdlp) { -#pragma omp taskwait +#pragma omp taskwait // Wait for the batch PDLP task to finish i_t pdlp_applied = 0; i_t pdlp_optimal = 0; diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index 6ddc645e4e..10430043a4 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -24,6 +24,7 @@ #include #include +#include namespace cuopt::linear_programming::detail { template @@ -84,6 +85,8 @@ template void rins_t::run_rins() { raft::common::nvtx::range fun_scope("Running RINS"); + scope_guard scope_guard([this]() { this->launch_new_task = true; }); + RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); @@ -115,10 +118,7 @@ void rins_t::run_rins() cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch"); cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible"); - if (!best_sol.get_feasible()) { - launch_new_task = true; - return; - } + if (!best_sol.get_feasible()) { return; } i_t sol_size_before_rins = best_sol.assignment.size(); auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream()); @@ -142,7 +142,6 @@ void rins_t::run_rins() // abort if the 
fractional ratio is too low if (fractional_ratio < settings.min_fractional_ratio) { CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting"); - launch_new_task = true; return; } @@ -167,7 +166,6 @@ void rins_t::run_rins() if (n_to_fix == 0) { CUOPT_LOG_DEBUG("RINS no variables to fix"); - launch_new_task = true; return; } @@ -299,7 +297,7 @@ void rins_t::run_rins() static_cast(context.settings.heuristic_params.rins_max_time_limit)); } -#pragma omp taskwait +#pragma omp taskwait // Wait for the CPU FJ (RINS) to finish CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations); if (fj_cpu->feasible_found) { @@ -341,7 +339,6 @@ void rins_t::run_rins() if (improvement_found) total_success++; CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success); - launch_new_task = true; } #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 667edf8455..e198ac20f1 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -56,7 +56,7 @@ void early_cpufj_t::stop() preemption_flag_.store(true); fj_cpu_->halted = true; -#pragma omp taskwait depend(in : *fj_cpu_) +#pragma omp taskwait depend(in : *fj_cpu_) // Wait for the early CPUFJ task to finish CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", fj_cpu_ ? 
fj_cpu_->iterations : 0, diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index a69ff48aa7..96c88daf16 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -72,7 +72,7 @@ void early_gpufj_t::stop() if (!fj_ptr_) { return; } context_ptr_->preempt_heuristic_solver_.store(true); -#pragma omp taskwait depend(in : *fj_ptr_) +#pragma omp taskwait depend(in : *fj_ptr_) // Wait for the early GPU FJ task to finish CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index d5160862e5..7844617f3f 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -127,9 +127,6 @@ void local_search_t::start_cpufj_lptopt_scratch_threads( } }; - // default weights - cudaDeviceSynchronize(); - CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task"); #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ @@ -142,12 +139,13 @@ void local_search_t::stop_cpufj_scratch_threads() { for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) { scratch_cpu_fj[i]->halted = true; -#pragma omp taskwait depend(in : *scratch_cpu_fj[i]) +#pragma omp taskwait depend(in : *scratch_cpu_fj[i]) // Wait for each scratch CPU FJ task to finish } if (scratch_cpu_fj_on_lp_opt) { scratch_cpu_fj_on_lp_opt->halted = true; -#pragma omp taskwait depend(in : *scratch_cpu_fj_on_lp_opt) +#pragma omp taskwait depend( \ + in : *scratch_cpu_fj_on_lp_opt) // Wait for the scratch CPU FJ (LP optimal) task to finish CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped"); } @@ -205,7 +203,8 @@ void local_search_t::stop_cpufj_deterministic() } deterministic_cpu_fj->halted = true; -#pragma omp taskwait depend(in : *deterministic_cpu_fj) +#pragma omp 
taskwait depend( \ + in : *deterministic_cpu_fj) // Wait for deterministic CPU FJ task to finish CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped"); } } @@ -270,7 +269,7 @@ bool local_search_t::do_fj_solve(solution_t& solution, for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { ls_cpu_fj[i]->halted = true; } - } + } // implicit barrier that waits all CPU FJ tasks to finish CUOPT_LOG_INFO("All CPUFJ tasks were stopped"); diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index e6c403aacc..52333b1c14 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -266,7 +266,7 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( constraint_pairs_h[cnstr * max_pair_per_row + counter++] = {cnstr, temp}; } cnstr_pair.clear(); - } + } // implicit barrier that waits for all iterations to finish before proceeding constraint_pairs = cuopt::device_copy(constraint_pairs_h, problem.handle_ptr->get_stream()); diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 1171e03a53..9d3fb65a36 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -926,7 +926,7 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, timer, problem.handle_ptr->get_device()); } - } + } // implicit barrier that waits for all iterations to finish before proceeding // TODO when we have determinism, check current threads work/time counter and filter queue // items that are smaller or equal to that diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 660e8a6f61..0229a8f27f 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -461,7 +461,7 @@ solution_t mip_solver_t::run_solver() // Start the primal 
heuristics context.diversity_manager_ptr = &dm; sol = dm.run_solver(); - } // implicit barrier for all tasks created within the taskgroup + } // implicit barrier for all tasks created in B&B and heuristics if (!context.settings.heuristics_only) { if (branch_and_bound_solution.lower_bound > -std::numeric_limits::infinity()) { From d7d3dad7878b277922b6a05da2ade20d43fca580 Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Wed, 15 Apr 2026 17:50:50 +0200 Subject: [PATCH 11/14] decreased verbosity for logging the task launches Signed-off-by: Nicolas L. Guidotti --- cpp/src/branch_and_bound/pseudo_costs.cpp | 2 +- cpp/src/mip_heuristics/diversity/lns/rins.cu | 2 +- .../mip_heuristics/feasibility_jump/early_cpufj.cu | 8 ++++---- .../mip_heuristics/feasibility_jump/early_gpufj.cu | 4 ++-- .../mip_heuristics/local_search/local_search.cu | 14 +++++++------- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 91824d5cac..a9297de571 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1030,7 +1030,7 @@ void strong_branching(const lp_problem_t& original_lp, shared_strong_branching_context_t shared_ctx(2 * fractional.size()); shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - std::atomic concurrent_halt{0}; + std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index 10430043a4..fd631d573d 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -63,7 +63,7 @@ void rins_t::node_callback(const std::vector& solution, f_t objec if (population_ready) { lp_optimal_solution = solution; - 
CUOPT_LOG_INFO("Launching RINS task"); + CUOPT_LOG_DEBUG("Launching RINS task"); #pragma omp task default(none) run_rins(); } else { diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index e198ac20f1..de905f7592 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -43,7 +43,7 @@ void early_cpufj_t::start() const std::vector& assignment, double) { this->try_update_best(solver_obj, assignment); }; - CUOPT_LOG_INFO("Launching early CPUFJ task"); + CUOPT_LOG_DEBUG("Launching early CPUFJ task"); #pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none) cpufj_solve(fj_cpu_.get()); } @@ -58,9 +58,9 @@ void early_cpufj_t::stop() fj_cpu_->halted = true; #pragma omp taskwait depend(in : *fj_cpu_) // Wait for the early CPUFJ task to finish - CUOPT_LOG_INFO("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", - fj_cpu_ ? fj_cpu_->iterations : 0, - this->solution_found_); + CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d", + fj_cpu_ ? 
fj_cpu_->iterations : 0, + this->solution_found_); fj_cpu_.reset(); } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 96c88daf16..4615fa8dc3 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -57,7 +57,7 @@ void early_gpufj_t::start() this->try_update_best(solver_obj, h_assignment); }; - CUOPT_LOG_INFO("Launching early GPUFJ task"); + CUOPT_LOG_DEBUG("Launching early GPUFJ task"); #pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_) { @@ -74,7 +74,7 @@ void early_gpufj_t::stop() context_ptr_->preempt_heuristic_solver_.store(true); #pragma omp taskwait depend(in : *fj_ptr_) // Wait for the early GPU FJ task to finish - CUOPT_LOG_INFO("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); + CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_); fj_ptr_.reset(); } diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index 7844617f3f..58918148ba 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -90,7 +90,7 @@ void local_search_t::start_cpufj_scratch_threads(population_t::start_cpufj_lptopt_scratch_threads( } }; - CUOPT_LOG_INFO("Launching scratch CPUFJ (on LP optimal) task"); + CUOPT_LOG_DEBUG("Launching scratch CPUFJ (on LP optimal) task"); #pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ depend(out : *scratch_cpu_fj_on_lp_opt) @@ -147,7 +147,7 @@ void local_search_t::stop_cpufj_scratch_threads() #pragma omp taskwait depend( \ in : *scratch_cpu_fj_on_lp_opt) // Wait for the scratch CPU FJ (LP optimal) task to finish - CUOPT_LOG_INFO("All scratch CPUFJ tasks were stopped"); + CUOPT_LOG_DEBUG("All scratch CPUFJ tasks were stopped"); } } @@ -185,7 +185,7 @@ void 
local_search_t::start_cpufj_deterministic( bb.queue_external_solution_deterministic(h_vec, work_units); }; - CUOPT_LOG_INFO("Launching deterministic CPUFJ task"); + CUOPT_LOG_DEBUG("Launching deterministic CPUFJ task"); #pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj) cpufj_solve(deterministic_cpu_fj.get()); @@ -205,7 +205,7 @@ void local_search_t::stop_cpufj_deterministic() deterministic_cpu_fj->halted = true; #pragma omp taskwait depend( \ in : *deterministic_cpu_fj) // Wait for deterministic CPU FJ task to finish - CUOPT_LOG_INFO("Deterministic CPUFJ task was stopped"); + CUOPT_LOG_DEBUG("Deterministic CPUFJ task was stopped"); } } @@ -254,7 +254,7 @@ bool local_search_t::do_fj_solve(solution_t& solution, #pragma omp taskgroup { if (ls_cpu_fj.size() > 0) { - CUOPT_LOG_INFO("Launching %d CPUFJ tasks", ls_cpu_fj.size()); + CUOPT_LOG_DEBUG("Launching %d CPUFJ tasks", ls_cpu_fj.size()); #pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(ls_cpu_fj.size()) nogroup for (size_t i = 0; i < ls_cpu_fj.size(); ++i) { @@ -271,7 +271,7 @@ bool local_search_t::do_fj_solve(solution_t& solution, } } // implicit barrier that waits all CPU FJ tasks to finish - CUOPT_LOG_INFO("All CPUFJ tasks were stopped"); + CUOPT_LOG_DEBUG("All CPUFJ tasks were stopped"); solution_t solution_cpu(*solution.problem_ptr); f_t best_cpu_obj = std::numeric_limits::max(); From edcec044ab69b9872554489d7545483fb2a1ebff Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Thu, 16 Apr 2026 12:09:10 +0200 Subject: [PATCH 12/14] revert scope guard as it was causing the compilation to fail with CUDA 12.9 Signed-off-by: Nicolas L. 
Guidotti --- cpp/src/mip_heuristics/diversity/lns/rins.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index fd631d573d..2e3c616335 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -24,7 +24,6 @@ #include #include -#include namespace cuopt::linear_programming::detail { template @@ -85,8 +84,6 @@ template void rins_t::run_rins() { raft::common::nvtx::range fun_scope("Running RINS"); - scope_guard scope_guard([this]() { this->launch_new_task = true; }); - RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); @@ -118,7 +115,10 @@ void rins_t::run_rins() cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch"); cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible"); - if (!best_sol.get_feasible()) { return; } + if (!best_sol.get_feasible()) { + launch_new_task = true; + return; + } i_t sol_size_before_rins = best_sol.assignment.size(); auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream()); @@ -142,6 +142,7 @@ void rins_t::run_rins() // abort if the fractional ratio is too low if (fractional_ratio < settings.min_fractional_ratio) { CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting"); + launch_new_task = true; return; } @@ -166,6 +167,7 @@ void rins_t::run_rins() if (n_to_fix == 0) { CUOPT_LOG_DEBUG("RINS no variables to fix"); + launch_new_task = true; return; } @@ -339,6 +341,7 @@ void rins_t::run_rins() if (improvement_found) total_success++; CUOPT_LOG_DEBUG("RINS calls/successes %d/%d", total_calls, total_success); + launch_new_task = true; } #if MIP_INSTANTIATE_FLOAT From ff75e043c71c27d37e93fd86982be234214fbb7e Mon Sep 17 
00:00:00 2001 From: "Nicolas L. Guidotti" Date: Thu, 16 Apr 2026 12:15:16 +0200 Subject: [PATCH 13/14] trying again to re-enable scope_guard Signed-off-by: Nicolas L. Guidotti --- cpp/src/mip_heuristics/diversity/lns/rins.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index 2e3c616335..d9f130720d 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -24,6 +24,7 @@ #include #include +#include namespace cuopt::linear_programming::detail { template @@ -84,6 +85,8 @@ template void rins_t::run_rins() { raft::common::nvtx::range fun_scope("Running RINS"); + scope_guard guard([this]() { this->launch_new_task = true; }); + RAFT_CUDA_TRY(cudaSetDevice(context.handle_ptr->get_device())); cuopt_assert(lp_optimal_solution.size() == problem_copy->n_variables, "Assignment size mismatch"); cuopt_assert(problem_copy->handle_ptr == &rins_handle, "Handle mismatch"); @@ -115,10 +118,7 @@ void rins_t::run_rins() cuopt_assert(best_sol.handle_ptr == &rins_handle, "Handle mismatch"); cuopt_assert(best_sol.get_feasible(), "Best solution is not feasible"); - if (!best_sol.get_feasible()) { - launch_new_task = true; - return; - } + if (!best_sol.get_feasible()) { return; } i_t sol_size_before_rins = best_sol.assignment.size(); auto lp_opt_device = cuopt::device_copy(this->lp_optimal_solution, rins_handle.get_stream()); @@ -142,7 +142,6 @@ void rins_t::run_rins() // abort if the fractional ratio is too low if (fractional_ratio < settings.min_fractional_ratio) { CUOPT_LOG_TRACE("RINS fractional ratio too low, aborting"); - launch_new_task = true; return; } @@ -167,7 +166,6 @@ void rins_t::run_rins() if (n_to_fix == 0) { CUOPT_LOG_DEBUG("RINS no variables to fix"); - launch_new_task = true; return; } @@ -341,7 +339,6 @@ void rins_t::run_rins() if (improvement_found) total_success++; CUOPT_LOG_DEBUG("RINS 
calls/successes %d/%d", total_calls, total_success); - launch_new_task = true; } #if MIP_INSTANTIATE_FLOAT From 11b92ed495fbbbe5656ca0153af23318a2eb25bd Mon Sep 17 00:00:00 2001 From: "Nicolas L. Guidotti" Date: Thu, 16 Apr 2026 12:18:25 +0200 Subject: [PATCH 14/14] missed one debug message Signed-off-by: Nicolas L. Guidotti --- cpp/src/mip_heuristics/diversity/lns/rins.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index d9f130720d..9396d7158a 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -224,7 +224,7 @@ void rins_t::run_rins() true); fj_cpu->log_prefix = "[RINS] "; - CUOPT_LOG_INFO("Launching CPUFJ (RINS) task"); + CUOPT_LOG_DEBUG("Launching CPUFJ (RINS) task"); #pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none) cpufj_solve(fj_cpu.get(), time_limit); @@ -298,6 +298,7 @@ void rins_t::run_rins() } #pragma omp taskwait // Wait for the CPU FJ (RINS) to finish + CUOPT_LOG_DEBUG("CPUFJ (RINS) task was stopped"); CUOPT_LOG_DEBUG("RINS FJ ran for %d iterations", fj_cpu->iterations); if (fj_cpu->feasible_found) {