From 41de3895b70181b84867335f4f0bedaaa82c62ab Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Tue, 7 Apr 2026 12:57:25 -0500 Subject: [PATCH 01/16] Debug for latency reductions --- cpp/src/barrier/barrier.cu | 24 ++++++++ cpp/src/dual_simplex/basis_updates.cpp | 17 +++++- cpp/src/dual_simplex/crossover.cpp | 10 ++-- cpp/src/dual_simplex/phase2.cpp | 69 ++++++++++++++++++++++- cpp/src/dual_simplex/right_looking_lu.cpp | 2 +- cpp/src/dual_simplex/solve.cpp | 64 ++++++++++++++++++--- 6 files changed, 169 insertions(+), 17 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 76ed1927b1..d56a769ec5 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -1094,6 +1094,10 @@ class iteration_data_t { std::sort(column_nz_permutation.begin(), column_nz_permutation.end(), [&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; }); + printf("[Barrier fdc] t=%.3f halt=%d after column_nz sort\n", + toc(start_column_density), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // We then compute the exact sparsity pattern for columns of A whose where // the number of nonzeros is less than a threshold. This part can be done @@ -1124,6 +1128,10 @@ class iteration_data_t { // The best way to do that is to have A stored in CSR format. csr_matrix_t A_row(0, 0, 0); A.to_compressed_row(A_row); + printf("[Barrier fdc] t=%.3f halt=%d after to_compressed_row\n", + toc(start_column_density), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } std::vector histogram(m + 1, 0); for (i_t j = 0; j < n; j++) { @@ -1253,6 +1261,10 @@ class iteration_data_t { std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) { return delta_nz[i] < delta_nz[j]; }); + printf("[Barrier fdc] t=%.3f halt=%d after delta_nz sort\n", + toc(start_column_density), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // Now we make a forward pass and compute the number of nonzeros in C // assuming we had included column j @@ -2298,6 +2310,12 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tfactorize(data.device_augmented); #ifdef CHOLESKY_DEBUG_CHECK @@ -2306,6 +2324,12 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_tfactorize(data.device_ADAT); } diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp index 9c56ada50e..fdf8acf07d 100644 --- a/cpp/src/dual_simplex/basis_updates.cpp +++ b/cpp/src/dual_simplex/basis_updates.cpp @@ -2431,7 +2431,22 @@ int basis_update_mpf_t::refactor_basis( assert(q.size() == A.m); reorder_basic_list(q, basic_list); // We no longer need q after reordering the basic list work_estimate_ += 3 * q.size(); - reset(); + + // Check halt before the transpose operations: these can take hundreds of ms + // on large problems (L0 and U0 each have O(fill-in) nonzeros) and have no + // internal halt checks. Catching the flag here avoids the dead zone. + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return CONCURRENT_HALT_RETURN; + } + // Inline reset() so we can check halt between the two transposes. + clear(); + L0_.transpose(L0_transpose_); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return CONCURRENT_HALT_RETURN; + } + U0_.transpose(U0_transpose_); + work_estimate_ += 6 * L0_.col_start[L0_.n] + 6 * U0_.col_start[U0_.n]; + reset_stats(); return 0; } diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp index f55ee0837d..acfdd2abc0 100644 --- a/cpp/src/dual_simplex/crossover.cpp +++ b/cpp/src/dual_simplex/crossover.cpp @@ -612,7 +612,7 @@ i_t dual_push(const lp_problem_t& lp, return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted inside crossover dual push loop\n"); return CONCURRENT_HALT_RETURN; } } @@ -989,7 +989,7 @@ i_t primal_push(const lp_problem_t& lp, return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted inside crossover primal push loop\n"); return CONCURRENT_HALT_RETURN; } } @@ -1353,7 +1353,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted in crossover after basis reorder (before FTran/BTran)\n"); return crossover_status_t::CONCURRENT_LIMIT; } @@ -1415,7 +1415,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted in crossover after dual_phase2 refinement\n"); return crossover_status_t::CONCURRENT_LIMIT; } primal_infeas = primal_infeasibility(lp, settings, vstatus, solution.x); @@ -1577,7 +1577,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted in crossover during iterative dual_phase2 refinement\n"); return crossover_status_t::CONCURRENT_LIMIT; } solution.iterations += iter; diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index a6a1b80b6f..8c6072f3e6 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -23,6 +23,9 @@ #include +#include +#include + // #define PHASE2_NVTX_RANGES #ifdef PHASE2_NVTX_RANGES @@ -2489,22 +2492,29 @@ dual::status_t dual_phase2(i_t phase, std::vector basic_list(m); std::vector nonbasic_list; std::vector superbasic_list; - basis_update_mpf_t ft(m, settings.refactor_frequency); + auto ft = std::make_unique>(m, settings.refactor_frequency); const bool initialize_basis = true; - return dual_phase2_with_advanced_basis(phase, + dual::status_t result = dual_phase2_with_advanced_basis(phase, slack_basis, initialize_basis, start_time, lp, settings, vstatus, - ft, + *ft, basic_list, nonbasic_list, sol, iter, delta_y_steepest_edge, work_unit_context); + if (result == dual::status_t::CONCURRENT_LIMIT) { + std::thread([bl = std::move(basic_list), + nl = std::move(nonbasic_list), + sl = std::move(superbasic_list), + f = std::move(ft)]() {}).detach(); + } + return result; } template @@ -2580,6 +2590,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, if (refactor_status > 0) { return dual::status_t::NUMERICAL; } if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } + printf("[DS ph%d init] t=%.3f halt=%d after refactor_basis\n", + phase, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); } // Populate c_basic after basis is initialized @@ -2648,6 +2662,10 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2_work_estimate); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } + printf("[DS ph%d init] t=%.3f halt=%d after compute_primal_variables\n", + phase, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2(x)); } #ifdef COMPUTE_PRIMAL_RESIDUAL @@ -2688,6 +2706,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, vector_norm2(delta_y_steepest_edge)); } + printf("[DS ph%d init] t=%.3f halt=%d after SE_norms\n", + phase, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } + if (phase == 2) { settings.log.printf(" Iter Objective Num Inf. Sum Inf. Perturb Time\n"); } @@ -2730,14 +2756,31 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, infeasibility_indices, primal_infeasibility); phase2_work_estimate += 4 * m + 2 * n; + printf("[DS ph%d init] t=%.3f halt=%d after compute_primal_infeas infeas_nz=%d\n", + phase, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0, + (int)infeasibility_indices.size()); #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); #endif + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } + csc_matrix_t A_transpose(1, 1, 0); lp.A.transpose(A_transpose); phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; + printf("[DS ph%d init] t=%.3f halt=%d after A_transpose\n", + phase, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } f_t obj = compute_objective(lp, x); phase2_work_estimate += 2 * n; @@ -2784,6 +2827,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, while (iter < iter_limit) { PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop"); + if (iter == 0 || (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1)) { + printf("[DS ph%d iter %d] loop top t=%.3f halt=%d infeas_nz=%d\n", + phase, + iter, + toc(start_time), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0, + (int)infeasibility_indices.size()); + } // Pricing i_t direction = 0; @@ -2908,6 +2959,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2::compute_delta_y(ft, basic_leaving_index, direction, delta_y_sparse, UTsol_sparse); } timers.btran_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } const f_t steepest_edge_norm_check = delta_y_sparse.norm2_squared(); phase2_work_estimate += 2 * delta_y_sparse.i.size(); @@ -2966,6 +3020,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } } timers.delta_z_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } #ifdef COMPUTE_DUAL_RESIDUAL std::vector dual_residual; @@ -3301,6 +3358,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, } timers.ftran_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } #ifdef CHECK_PRIMAL_STEP std::vector residual(m); @@ -3331,6 +3391,9 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, #endif assert(steepest_edge_status == 0); timers.se_norms_time += timers.stop_timer(); + if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + return dual::status_t::CONCURRENT_LIMIT; + } timers.start_timer(); // x <- x + delta_x diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index 37202000f8..ef3c8d90f6 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -1258,7 +1258,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, } if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Concurrent halt\n"); + settings.log.printf("Dual simplex halted inside right-looking LU factorization (fill-in loop)\n"); return CONCURRENT_HALT_RETURN; } } diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index b7c619f246..f31a6b9e74 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -30,8 +30,10 @@ #include #include +#include #include #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -119,17 +121,23 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original assert(m <= n); std::vector basic_list(m); std::vector nonbasic_list; - basis_update_mpf_t ft(m, settings.refactor_frequency); - return solve_linear_program_with_advanced_basis(original_lp, + auto ft = std::make_unique>(m, settings.refactor_frequency); + lp_status_t result = solve_linear_program_with_advanced_basis(original_lp, start_time, settings, original_solution, - ft, + *ft, basic_list, nonbasic_list, vstatus, edge_norms, work_unit_context); + if (result == lp_status_t::CONCURRENT_LIMIT) { + std::thread([bl = std::move(basic_list), + nl = std::move(nonbasic_list), + f = std::move(ft)]() {}).detach(); + } + return result; } template @@ -222,7 +230,16 @@ lp_status_t solve_linear_program_with_advanced_basis( if (phase1_status == dual::status_t::TIME_LIMIT) { return lp_status_t::TIME_LIMIT; } if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; } if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } - if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } + if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { + std::thread([plp = std::move(presolved_lp), + pi = std::move(presolve_info), + lpp = std::move(lp), + cs = std::move(column_scales), + p1 = std::move(phase1_problem), + p1v = std::move(phase1_vstatus), + p1s = std::move(phase1_solution)]() {}).detach(); + return lp_status_t::CONCURRENT_LIMIT; + } phase1_obj = phase1_solution.objective; if (phase1_obj > -settings.primal_tol) { settings.log.printf("Dual feasible solution found.\n"); @@ -309,7 +326,18 @@ lp_status_t solve_linear_program_with_advanced_basis( if (status == dual::status_t::TIME_LIMIT) { lp_status = lp_status_t::TIME_LIMIT; } if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; } if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; } - if (status == dual::status_t::CONCURRENT_LIMIT) { lp_status = lp_status_t::CONCURRENT_LIMIT; } + if (status == dual::status_t::CONCURRENT_LIMIT) { + original_solution.iterations = iter; + std::thread([sol = std::move(solution), + plp = std::move(presolved_lp), + pi = std::move(presolve_info), + lpp = std::move(lp), + cs = std::move(column_scales), + p1 = std::move(phase1_problem), + p1v = std::move(phase1_vstatus), + p1s = std::move(phase1_solution)]() {}).detach(); + return lp_status_t::CONCURRENT_LIMIT; + } if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; } if (status == dual::status_t::CUTOFF) { lp_status = lp_status_t::CUTOFF; } original_solution.iterations = iter; @@ -383,10 +411,23 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us } } - barrier_solver_t barrier_solver(barrier_lp, presolve_info, barrier_settings); + auto barrier_solver = std::make_unique>(barrier_lp, presolve_info, barrier_settings); barrier_solver_settings_t barrier_solver_settings; lp_status_t barrier_status = - barrier_solver.solve(start_time, barrier_solver_settings, barrier_solution); + barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); + if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { + std::thread([s = std::move(barrier_solver), + b = std::move(barrier_lp), + p = std::move(presolved_lp), + o = std::move(original_lp), + bs = std::move(barrier_solution), + ls = std::move(lp_solution), + pi = std::move(presolve_info), + cs = std::move(column_scales), + ns = std::move(new_slacks), + di = std::move(dualize_info)]() {}).detach(); + return lp_status_t::CONCURRENT_LIMIT; + } if (barrier_status == lp_status_t::OPTIMAL) { #ifdef COMPUTE_SCALED_RESIDUALS std::vector scaled_residual = barrier_lp.rhs; @@ -681,6 +722,15 @@ lp_status_t solve_linear_program(const user_problem_t& user_problem, std::vector edge_norms; lp_status_t status = solve_linear_program_advanced( original_lp, start_time, settings, lp_solution, vstatus, edge_norms); + if (status == lp_status_t::CONCURRENT_LIMIT) { + std::thread([lp = std::move(original_lp), + ls = std::move(lp_solution), + vs = std::move(vstatus), + en = std::move(edge_norms), + ns = std::move(new_slacks), + di = std::move(dualize_info)]() {}).detach(); + return lp_status_t::CONCURRENT_LIMIT; + } uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); uncrush_dual_solution( user_problem, original_lp, lp_solution.y, lp_solution.z, solution.y, solution.z); From b6083f3ffb96d50821d9940428566cf413769101 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Wed, 8 Apr 2026 01:57:24 -0500 Subject: [PATCH 02/16] Additional logging for analysis --- cpp/src/barrier/barrier.cu | 54 ++++++++++++++++-- cpp/src/barrier/sparse_cholesky.cuh | 3 + cpp/src/barrier/sparse_matrix_kernels.cuh | 29 +++++++++- cpp/src/dual_simplex/presolve.cpp | 68 +++++++++++++++++++++++ cpp/src/dual_simplex/solve.cpp | 39 ++++++++++++- 5 files changed, 184 insertions(+), 9 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index d56a769ec5..6f2d0471d6 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -219,6 +219,9 @@ class iteration_data_t { symbolic_status(0) { raft::common::nvtx::range fun_scope("Barrier: LP Data Creation"); + f_t constructor_start = tic(); + printf("Barrier ctor: enter : %.2fs\n", toc(constructor_start)); + fflush(stdout); bool has_Q = Q.x.size() > 0; indefinite_Q = false; @@ -288,9 +291,16 @@ class iteration_data_t { std::vector dense_columns_unordered; f_t start_column_density = tic(); + printf("Barrier ctor: find_dense begin: %.2fs\n", toc(constructor_start)); + fflush(stdout); // Ignore Q matrix for now find_dense_columns( lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT); + printf("Barrier ctor: find_dense end : %.2fs elapsed %.2fs halt=%d\n", + toc(constructor_start), + toc(start_column_density), + settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); + fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } #ifdef PRINT_INFO for (i_t j : dense_columns_unordered) { @@ -322,10 +332,11 @@ class iteration_data_t { } if (use_augmented) { - settings.log.printf("Linear system : augmented\n"); + printf("Linear system : augmented\n"); } else { - settings.log.printf("Linear system : ADAT\n"); + printf("Linear system : ADAT\n"); } + fflush(stdout); // D = I + EET diag.set_scalar(1.0); @@ -400,22 +411,43 @@ class iteration_data_t { if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows; + printf("Barrier ctor: chol create begin: %.2fs\n", toc(constructor_start)); + fflush(stdout); chol = std::make_unique>(handle_ptr, settings, factorization_size); + printf("Barrier ctor: chol create end : %.2fs\n", toc(constructor_start)); + fflush(stdout); chol->set_positive_definite(false); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // Perform symbolic analysis symbolic_status = 0; if (use_augmented) { // Build the sparsity pattern of the augmented system + printf("Barrier ctor: form_aug begin : %.2fs\n", toc(constructor_start)); + fflush(stdout); form_augmented(true); + printf("Barrier ctor: form_aug end : %.2fs\n", toc(constructor_start)); + fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } + printf("Barrier ctor: analyze begin : %.2fs\n", toc(constructor_start)); + fflush(stdout); symbolic_status = chol->analyze(device_augmented); } else { + printf("Barrier ctor: form_adat begin: %.2fs\n", toc(constructor_start)); + fflush(stdout); form_adat(true); + printf("Barrier ctor: form_adat end : %.2fs\n", toc(constructor_start)); + fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } + printf("Barrier ctor: analyze begin : %.2fs\n", toc(constructor_start)); + fflush(stdout); symbolic_status = chol->analyze(device_ADAT); } + printf("Barrier ctor: analyze end : %.2fs status %d\n", + toc(constructor_start), + symbolic_status); + printf("Barrier ctor: exit : %.2fs\n", toc(constructor_start)); + fflush(stdout); } void form_augmented(bool first_call = false) @@ -546,6 +578,7 @@ class iteration_data_t { handle_ptr->sync_stream(); raft::common::nvtx::range fun_scope("Barrier: Form ADAT"); float64_t start_form_adat = tic(); + float64_t start_value_update = tic(); const i_t m = AD.m; raft::copy(device_AD.x.data(), @@ -583,11 +616,12 @@ class iteration_data_t { span_x[i] *= span_scale[span_col_ind[i]]; }); RAFT_CHECK_CUDA(stream_view_); + float64_t value_update_time = toc(start_value_update); if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } if (first_call) { try { initialize_cusparse_data( - handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); + handle_ptr, device_A, device_AD, device_ADAT, cusparse_info, settings_); } catch (const raft::cuda_error& e) { settings_.log.printf("Error in initialize_cusparse_data: %s\n", e.what()); return; @@ -595,14 +629,18 @@ class iteration_data_t { } if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } - multiply_kernels(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); + multiply_kernels( + handle_ptr, device_A, device_AD, device_ADAT, cusparse_info, settings_); handle_ptr->sync_stream(); auto adat_nnz = device_ADAT.row_start.element(device_ADAT.m, handle_ptr->get_stream()); float64_t adat_time = toc(start_form_adat); + printf("ADAT value update time : %.2fs\n", value_update_time); + printf("ADAT total time : %.2fs\n", adat_time); + fflush(stdout); + if (num_factorizations == 0) { - settings_.log.printf("ADAT time : %.2fs\n", adat_time); settings_.log.printf("ADAT nonzeros : %.2e\n", static_cast(adat_nnz)); settings_.log.printf( @@ -3592,6 +3630,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, compute_affine_rhs(data); f_t max_affine_residual = 0.0; + f_t affine_search_direction_start = tic(); i_t status = gpu_compute_search_direction( data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { @@ -3600,6 +3639,8 @@ lp_status_t barrier_solver_t::solve(f_t start_time, } // Sync to make sure all the async copies to host done inside are finished RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + printf("Barrier iter %d affine dir : %.2fs\n", iter, toc(affine_search_direction_start)); + fflush(stdout); if (status < 0) { return check_for_suboptimal_solution(options, @@ -3631,6 +3672,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t max_corrector_residual = 0.0; + f_t corrector_search_direction_start = tic(); status = gpu_compute_search_direction( data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { @@ -3639,6 +3681,8 @@ lp_status_t barrier_solver_t::solve(f_t start_time, } // Sync to make sure all the async copies to host done inside are finished RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + printf("Barrier iter %d corrector dir : %.2fs\n", iter, toc(corrector_search_direction_start)); + fflush(stdout); if (status < 0) { return check_for_suboptimal_solution(options, data, diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh index f7938fb989..4845a7501d 100644 --- a/cpp/src/barrier/sparse_cholesky.cuh +++ b/cpp/src/barrier/sparse_cholesky.cuh @@ -539,6 +539,9 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { return CONCURRENT_HALT_RETURN; } + printf("cuDSS numeric factor time : %.2fs\n", numeric_time); + fflush(stdout); + int info; size_t sizeWritten = 0; CUDSS_CALL_AND_CHECK( diff --git a/cpp/src/barrier/sparse_matrix_kernels.cuh b/cpp/src/barrier/sparse_matrix_kernels.cuh index 4727c12ec8..6a06f364c5 100644 --- a/cpp/src/barrier/sparse_matrix_kernels.cuh +++ b/cpp/src/barrier/sparse_matrix_kernels.cuh @@ -9,6 +9,7 @@ #include #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -17,8 +18,10 @@ void initialize_cusparse_data(raft::handle_t const* handle, device_csr_matrix_t& A, device_csc_matrix_t& DAT, device_csr_matrix_t& ADAT, - cusparse_info_t& cusparse_data) + cusparse_info_t& cusparse_data, + const simplex_solver_settings_t& settings) { + f_t start_init = tic(); auto A_nnz = A.nz_max; auto DAT_nnz = DAT.nz_max; f_t chunk_fraction = 0.15; @@ -45,6 +48,7 @@ void initialize_cusparse_data(raft::handle_t const* handle, // Buffer size size_t buffer_size; + f_t start_work_estimation = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_workEstimation(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -73,11 +77,13 @@ void initialize_cusparse_data(raft::handle_t const* handle, cusparse_data.spgemm_descr, &buffer_size, cusparse_data.buffer_size.data())); + f_t work_estimation_time = toc(start_work_estimation); int64_t num_prods; RAFT_CUSPARSE_TRY(cusparseSpGEMM_getNumProducts(cusparse_data.spgemm_descr, &num_prods)); size_t buffer_size_3_size; + f_t start_estimate_memory = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_estimateMemory(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -112,6 +118,13 @@ void initialize_cusparse_data(raft::handle_t const* handle, &cusparse_data.buffer_size_2_size)); cusparse_data.buffer_size_3.resize(0, handle->get_stream()); cusparse_data.buffer_size_2.resize(cusparse_data.buffer_size_2_size, handle->get_stream()); + handle->sync_stream(); + + printf("SpGEMM init total : %.2fs\n", toc(start_init)); + printf("SpGEMM workEstimation time : %.2fs\n", work_estimation_time); + printf("SpGEMM estimateMemory time : %.2fs\n", toc(start_estimate_memory)); + printf("SpGEMM estimated products : %.2e\n", static_cast(num_prods)); + fflush(stdout); } template @@ -119,8 +132,10 @@ void multiply_kernels(raft::handle_t const* handle, device_csr_matrix_t& A, device_csc_matrix_t& DAT, device_csr_matrix_t& ADAT, - cusparse_info_t& cusparse_data) + cusparse_info_t& cusparse_data, + const simplex_solver_settings_t& settings) { + f_t start_spgemm_compute = tic(); RAFT_CUSPARSE_TRY( cusparseSpGEMM_compute(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -135,8 +150,10 @@ void multiply_kernels(raft::handle_t const* handle, cusparse_data.spgemm_descr, &cusparse_data.buffer_size_2_size, cusparse_data.buffer_size_2.data())); + f_t spgemm_compute_time = toc(start_spgemm_compute); // get matrix C non-zero entries C_nnz1 + f_t start_materialize = tic(); int64_t ADAT_num_rows, ADAT_num_cols, ADAT_nnz1; RAFT_CUSPARSE_TRY( cusparseSpMatGetSize(cusparse_data.matADAT_descr, &ADAT_num_rows, &ADAT_num_cols, &ADAT_nnz1)); @@ -147,7 +164,9 @@ void multiply_kernels(raft::handle_t const* handle, // update matC with the new pointers RAFT_CUSPARSE_TRY(cusparseCsrSetPointers( cusparse_data.matADAT_descr, ADAT.row_start.data(), ADAT.j.data(), ADAT.x.data())); + f_t spgemm_materialize_time = toc(start_materialize); + f_t start_spgemm_copy = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_copy(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -161,6 +180,12 @@ void multiply_kernels(raft::handle_t const* handle, cusparse_data.spgemm_descr)); handle->sync_stream(); + f_t spgemm_copy_time = toc(start_spgemm_copy); + + printf("SpGEMM compute time : %.2fs\n", spgemm_compute_time); + printf("SpGEMM materialize time : %.2fs\n", spgemm_materialize_time); + printf("SpGEMM copy time : %.2fs\n", spgemm_copy_time); + fflush(stdout); } } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index d2a68d96de..fbdb8ca6b9 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -819,6 +819,11 @@ i_t presolve(const lp_problem_t& original, lp_problem_t& problem, presolve_info_t& presolve_info) { + f_t presolve_start = tic(); + if (settings.barrier_presolve) { + printf("Barrier presolve: enter : %.2fs\n", 0.0); + fflush(stdout); + } problem = original; std::vector row_sense(problem.num_rows, '='); @@ -843,6 +848,11 @@ i_t presolve(const lp_problem_t& original, if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; } } if (settings.barrier_presolve && nonzero_lower_bounds > 0) { + f_t lower_bound_start = tic(); + printf("Barrier presolve: shift lb begin : %.2fs count %d\n", + toc(presolve_start), + static_cast(nonzero_lower_bounds)); + fflush(stdout); settings.log.printf("Transforming %ld nonzero lower bound\n", nonzero_lower_bounds); presolve_info.removed_lower_bounds.resize(problem.num_cols); // We can construct a new variable: x'_j = x_j - l_j or x_j = x'_j + l_j @@ -910,9 +920,14 @@ i_t presolve(const lp_problem_t& original, problem.lower[j] = 0.0; } } + printf("Barrier presolve: shift lb end : %.2fs elapsed %.2fs\n", + toc(presolve_start), + toc(lower_bound_start)); + fflush(stdout); } // Check for empty rows + f_t empty_row_scan_start = tic(); i_t num_empty_rows = 0; { csr_matrix_t Arow(0, 0, 0); @@ -921,22 +936,51 @@ i_t presolve(const lp_problem_t& original, if (Arow.row_start[i + 1] - Arow.row_start[i] == 0) { num_empty_rows++; } } } + if (settings.barrier_presolve) { + printf("Barrier presolve: empty row scan : %.2fs elapsed %.2fs rows %d\n", + toc(presolve_start), + toc(empty_row_scan_start), + static_cast(num_empty_rows)); + fflush(stdout); + } if (num_empty_rows > 0) { + f_t empty_row_remove_start = tic(); settings.log.printf("Presolve removing %d empty rows\n", num_empty_rows); i_t i = remove_empty_rows(problem, row_sense, num_empty_rows, presolve_info); if (i != 0) { return -1; } + if (settings.barrier_presolve) { + printf("Barrier presolve: empty row rm : %.2fs elapsed %.2fs\n", + toc(presolve_start), + toc(empty_row_remove_start)); + fflush(stdout); + } } // Check for empty cols + f_t empty_col_scan_start = tic(); i_t num_empty_cols = 0; { for (i_t j = 0; j < problem.num_cols; ++j) { if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { num_empty_cols++; } } } + if (settings.barrier_presolve) { + printf("Barrier presolve: empty col scan : %.2fs elapsed %.2fs cols %d\n", + toc(presolve_start), + toc(empty_col_scan_start), + static_cast(num_empty_cols)); + fflush(stdout); + } if (num_empty_cols > 0) { + f_t empty_col_remove_start = tic(); settings.log.printf("Presolve attempt to remove %d empty cols\n", num_empty_cols); remove_empty_cols(problem, num_empty_cols, presolve_info); + if (settings.barrier_presolve) { + printf("Barrier presolve: empty col rm : %.2fs elapsed %.2fs\n", + toc(presolve_start), + toc(empty_col_remove_start)); + fflush(stdout); + } } // Check for free variables @@ -948,6 +992,11 @@ i_t presolve(const lp_problem_t& original, problem.Q.check_matrix("Before free variable expansion"); if (settings.barrier_presolve && free_variables > 0) { + f_t free_var_start = tic(); + printf("Barrier presolve: free var begin : %.2fs count %d\n", + toc(presolve_start), + static_cast(free_variables)); + fflush(stdout); // We have a variable x_j: with -inf < x_j < inf // we create new variables v and w with 0 <= v, w and x_j = v - w // Constraints @@ -1092,10 +1141,21 @@ i_t presolve(const lp_problem_t& original, // assert(problem.A.p[num_cols] == nnz); problem.A.n = num_cols; problem.num_cols = num_cols; + printf("Barrier presolve: free var end : %.2fs elapsed %.2fs\n", + toc(presolve_start), + toc(free_var_start)); + fflush(stdout); } if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0) { + f_t folding_start = tic(); + printf("Barrier presolve: folding begin : %.2fs\n", toc(presolve_start)); + fflush(stdout); folding(problem, settings, presolve_info); + printf("Barrier presolve: folding end : %.2fs elapsed %.2fs\n", + toc(presolve_start), + toc(folding_start)); + fflush(stdout); } // Check for dependent rows @@ -1135,6 +1195,14 @@ i_t presolve(const lp_problem_t& original, problem.A.n, problem.A.col_start[problem.A.n]); } + if (settings.barrier_presolve) { + printf("Barrier presolve: exit : %.2fs rows %d cols %d nnz %d\n", + toc(presolve_start), + static_cast(problem.A.m), + static_cast(problem.A.n), + static_cast(problem.A.col_start[problem.A.n])); + fflush(stdout); + } assert(problem.rhs.size() == problem.A.m); return 0; } diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index f31a6b9e74..c5ff4c3d48 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -364,6 +364,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us f_t start_time, lp_solution_t& solution) { + printf("Barrier wrapper: start : %.2fs\n", toc(start_time)); + fflush(stdout); lp_status_t status = lp_status_t::UNSET; lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); @@ -378,8 +380,20 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us // Presolve the linear program presolve_info_t presolve_info; lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); + f_t barrier_presolve_start = tic(); + printf("Barrier wrapper: presolve begin : %.2fs\n", toc(start_time)); + fflush(stdout); const i_t ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); - if (ok == CONCURRENT_HALT_RETURN) { return lp_status_t::CONCURRENT_LIMIT; } + printf("Barrier wrapper: presolve end : %.2fs elapsed %.2fs status %d\n", + toc(start_time), + toc(barrier_presolve_start), + ok); + fflush(stdout); + if (ok == CONCURRENT_HALT_RETURN) { + printf("Barrier wrapper: presolve halted: %.2fs\n", toc(start_time)); + fflush(stdout); + return lp_status_t::CONCURRENT_LIMIT; + } if (ok == TIME_LIMIT_RETURN) { return lp_status_t::TIME_LIMIT; } if (ok == -1) { return lp_status_t::INFEASIBLE; } @@ -389,7 +403,14 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); std::vector column_scales; + f_t barrier_scaling_start = tic(); + printf("Barrier wrapper: scaling begin : %.2fs\n", toc(start_time)); + fflush(stdout); column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); + printf("Barrier wrapper: scaling end : %.2fs elapsed %.2fs\n", + toc(start_time), + toc(barrier_scaling_start)); + fflush(stdout); // Solve using barrier lp_solution_t barrier_solution(barrier_lp.num_rows, barrier_lp.num_cols); @@ -411,10 +432,24 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us } } - auto barrier_solver = std::make_unique>(barrier_lp, presolve_info, barrier_settings); + f_t barrier_ctor_start = tic(); + printf("Barrier wrapper: ctor begin : %.2fs\n", toc(start_time)); + fflush(stdout); + auto barrier_solver = + std::make_unique>(barrier_lp, presolve_info, barrier_settings); + printf("Barrier wrapper: ctor end : %.2fs elapsed %.2fs\n", + toc(start_time), + toc(barrier_ctor_start)); + fflush(stdout); barrier_solver_settings_t barrier_solver_settings; + printf("Barrier wrapper: solve begin : %.2fs\n", toc(start_time)); + fflush(stdout); lp_status_t barrier_status = barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); + printf("Barrier wrapper: solve end : %.2fs status %d\n", + toc(start_time), + static_cast(barrier_status)); + fflush(stdout); if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { std::thread([s = std::move(barrier_solver), b = std::move(barrier_lp), From 53bc4beb6007a79bb283c8f946ffaafa0676cbbe Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Wed, 8 Apr 2026 02:31:41 -0500 Subject: [PATCH 03/16] additional timing measurements --- cpp/src/barrier/barrier.cu | 70 +++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 6f2d0471d6..c4103dfd18 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -91,11 +91,30 @@ namespace cuopt::linear_programming::dual_simplex { template class iteration_data_t { public: + struct lifecycle_logger_t { + explicit lifecycle_logger_t(const simplex_solver_settings_t& settings) + : start_time(tic()), concurrent_halt(settings.concurrent_halt) + { + } + + ~lifecycle_logger_t() + { + printf("Barrier data: dtor end : %.2fs halt=%d\n", + toc(start_time), + concurrent_halt != nullptr ? static_cast(concurrent_halt->load()) : 0); + fflush(stdout); + } + + f_t start_time; + const std::atomic* concurrent_halt; + }; + iteration_data_t(const lp_problem_t& lp, i_t num_upper_bounds, const csc_matrix_t& Qin, const simplex_solver_settings_t& settings) - : upper_bounds(num_upper_bounds), + : lifecycle_logger_(settings), + upper_bounds(num_upper_bounds), c(lp.objective), b(lp.rhs), w(num_upper_bounds), @@ -450,6 +469,16 @@ class iteration_data_t { fflush(stdout); } + ~iteration_data_t() + { + printf("Barrier data: dtor begin : %.2fs halt=%d\n", + toc(lifecycle_logger_.start_time), + settings_.concurrent_halt != nullptr + ? static_cast(settings_.concurrent_halt->load()) + : 0); + fflush(stdout); + } + void form_augmented(bool first_call = false) { i_t n = A.n; @@ -1517,6 +1546,8 @@ class iteration_data_t { handle_ptr->sync_stream(); } + lifecycle_logger_t lifecycle_logger_; + raft::handle_t const* handle_ptr; i_t n_upper_bounds; pinned_dense_vector_t upper_bounds; @@ -3492,8 +3523,20 @@ lp_status_t barrier_solver_t::solve(f_t start_time, csc_matrix_t Q(lp.num_cols, 0, 0); if (lp.Q.n > 0) { create_Q(lp, Q); } + f_t data_ctor_start = tic(); + printf("Barrier solve: data ctor begin: %.2fs\n", toc(start_time)); + fflush(stdout); iteration_data_t data(lp, num_upper_bounds, Q, settings); + printf("Barrier solve: data ctor end : %.2fs elapsed %.2fs halt=%d indef=%d symbolic=%d\n", + toc(start_time), + toc(data_ctor_start), + settings.concurrent_halt != nullptr ? static_cast(*settings.concurrent_halt) : 0, + static_cast(data.indefinite_Q), + static_cast(data.symbolic_status)); + fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + printf("Barrier solve: halt after data: %.2fs\n", toc(start_time)); + fflush(stdout); settings.log.printf("Barrier solver halted\n"); return lp_status_t::CONCURRENT_LIMIT; } @@ -3503,6 +3546,9 @@ lp_status_t barrier_solver_t::solve(f_t start_time, return lp_status_t::NUMERICAL_ISSUES; } + f_t vector_setup_start = tic(); + printf("Barrier solve: vec init begin : %.2fs\n", toc(start_time)); + fflush(stdout); data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_); data.cusparse_r1_ = data.cusparse_view_.create_vector(data.d_r1_); data.cusparse_tmp4_ = data.cusparse_view_.create_vector(data.d_tmp4_); @@ -3511,18 +3557,33 @@ lp_status_t barrier_solver_t::solve(f_t start_time, data.cusparse_u_ = data.cusparse_view_.create_vector(data.d_u_); data.cusparse_y_residual_ = data.cusparse_view_.create_vector(data.d_y_residual_); data.restrict_u_.resize(num_upper_bounds); + printf("Barrier solve: vec init end : %.2fs elapsed %.2fs\n", + toc(start_time), + toc(vector_setup_start)); + fflush(stdout); if (toc(start_time) > settings.time_limit) { settings.log.printf("Barrier time limit exceeded\n"); return lp_status_t::TIME_LIMIT; } + f_t initial_point_start = tic(); + printf("Barrier solve: initial begin : %.2fs\n", toc(start_time)); + fflush(stdout); i_t initial_status = initial_point(data); + printf("Barrier solve: initial end : %.2fs elapsed %.2fs status %d halt=%d\n", + toc(start_time), + toc(initial_point_start), + initial_status, + settings.concurrent_halt != nullptr ? static_cast(*settings.concurrent_halt) : 0); + fflush(stdout); if (toc(start_time) > settings.time_limit) { settings.log.printf("Barrier time limit exceeded\n"); return lp_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { + printf("Barrier solve: halt after init: %.2fs\n", toc(start_time)); + fflush(stdout); settings.log.printf("Barrier solver halted\n"); return lp_status_t::CONCURRENT_LIMIT; } @@ -3530,7 +3591,14 @@ lp_status_t barrier_solver_t::solve(f_t start_time, settings.log.printf("Unable to compute initial point\n"); return lp_status_t::NUMERICAL_ISSUES; } + f_t residual_init_start = tic(); + printf("Barrier solve: residual begin : %.2fs\n", toc(start_time)); + fflush(stdout); compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); + printf("Barrier solve: residual end : %.2fs elapsed %.2fs\n", + toc(start_time), + toc(residual_init_start)); + fflush(stdout); f_t primal_residual_norm = std::max(vector_norm_inf(data.primal_residual, stream_view_), From d3be2f459ce71a24b4ff0214f9561cc0dbe2ebb5 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Wed, 8 Apr 2026 10:09:04 -0500 Subject: [PATCH 04/16] Adding NVTX markers --- cpp/src/barrier/barrier.cu | 35 ++++++++++++++++++++++--------- cpp/src/dual_simplex/presolve.cpp | 17 +++++++++++++++ cpp/src/dual_simplex/solve.cpp | 26 +++++++++++++++++------ 3 files changed, 62 insertions(+), 16 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index c4103dfd18..1131b3b569 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -3523,10 +3524,14 @@ lp_status_t barrier_solver_t::solve(f_t start_time, csc_matrix_t Q(lp.num_cols, 0, 0); if (lp.Q.n > 0) { create_Q(lp, Q); } + raft::common::nvtx::push_range("BarrierSolve::IterationDataLifetime"); + auto data_lifetime_scope = cuopt::scope_guard([&]() { raft::common::nvtx::pop_range(); }); f_t data_ctor_start = tic(); printf("Barrier solve: data ctor begin: %.2fs\n", toc(start_time)); fflush(stdout); + raft::common::nvtx::push_range("BarrierSolve::IterationDataCtorFull"); iteration_data_t data(lp, num_upper_bounds, Q, settings); + raft::common::nvtx::pop_range(); printf("Barrier solve: data ctor end : %.2fs elapsed %.2fs halt=%d indef=%d symbolic=%d\n", toc(start_time), toc(data_ctor_start), @@ -3549,14 +3554,17 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t vector_setup_start = tic(); printf("Barrier solve: vec init begin : %.2fs\n", toc(start_time)); fflush(stdout); - data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_); - data.cusparse_r1_ = data.cusparse_view_.create_vector(data.d_r1_); - data.cusparse_tmp4_ = data.cusparse_view_.create_vector(data.d_tmp4_); - data.cusparse_h_ = data.cusparse_view_.create_vector(data.d_h_); - data.cusparse_dx_residual_ = data.cusparse_view_.create_vector(data.d_dx_residual_); - data.cusparse_u_ = data.cusparse_view_.create_vector(data.d_u_); - data.cusparse_y_residual_ = data.cusparse_view_.create_vector(data.d_y_residual_); - data.restrict_u_.resize(num_upper_bounds); + { + raft::common::nvtx::range scope("BarrierSolve::VectorInit"); + data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_); + data.cusparse_r1_ = data.cusparse_view_.create_vector(data.d_r1_); + data.cusparse_tmp4_ = data.cusparse_view_.create_vector(data.d_tmp4_); + data.cusparse_h_ = data.cusparse_view_.create_vector(data.d_h_); + data.cusparse_dx_residual_ = data.cusparse_view_.create_vector(data.d_dx_residual_); + data.cusparse_u_ = data.cusparse_view_.create_vector(data.d_u_); + data.cusparse_y_residual_ = data.cusparse_view_.create_vector(data.d_y_residual_); + data.restrict_u_.resize(num_upper_bounds); + } printf("Barrier solve: vec init end : %.2fs elapsed %.2fs\n", toc(start_time), toc(vector_setup_start)); @@ -3570,7 +3578,11 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t initial_point_start = tic(); printf("Barrier solve: initial begin : %.2fs\n", toc(start_time)); fflush(stdout); - i_t initial_status = initial_point(data); + i_t initial_status; + { + raft::common::nvtx::range scope("BarrierSolve::InitialPoint"); + initial_status = initial_point(data); + } printf("Barrier solve: initial end : %.2fs elapsed %.2fs status %d halt=%d\n", toc(start_time), toc(initial_point_start), @@ -3594,7 +3606,10 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t residual_init_start = tic(); printf("Barrier solve: residual begin : %.2fs\n", toc(start_time)); fflush(stdout); - compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); + { + raft::common::nvtx::range scope("BarrierSolve::ResidualInit"); + compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); + } printf("Barrier solve: residual end : %.2fs elapsed %.2fs\n", toc(start_time), toc(residual_init_start)); diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index fbdb8ca6b9..9ff90329a1 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -13,6 +13,10 @@ #include #include +#include + +#include + #include #include #include @@ -820,6 +824,12 @@ i_t presolve(const lp_problem_t& original, presolve_info_t& presolve_info) { f_t presolve_start = tic(); + auto maybe_nvtx_scope = [&](const char* name) { + if (settings.barrier_presolve) { raft::common::nvtx::push_range(name); } + return cuopt::scope_guard([&]() { + if (settings.barrier_presolve) { raft::common::nvtx::pop_range(); } + }); + }; if (settings.barrier_presolve) { printf("Barrier presolve: enter : %.2fs\n", 0.0); fflush(stdout); @@ -848,6 +858,7 @@ i_t presolve(const lp_problem_t& original, if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; } } if (settings.barrier_presolve && nonzero_lower_bounds > 0) { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::ShiftLowerBounds"); f_t lower_bound_start = tic(); printf("Barrier presolve: shift lb begin : %.2fs count %d\n", toc(presolve_start), @@ -930,6 +941,7 @@ i_t presolve(const lp_problem_t& original, f_t empty_row_scan_start = tic(); i_t num_empty_rows = 0; { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyRowScan"); csr_matrix_t Arow(0, 0, 0); problem.A.to_compressed_row(Arow); for (i_t i = 0; i < problem.num_rows; i++) { @@ -944,6 +956,7 @@ i_t presolve(const lp_problem_t& original, fflush(stdout); } if (num_empty_rows > 0) { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyRowRemove"); f_t empty_row_remove_start = tic(); settings.log.printf("Presolve removing %d empty rows\n", num_empty_rows); i_t i = remove_empty_rows(problem, row_sense, num_empty_rows, presolve_info); @@ -960,6 +973,7 @@ i_t presolve(const lp_problem_t& original, f_t empty_col_scan_start = tic(); i_t num_empty_cols = 0; { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyColScan"); for (i_t j = 0; j < problem.num_cols; ++j) { if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { num_empty_cols++; } } @@ -972,6 +986,7 @@ i_t presolve(const lp_problem_t& original, fflush(stdout); } if (num_empty_cols > 0) { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyColRemove"); f_t empty_col_remove_start = tic(); settings.log.printf("Presolve attempt to remove %d empty cols\n", num_empty_cols); remove_empty_cols(problem, num_empty_cols, presolve_info); @@ -992,6 +1007,7 @@ i_t presolve(const lp_problem_t& original, problem.Q.check_matrix("Before free variable expansion"); if (settings.barrier_presolve && free_variables > 0) { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::FreeVariableExpansion"); f_t free_var_start = tic(); printf("Barrier presolve: free var begin : %.2fs count %d\n", toc(presolve_start), @@ -1148,6 +1164,7 @@ i_t presolve(const lp_problem_t& original, } if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0) { + auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::Folding"); f_t folding_start = tic(); printf("Barrier presolve: folding begin : %.2fs\n", toc(presolve_start)); fflush(stdout); diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index c5ff4c3d48..d5890e3771 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -383,7 +383,11 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us f_t barrier_presolve_start = tic(); printf("Barrier wrapper: presolve begin : %.2fs\n", toc(start_time)); fflush(stdout); - const i_t ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); + i_t ok; + { + raft::common::nvtx::range scope("BarrierWrapper::Presolve"); + ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); + } printf("Barrier wrapper: presolve end : %.2fs elapsed %.2fs status %d\n", toc(start_time), toc(barrier_presolve_start), @@ -406,7 +410,10 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us f_t barrier_scaling_start = tic(); printf("Barrier wrapper: scaling begin : %.2fs\n", toc(start_time)); fflush(stdout); - column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); + { + raft::common::nvtx::range scope("BarrierWrapper::Scaling"); + column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); + } printf("Barrier wrapper: scaling end : %.2fs elapsed %.2fs\n", toc(start_time), toc(barrier_scaling_start)); @@ -435,8 +442,12 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us f_t barrier_ctor_start = tic(); printf("Barrier wrapper: ctor begin : %.2fs\n", toc(start_time)); fflush(stdout); - auto barrier_solver = - std::make_unique>(barrier_lp, presolve_info, barrier_settings); + std::unique_ptr> barrier_solver; + { + raft::common::nvtx::range scope("BarrierWrapper::Ctor"); + barrier_solver = + std::make_unique>(barrier_lp, presolve_info, barrier_settings); + } printf("Barrier wrapper: ctor end : %.2fs elapsed %.2fs\n", toc(start_time), toc(barrier_ctor_start)); @@ -444,8 +455,11 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us barrier_solver_settings_t barrier_solver_settings; printf("Barrier wrapper: solve begin : %.2fs\n", toc(start_time)); fflush(stdout); - lp_status_t barrier_status = - barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); + lp_status_t barrier_status; + { + raft::common::nvtx::range scope("BarrierWrapper::Solve"); + barrier_status = barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); + } printf("Barrier wrapper: solve end : %.2fs status %d\n", toc(start_time), static_cast(barrier_status)); From 5224cdf10f8b2799f589fc5fec630969e1ac716f Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 10:35:38 -0500 Subject: [PATCH 05/16] Remove logging and NVTX statements --- cpp/src/barrier/barrier.cu | 170 ++-------------------- cpp/src/barrier/sparse_cholesky.cuh | 3 - cpp/src/barrier/sparse_matrix_kernels.cuh | 29 +--- cpp/src/dual_simplex/crossover.cpp | 10 +- cpp/src/dual_simplex/phase2.cpp | 29 ---- cpp/src/dual_simplex/presolve.cpp | 85 ----------- cpp/src/dual_simplex/right_looking_lu.cpp | 2 +- cpp/src/dual_simplex/solve.cpp | 62 +------- 8 files changed, 31 insertions(+), 359 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 1131b3b569..77cae20e1e 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -32,8 +32,6 @@ #include #include #include -#include - #include #include @@ -92,30 +90,11 @@ namespace cuopt::linear_programming::dual_simplex { template class iteration_data_t { public: - struct lifecycle_logger_t { - explicit lifecycle_logger_t(const simplex_solver_settings_t& settings) - : start_time(tic()), concurrent_halt(settings.concurrent_halt) - { - } - - ~lifecycle_logger_t() - { - printf("Barrier data: dtor end : %.2fs halt=%d\n", - toc(start_time), - concurrent_halt != nullptr ? static_cast(concurrent_halt->load()) : 0); - fflush(stdout); - } - - f_t start_time; - const std::atomic* concurrent_halt; - }; - iteration_data_t(const lp_problem_t& lp, i_t num_upper_bounds, const csc_matrix_t& Qin, const simplex_solver_settings_t& settings) - : lifecycle_logger_(settings), - upper_bounds(num_upper_bounds), + : upper_bounds(num_upper_bounds), c(lp.objective), b(lp.rhs), w(num_upper_bounds), @@ -239,9 +218,6 @@ class iteration_data_t { symbolic_status(0) { raft::common::nvtx::range fun_scope("Barrier: LP Data Creation"); - f_t constructor_start = tic(); - printf("Barrier ctor: enter : %.2fs\n", toc(constructor_start)); - fflush(stdout); bool has_Q = Q.x.size() > 0; indefinite_Q = false; @@ -311,16 +287,9 @@ class iteration_data_t { std::vector dense_columns_unordered; f_t start_column_density = tic(); - printf("Barrier ctor: find_dense begin: %.2fs\n", toc(constructor_start)); - fflush(stdout); // Ignore Q matrix for now find_dense_columns( lp.A, settings, dense_columns_unordered, n_dense_rows, max_row_nz, estimated_nz_AAT); - printf("Barrier ctor: find_dense end : %.2fs elapsed %.2fs halt=%d\n", - toc(constructor_start), - toc(start_column_density), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); - fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } #ifdef PRINT_INFO for (i_t j : dense_columns_unordered) { @@ -352,11 +321,10 @@ class iteration_data_t { } if (use_augmented) { - printf("Linear system : augmented\n"); + settings.log.printf("Linear system : augmented\n"); } else { - printf("Linear system : ADAT\n"); + settings.log.printf("Linear system : ADAT\n"); } - fflush(stdout); // D = I + EET diag.set_scalar(1.0); @@ -431,53 +399,22 @@ class iteration_data_t { if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } i_t factorization_size = use_augmented ? lp.num_rows + lp.num_cols : lp.num_rows; - printf("Barrier ctor: chol create begin: %.2fs\n", toc(constructor_start)); - fflush(stdout); chol = std::make_unique>(handle_ptr, settings, factorization_size); - printf("Barrier ctor: chol create end : %.2fs\n", toc(constructor_start)); - fflush(stdout); chol->set_positive_definite(false); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // Perform symbolic analysis symbolic_status = 0; if (use_augmented) { // Build the sparsity pattern of the augmented system - printf("Barrier ctor: form_aug begin : %.2fs\n", toc(constructor_start)); - fflush(stdout); form_augmented(true); - printf("Barrier ctor: form_aug end : %.2fs\n", toc(constructor_start)); - fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - printf("Barrier ctor: analyze begin : %.2fs\n", toc(constructor_start)); - fflush(stdout); symbolic_status = chol->analyze(device_augmented); } else { - printf("Barrier ctor: form_adat begin: %.2fs\n", toc(constructor_start)); - fflush(stdout); form_adat(true); - printf("Barrier ctor: form_adat end : %.2fs\n", toc(constructor_start)); - fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } - printf("Barrier ctor: analyze begin : %.2fs\n", toc(constructor_start)); - fflush(stdout); symbolic_status = chol->analyze(device_ADAT); } - printf("Barrier ctor: analyze end : %.2fs status %d\n", - toc(constructor_start), - symbolic_status); - printf("Barrier ctor: exit : %.2fs\n", toc(constructor_start)); - fflush(stdout); - } - - ~iteration_data_t() - { - printf("Barrier data: dtor begin : %.2fs halt=%d\n", - toc(lifecycle_logger_.start_time), - settings_.concurrent_halt != nullptr - ? static_cast(settings_.concurrent_halt->load()) - : 0); - fflush(stdout); } void form_augmented(bool first_call = false) @@ -608,7 +545,6 @@ class iteration_data_t { handle_ptr->sync_stream(); raft::common::nvtx::range fun_scope("Barrier: Form ADAT"); float64_t start_form_adat = tic(); - float64_t start_value_update = tic(); const i_t m = AD.m; raft::copy(device_AD.x.data(), @@ -646,12 +582,10 @@ class iteration_data_t { span_x[i] *= span_scale[span_col_ind[i]]; }); RAFT_CHECK_CUDA(stream_view_); - float64_t value_update_time = toc(start_value_update); if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } if (first_call) { try { - initialize_cusparse_data( - handle_ptr, device_A, device_AD, device_ADAT, cusparse_info, settings_); + initialize_cusparse_data(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); } catch (const raft::cuda_error& e) { settings_.log.printf("Error in initialize_cusparse_data: %s\n", e.what()); return; @@ -659,18 +593,14 @@ class iteration_data_t { } if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } - multiply_kernels( - handle_ptr, device_A, device_AD, device_ADAT, cusparse_info, settings_); + multiply_kernels(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); handle_ptr->sync_stream(); auto adat_nnz = device_ADAT.row_start.element(device_ADAT.m, handle_ptr->get_stream()); float64_t adat_time = toc(start_form_adat); - printf("ADAT value update time : %.2fs\n", value_update_time); - printf("ADAT total time : %.2fs\n", adat_time); - fflush(stdout); - if (num_factorizations == 0) { + settings_.log.printf("ADAT time : %.2fs\n", adat_time); settings_.log.printf("ADAT nonzeros : %.2e\n", static_cast(adat_nnz)); settings_.log.printf( @@ -1162,9 +1092,6 @@ class iteration_data_t { std::sort(column_nz_permutation.begin(), column_nz_permutation.end(), [&column_nz](i_t i, i_t j) { return column_nz[i] < column_nz[j]; }); - printf("[Barrier fdc] t=%.3f halt=%d after column_nz sort\n", - toc(start_column_density), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // We then compute the exact sparsity pattern for columns of A whose where @@ -1196,9 +1123,6 @@ class iteration_data_t { // The best way to do that is to have A stored in CSR format. csr_matrix_t A_row(0, 0, 0); A.to_compressed_row(A_row); - printf("[Barrier fdc] t=%.3f halt=%d after to_compressed_row\n", - toc(start_column_density), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } std::vector histogram(m + 1, 0); @@ -1329,9 +1253,6 @@ class iteration_data_t { std::sort(permutation.begin(), permutation.end(), [&delta_nz](i_t i, i_t j) { return delta_nz[i] < delta_nz[j]; }); - printf("[Barrier fdc] t=%.3f halt=%d after delta_nz sort\n", - toc(start_column_density), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return; } // Now we make a forward pass and compute the number of nonzeros in C @@ -1547,8 +1468,6 @@ class iteration_data_t { handle_ptr->sync_stream(); } - lifecycle_logger_t lifecycle_logger_; - raft::handle_t const* handle_ptr; i_t n_upper_bounds; pinned_dense_vector_t upper_bounds; @@ -3524,24 +3443,8 @@ lp_status_t barrier_solver_t::solve(f_t start_time, csc_matrix_t Q(lp.num_cols, 0, 0); if (lp.Q.n > 0) { create_Q(lp, Q); } - raft::common::nvtx::push_range("BarrierSolve::IterationDataLifetime"); - auto data_lifetime_scope = cuopt::scope_guard([&]() { raft::common::nvtx::pop_range(); }); - f_t data_ctor_start = tic(); - printf("Barrier solve: data ctor begin: %.2fs\n", toc(start_time)); - fflush(stdout); - raft::common::nvtx::push_range("BarrierSolve::IterationDataCtorFull"); iteration_data_t data(lp, num_upper_bounds, Q, settings); - raft::common::nvtx::pop_range(); - printf("Barrier solve: data ctor end : %.2fs elapsed %.2fs halt=%d indef=%d symbolic=%d\n", - toc(start_time), - toc(data_ctor_start), - settings.concurrent_halt != nullptr ? static_cast(*settings.concurrent_halt) : 0, - static_cast(data.indefinite_Q), - static_cast(data.symbolic_status)); - fflush(stdout); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - printf("Barrier solve: halt after data: %.2fs\n", toc(start_time)); - fflush(stdout); settings.log.printf("Barrier solver halted\n"); return lp_status_t::CONCURRENT_LIMIT; } @@ -3551,51 +3454,26 @@ lp_status_t barrier_solver_t::solve(f_t start_time, return lp_status_t::NUMERICAL_ISSUES; } - f_t vector_setup_start = tic(); - printf("Barrier solve: vec init begin : %.2fs\n", toc(start_time)); - fflush(stdout); - { - raft::common::nvtx::range scope("BarrierSolve::VectorInit"); - data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_); - data.cusparse_r1_ = data.cusparse_view_.create_vector(data.d_r1_); - data.cusparse_tmp4_ = data.cusparse_view_.create_vector(data.d_tmp4_); - data.cusparse_h_ = data.cusparse_view_.create_vector(data.d_h_); - data.cusparse_dx_residual_ = data.cusparse_view_.create_vector(data.d_dx_residual_); - data.cusparse_u_ = data.cusparse_view_.create_vector(data.d_u_); - data.cusparse_y_residual_ = data.cusparse_view_.create_vector(data.d_y_residual_); - data.restrict_u_.resize(num_upper_bounds); - } - printf("Barrier solve: vec init end : %.2fs elapsed %.2fs\n", - toc(start_time), - toc(vector_setup_start)); - fflush(stdout); + data.cusparse_dual_residual_ = data.cusparse_view_.create_vector(data.d_dual_residual_); + data.cusparse_r1_ = data.cusparse_view_.create_vector(data.d_r1_); + data.cusparse_tmp4_ = data.cusparse_view_.create_vector(data.d_tmp4_); + data.cusparse_h_ = data.cusparse_view_.create_vector(data.d_h_); + data.cusparse_dx_residual_ = data.cusparse_view_.create_vector(data.d_dx_residual_); + data.cusparse_u_ = data.cusparse_view_.create_vector(data.d_u_); + data.cusparse_y_residual_ = data.cusparse_view_.create_vector(data.d_y_residual_); + data.restrict_u_.resize(num_upper_bounds); if (toc(start_time) > settings.time_limit) { settings.log.printf("Barrier time limit exceeded\n"); return lp_status_t::TIME_LIMIT; } - f_t initial_point_start = tic(); - printf("Barrier solve: initial begin : %.2fs\n", toc(start_time)); - fflush(stdout); - i_t initial_status; - { - raft::common::nvtx::range scope("BarrierSolve::InitialPoint"); - initial_status = initial_point(data); - } - printf("Barrier solve: initial end : %.2fs elapsed %.2fs status %d halt=%d\n", - toc(start_time), - toc(initial_point_start), - initial_status, - settings.concurrent_halt != nullptr ? static_cast(*settings.concurrent_halt) : 0); - fflush(stdout); + i_t initial_status = initial_point(data); if (toc(start_time) > settings.time_limit) { settings.log.printf("Barrier time limit exceeded\n"); return lp_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - printf("Barrier solve: halt after init: %.2fs\n", toc(start_time)); - fflush(stdout); settings.log.printf("Barrier solver halted\n"); return lp_status_t::CONCURRENT_LIMIT; } @@ -3603,17 +3481,7 @@ lp_status_t barrier_solver_t::solve(f_t start_time, settings.log.printf("Unable to compute initial point\n"); return lp_status_t::NUMERICAL_ISSUES; } - f_t residual_init_start = tic(); - printf("Barrier solve: residual begin : %.2fs\n", toc(start_time)); - fflush(stdout); - { - raft::common::nvtx::range scope("BarrierSolve::ResidualInit"); - compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); - } - printf("Barrier solve: residual end : %.2fs elapsed %.2fs\n", - toc(start_time), - toc(residual_init_start)); - fflush(stdout); + compute_residuals>(data.w, data.x, data.y, data.v, data.z, data); f_t primal_residual_norm = std::max(vector_norm_inf(data.primal_residual, stream_view_), @@ -3713,7 +3581,6 @@ lp_status_t barrier_solver_t::solve(f_t start_time, compute_affine_rhs(data); f_t max_affine_residual = 0.0; - f_t affine_search_direction_start = tic(); i_t status = gpu_compute_search_direction( data, data.dw_aff, data.dx_aff, data.dy_aff, data.dv_aff, data.dz_aff, max_affine_residual); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { @@ -3722,8 +3589,6 @@ lp_status_t barrier_solver_t::solve(f_t start_time, } // Sync to make sure all the async copies to host done inside are finished RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - printf("Barrier iter %d affine dir : %.2fs\n", iter, toc(affine_search_direction_start)); - fflush(stdout); if (status < 0) { return check_for_suboptimal_solution(options, @@ -3755,7 +3620,6 @@ lp_status_t barrier_solver_t::solve(f_t start_time, f_t max_corrector_residual = 0.0; - f_t corrector_search_direction_start = tic(); status = gpu_compute_search_direction( data, data.dw, data.dx, data.dy, data.dv, data.dz, max_corrector_residual); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { @@ -3764,8 +3628,6 @@ lp_status_t barrier_solver_t::solve(f_t start_time, } // Sync to make sure all the async copies to host done inside are finished RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - printf("Barrier iter %d corrector dir : %.2fs\n", iter, toc(corrector_search_direction_start)); - fflush(stdout); if (status < 0) { return check_for_suboptimal_solution(options, data, diff --git a/cpp/src/barrier/sparse_cholesky.cuh b/cpp/src/barrier/sparse_cholesky.cuh index 4845a7501d..f7938fb989 100644 --- a/cpp/src/barrier/sparse_cholesky.cuh +++ b/cpp/src/barrier/sparse_cholesky.cuh @@ -539,9 +539,6 @@ class sparse_cholesky_cudss_t : public sparse_cholesky_base_t { return CONCURRENT_HALT_RETURN; } - printf("cuDSS numeric factor time : %.2fs\n", numeric_time); - fflush(stdout); - int info; size_t sizeWritten = 0; CUDSS_CALL_AND_CHECK( diff --git a/cpp/src/barrier/sparse_matrix_kernels.cuh b/cpp/src/barrier/sparse_matrix_kernels.cuh index 6a06f364c5..4727c12ec8 100644 --- a/cpp/src/barrier/sparse_matrix_kernels.cuh +++ b/cpp/src/barrier/sparse_matrix_kernels.cuh @@ -9,7 +9,6 @@ #include #include -#include namespace cuopt::linear_programming::dual_simplex { @@ -18,10 +17,8 @@ void initialize_cusparse_data(raft::handle_t const* handle, device_csr_matrix_t& A, device_csc_matrix_t& DAT, device_csr_matrix_t& ADAT, - cusparse_info_t& cusparse_data, - const simplex_solver_settings_t& settings) + cusparse_info_t& cusparse_data) { - f_t start_init = tic(); auto A_nnz = A.nz_max; auto DAT_nnz = DAT.nz_max; f_t chunk_fraction = 0.15; @@ -48,7 +45,6 @@ void initialize_cusparse_data(raft::handle_t const* handle, // Buffer size size_t buffer_size; - f_t start_work_estimation = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_workEstimation(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -77,13 +73,11 @@ void initialize_cusparse_data(raft::handle_t const* handle, cusparse_data.spgemm_descr, &buffer_size, cusparse_data.buffer_size.data())); - f_t work_estimation_time = toc(start_work_estimation); int64_t num_prods; RAFT_CUSPARSE_TRY(cusparseSpGEMM_getNumProducts(cusparse_data.spgemm_descr, &num_prods)); size_t buffer_size_3_size; - f_t start_estimate_memory = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_estimateMemory(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -118,13 +112,6 @@ void initialize_cusparse_data(raft::handle_t const* handle, &cusparse_data.buffer_size_2_size)); cusparse_data.buffer_size_3.resize(0, handle->get_stream()); cusparse_data.buffer_size_2.resize(cusparse_data.buffer_size_2_size, handle->get_stream()); - handle->sync_stream(); - - printf("SpGEMM init total : %.2fs\n", toc(start_init)); - printf("SpGEMM workEstimation time : %.2fs\n", work_estimation_time); - printf("SpGEMM estimateMemory time : %.2fs\n", toc(start_estimate_memory)); - printf("SpGEMM estimated products : %.2e\n", static_cast(num_prods)); - fflush(stdout); } template @@ -132,10 +119,8 @@ void multiply_kernels(raft::handle_t const* handle, device_csr_matrix_t& A, device_csc_matrix_t& DAT, device_csr_matrix_t& ADAT, - cusparse_info_t& cusparse_data, - const simplex_solver_settings_t& settings) + cusparse_info_t& cusparse_data) { - f_t start_spgemm_compute = tic(); RAFT_CUSPARSE_TRY( cusparseSpGEMM_compute(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -150,10 +135,8 @@ void multiply_kernels(raft::handle_t const* handle, cusparse_data.spgemm_descr, &cusparse_data.buffer_size_2_size, cusparse_data.buffer_size_2.data())); - f_t spgemm_compute_time = toc(start_spgemm_compute); // get matrix C non-zero entries C_nnz1 - f_t start_materialize = tic(); int64_t ADAT_num_rows, ADAT_num_cols, ADAT_nnz1; RAFT_CUSPARSE_TRY( cusparseSpMatGetSize(cusparse_data.matADAT_descr, &ADAT_num_rows, &ADAT_num_cols, &ADAT_nnz1)); @@ -164,9 +147,7 @@ void multiply_kernels(raft::handle_t const* handle, // update matC with the new pointers RAFT_CUSPARSE_TRY(cusparseCsrSetPointers( cusparse_data.matADAT_descr, ADAT.row_start.data(), ADAT.j.data(), ADAT.x.data())); - f_t spgemm_materialize_time = toc(start_materialize); - f_t start_spgemm_copy = tic(); RAFT_CUSPARSE_TRY(cusparseSpGEMM_copy(handle->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -180,12 +161,6 @@ void multiply_kernels(raft::handle_t const* handle, cusparse_data.spgemm_descr)); handle->sync_stream(); - f_t spgemm_copy_time = toc(start_spgemm_copy); - - printf("SpGEMM compute time : %.2fs\n", spgemm_compute_time); - printf("SpGEMM materialize time : %.2fs\n", spgemm_materialize_time); - printf("SpGEMM copy time : %.2fs\n", spgemm_copy_time); - fflush(stdout); } } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp index acfdd2abc0..f55ee0837d 100644 --- a/cpp/src/dual_simplex/crossover.cpp +++ b/cpp/src/dual_simplex/crossover.cpp @@ -612,7 +612,7 @@ i_t dual_push(const lp_problem_t& lp, return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted inside crossover dual push loop\n"); + settings.log.printf("Concurrent halt\n"); return CONCURRENT_HALT_RETURN; } } @@ -989,7 +989,7 @@ i_t primal_push(const lp_problem_t& lp, return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted inside crossover primal push loop\n"); + settings.log.printf("Concurrent halt\n"); return CONCURRENT_HALT_RETURN; } } @@ -1353,7 +1353,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted in crossover after basis reorder (before FTran/BTran)\n"); + settings.log.printf("Concurrent halt\n"); return crossover_status_t::CONCURRENT_LIMIT; } @@ -1415,7 +1415,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted in crossover after dual_phase2 refinement\n"); + settings.log.printf("Concurrent halt\n"); return crossover_status_t::CONCURRENT_LIMIT; } primal_infeas = primal_infeasibility(lp, settings, vstatus, solution.x); @@ -1577,7 +1577,7 @@ crossover_status_t crossover(const lp_problem_t& lp, return crossover_status_t::TIME_LIMIT; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted in crossover during iterative dual_phase2 refinement\n"); + settings.log.printf("Concurrent halt\n"); return crossover_status_t::CONCURRENT_LIMIT; } solution.iterations += iter; diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 8c6072f3e6..c5880c137b 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -2590,10 +2590,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, if (refactor_status > 0) { return dual::status_t::NUMERICAL; } if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } - printf("[DS ph%d init] t=%.3f halt=%d after refactor_basis\n", - phase, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); } // Populate c_basic after basis is initialized @@ -2662,10 +2658,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, phase2_work_estimate); if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; } - printf("[DS ph%d init] t=%.3f halt=%d after compute_primal_variables\n", - phase, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (print_norms) { settings.log.printf("|| x || %e\n", vector_norm2(x)); } #ifdef COMPUTE_PRIMAL_RESIDUAL @@ -2706,10 +2698,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, vector_norm2(delta_y_steepest_edge)); } - printf("[DS ph%d init] t=%.3f halt=%d after SE_norms\n", - phase, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; } @@ -2756,11 +2744,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, infeasibility_indices, primal_infeasibility); phase2_work_estimate += 4 * m + 2 * n; - printf("[DS ph%d init] t=%.3f halt=%d after compute_primal_infeas infeas_nz=%d\n", - phase, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0, - (int)infeasibility_indices.size()); #ifdef CHECK_BASIC_INFEASIBILITIES phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0); @@ -2773,10 +2756,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, csc_matrix_t A_transpose(1, 1, 0); lp.A.transpose(A_transpose); phase2_work_estimate += 2 * lp.A.col_start[lp.A.n]; - printf("[DS ph%d init] t=%.3f halt=%d after A_transpose\n", - phase, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0); if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { return dual::status_t::CONCURRENT_LIMIT; @@ -2827,14 +2806,6 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase, while (iter < iter_limit) { PHASE2_NVTX_RANGE("DualSimplex::phase2_main_loop"); - if (iter == 0 || (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1)) { - printf("[DS ph%d iter %d] loop top t=%.3f halt=%d infeas_nz=%d\n", - phase, - iter, - toc(start_time), - settings.concurrent_halt != nullptr ? (int)*settings.concurrent_halt : 0, - (int)infeasibility_indices.size()); - } // Pricing i_t direction = 0; diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp index 9ff90329a1..d2a68d96de 100644 --- a/cpp/src/dual_simplex/presolve.cpp +++ b/cpp/src/dual_simplex/presolve.cpp @@ -13,10 +13,6 @@ #include #include -#include - -#include - #include #include #include @@ -823,17 +819,6 @@ i_t presolve(const lp_problem_t& original, lp_problem_t& problem, presolve_info_t& presolve_info) { - f_t presolve_start = tic(); - auto maybe_nvtx_scope = [&](const char* name) { - if (settings.barrier_presolve) { raft::common::nvtx::push_range(name); } - return cuopt::scope_guard([&]() { - if (settings.barrier_presolve) { raft::common::nvtx::pop_range(); } - }); - }; - if (settings.barrier_presolve) { - printf("Barrier presolve: enter : %.2fs\n", 0.0); - fflush(stdout); - } problem = original; std::vector row_sense(problem.num_rows, '='); @@ -858,12 +843,6 @@ i_t presolve(const lp_problem_t& original, if (problem.lower[j] != 0.0 && problem.lower[j] > -inf) { nonzero_lower_bounds++; } } if (settings.barrier_presolve && nonzero_lower_bounds > 0) { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::ShiftLowerBounds"); - f_t lower_bound_start = tic(); - printf("Barrier presolve: shift lb begin : %.2fs count %d\n", - toc(presolve_start), - static_cast(nonzero_lower_bounds)); - fflush(stdout); settings.log.printf("Transforming %ld nonzero lower bound\n", nonzero_lower_bounds); presolve_info.removed_lower_bounds.resize(problem.num_cols); // We can construct a new variable: x'_j = x_j - l_j or x_j = x'_j + l_j @@ -931,71 +910,33 @@ i_t presolve(const lp_problem_t& original, problem.lower[j] = 0.0; } } - printf("Barrier presolve: shift lb end : %.2fs elapsed %.2fs\n", - toc(presolve_start), - toc(lower_bound_start)); - fflush(stdout); } // Check for empty rows - f_t empty_row_scan_start = tic(); i_t num_empty_rows = 0; { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyRowScan"); csr_matrix_t Arow(0, 0, 0); problem.A.to_compressed_row(Arow); for (i_t i = 0; i < problem.num_rows; i++) { if (Arow.row_start[i + 1] - Arow.row_start[i] == 0) { num_empty_rows++; } } } - if (settings.barrier_presolve) { - printf("Barrier presolve: empty row scan : %.2fs elapsed %.2fs rows %d\n", - toc(presolve_start), - toc(empty_row_scan_start), - static_cast(num_empty_rows)); - fflush(stdout); - } if (num_empty_rows > 0) { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyRowRemove"); - f_t empty_row_remove_start = tic(); settings.log.printf("Presolve removing %d empty rows\n", num_empty_rows); i_t i = remove_empty_rows(problem, row_sense, num_empty_rows, presolve_info); if (i != 0) { return -1; } - if (settings.barrier_presolve) { - printf("Barrier presolve: empty row rm : %.2fs elapsed %.2fs\n", - toc(presolve_start), - toc(empty_row_remove_start)); - fflush(stdout); - } } // Check for empty cols - f_t empty_col_scan_start = tic(); i_t num_empty_cols = 0; { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyColScan"); for (i_t j = 0; j < problem.num_cols; ++j) { if ((problem.A.col_start[j + 1] - problem.A.col_start[j]) == 0) { num_empty_cols++; } } } - if (settings.barrier_presolve) { - printf("Barrier presolve: empty col scan : %.2fs elapsed %.2fs cols %d\n", - toc(presolve_start), - toc(empty_col_scan_start), - static_cast(num_empty_cols)); - fflush(stdout); - } if (num_empty_cols > 0) { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::EmptyColRemove"); - f_t empty_col_remove_start = tic(); settings.log.printf("Presolve attempt to remove %d empty cols\n", num_empty_cols); remove_empty_cols(problem, num_empty_cols, presolve_info); - if (settings.barrier_presolve) { - printf("Barrier presolve: empty col rm : %.2fs elapsed %.2fs\n", - toc(presolve_start), - toc(empty_col_remove_start)); - fflush(stdout); - } } // Check for free variables @@ -1007,12 +948,6 @@ i_t presolve(const lp_problem_t& original, problem.Q.check_matrix("Before free variable expansion"); if (settings.barrier_presolve && free_variables > 0) { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::FreeVariableExpansion"); - f_t free_var_start = tic(); - printf("Barrier presolve: free var begin : %.2fs count %d\n", - toc(presolve_start), - static_cast(free_variables)); - fflush(stdout); // We have a variable x_j: with -inf < x_j < inf // we create new variables v and w with 0 <= v, w and x_j = v - w // Constraints @@ -1157,22 +1092,10 @@ i_t presolve(const lp_problem_t& original, // assert(problem.A.p[num_cols] == nnz); problem.A.n = num_cols; problem.num_cols = num_cols; - printf("Barrier presolve: free var end : %.2fs elapsed %.2fs\n", - toc(presolve_start), - toc(free_var_start)); - fflush(stdout); } if (settings.barrier_presolve && settings.folding != 0 && problem.Q.n == 0) { - auto nvtx_scope = maybe_nvtx_scope("BarrierPresolve::Folding"); - f_t folding_start = tic(); - printf("Barrier presolve: folding begin : %.2fs\n", toc(presolve_start)); - fflush(stdout); folding(problem, settings, presolve_info); - printf("Barrier presolve: folding end : %.2fs elapsed %.2fs\n", - toc(presolve_start), - toc(folding_start)); - fflush(stdout); } // Check for dependent rows @@ -1212,14 +1135,6 @@ i_t presolve(const lp_problem_t& original, problem.A.n, problem.A.col_start[problem.A.n]); } - if (settings.barrier_presolve) { - printf("Barrier presolve: exit : %.2fs rows %d cols %d nnz %d\n", - toc(presolve_start), - static_cast(problem.A.m), - static_cast(problem.A.n), - static_cast(problem.A.col_start[problem.A.n])); - fflush(stdout); - } assert(problem.rhs.size() == problem.A.m); return 0; } diff --git a/cpp/src/dual_simplex/right_looking_lu.cpp b/cpp/src/dual_simplex/right_looking_lu.cpp index ef3c8d90f6..37202000f8 100644 --- a/cpp/src/dual_simplex/right_looking_lu.cpp +++ b/cpp/src/dual_simplex/right_looking_lu.cpp @@ -1258,7 +1258,7 @@ i_t right_looking_lu_row_permutation_only(const csc_matrix_t& A, } if (toc(start_time) > settings.time_limit) { return TIME_LIMIT_RETURN; } if (settings.concurrent_halt != nullptr && *settings.concurrent_halt == 1) { - settings.log.printf("Dual simplex halted inside right-looking LU factorization (fill-in loop)\n"); + settings.log.printf("Concurrent halt\n"); return CONCURRENT_HALT_RETURN; } } diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index d5890e3771..ff852fb2dc 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -364,8 +364,6 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us f_t start_time, lp_solution_t& solution) { - printf("Barrier wrapper: start : %.2fs\n", toc(start_time)); - fflush(stdout); lp_status_t status = lp_status_t::UNSET; lp_problem_t original_lp(user_problem.handle_ptr, 1, 1, 1); @@ -380,24 +378,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us // Presolve the linear program presolve_info_t presolve_info; lp_problem_t presolved_lp(user_problem.handle_ptr, 1, 1, 1); - f_t barrier_presolve_start = tic(); - printf("Barrier wrapper: presolve begin : %.2fs\n", toc(start_time)); - fflush(stdout); - i_t ok; - { - raft::common::nvtx::range scope("BarrierWrapper::Presolve"); - ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); - } - printf("Barrier wrapper: presolve end : %.2fs elapsed %.2fs status %d\n", - toc(start_time), - toc(barrier_presolve_start), - ok); - fflush(stdout); - if (ok == CONCURRENT_HALT_RETURN) { - printf("Barrier wrapper: presolve halted: %.2fs\n", toc(start_time)); - fflush(stdout); - return lp_status_t::CONCURRENT_LIMIT; - } + const i_t ok = presolve(original_lp, barrier_settings, presolved_lp, presolve_info); + if (ok == CONCURRENT_HALT_RETURN) { return lp_status_t::CONCURRENT_LIMIT; } if (ok == TIME_LIMIT_RETURN) { return lp_status_t::TIME_LIMIT; } if (ok == -1) { return lp_status_t::INFEASIBLE; } @@ -407,17 +389,7 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us presolved_lp.num_cols, presolved_lp.A.col_start[presolved_lp.num_cols]); std::vector column_scales; - f_t barrier_scaling_start = tic(); - printf("Barrier wrapper: scaling begin : %.2fs\n", toc(start_time)); - fflush(stdout); - { - raft::common::nvtx::range scope("BarrierWrapper::Scaling"); - column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); - } - printf("Barrier wrapper: scaling end : %.2fs elapsed %.2fs\n", - toc(start_time), - toc(barrier_scaling_start)); - fflush(stdout); + column_scaling(presolved_lp, barrier_settings, barrier_lp, column_scales); // Solve using barrier lp_solution_t barrier_solution(barrier_lp.num_rows, barrier_lp.num_cols); @@ -439,31 +411,11 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us } } - f_t barrier_ctor_start = tic(); - printf("Barrier wrapper: ctor begin : %.2fs\n", toc(start_time)); - fflush(stdout); - std::unique_ptr> barrier_solver; - { - raft::common::nvtx::range scope("BarrierWrapper::Ctor"); - barrier_solver = - std::make_unique>(barrier_lp, presolve_info, barrier_settings); - } - printf("Barrier wrapper: ctor end : %.2fs elapsed %.2fs\n", - toc(start_time), - toc(barrier_ctor_start)); - fflush(stdout); + std::unique_ptr> barrier_solver = + std::make_unique>(barrier_lp, presolve_info, barrier_settings); barrier_solver_settings_t barrier_solver_settings; - printf("Barrier wrapper: solve begin : %.2fs\n", toc(start_time)); - fflush(stdout); - lp_status_t barrier_status; - { - raft::common::nvtx::range scope("BarrierWrapper::Solve"); - barrier_status = barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); - } - printf("Barrier wrapper: solve end : %.2fs status %d\n", - toc(start_time), - static_cast(barrier_status)); - fflush(stdout); + lp_status_t barrier_status = + barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { std::thread([s = std::move(barrier_solver), b = std::move(barrier_lp), From 9206737fccffde98ad3f1db0e103b06506085113 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 10:39:38 -0500 Subject: [PATCH 06/16] Minimize diffs --- cpp/src/barrier/barrier.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index 77cae20e1e..e6a32bd786 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -32,6 +32,7 @@ #include #include #include + #include #include @@ -585,7 +586,8 @@ class iteration_data_t { if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { return; } if (first_call) { try { - initialize_cusparse_data(handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); + initialize_cusparse_data( + handle_ptr, device_A, device_AD, device_ADAT, cusparse_info); } catch (const raft::cuda_error& e) { settings_.log.printf("Error in initialize_cusparse_data: %s\n", e.what()); return; From e20ee9dea01446162ba90725998a22ef4538f209 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 12:49:34 -0500 Subject: [PATCH 07/16] address coderabbit --- cpp/src/dual_simplex/solve.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index ff852fb2dc..d04d497a71 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -724,6 +724,7 @@ lp_status_t solve_linear_program(const user_problem_t& user_problem, lp_status_t status = solve_linear_program_advanced( original_lp, start_time, settings, lp_solution, vstatus, edge_norms); if (status == lp_status_t::CONCURRENT_LIMIT) { + solution.iterations = lp_solution.iterations; std::thread([lp = std::move(original_lp), ls = std::move(lp_solution), vs = std::move(vstatus), From b777fe5009210f806c03eaf4e5afc0995fe08d9b Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 12:53:13 -0500 Subject: [PATCH 08/16] Address Rajesh's comments --- cpp/src/dual_simplex/phase2.cpp | 1 + cpp/src/dual_simplex/solve.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index c5880c137b..a0c266b381 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -2509,6 +2509,7 @@ dual::status_t dual_phase2(i_t phase, delta_y_steepest_edge, work_unit_context); if (result == dual::status_t::CONCURRENT_LIMIT) { + // Keep basis state alive while the concurrent solve continues asynchronously. std::thread([bl = std::move(basic_list), nl = std::move(nonbasic_list), sl = std::move(superbasic_list), diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index d04d497a71..d8bcf5db79 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -231,6 +231,7 @@ lp_status_t solve_linear_program_with_advanced_basis( if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; } if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { + // Keep phase-1 state alive while the concurrent solve continues asynchronously. std::thread([plp = std::move(presolved_lp), pi = std::move(presolve_info), lpp = std::move(lp), @@ -327,6 +328,7 @@ lp_status_t solve_linear_program_with_advanced_basis( if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; } if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; } if (status == dual::status_t::CONCURRENT_LIMIT) { + // Preserve observed progress before returning the concurrent halt. original_solution.iterations = iter; std::thread([sol = std::move(solution), plp = std::move(presolved_lp), From 384407ecc77130c2dbf2cb10e0c80374721672a9 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 12:58:38 -0500 Subject: [PATCH 09/16] Address remaining coderabbit comments --- cpp/src/dual_simplex/solve.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index d8bcf5db79..598a1fcaff 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -232,6 +232,7 @@ lp_status_t solve_linear_program_with_advanced_basis( if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { // Keep phase-1 state alive while the concurrent solve continues asynchronously. + original_solution.iterations = iter; std::thread([plp = std::move(presolved_lp), pi = std::move(presolve_info), lpp = std::move(lp), From 26e5d4de4f3e243ad04899acb9e2466728386053 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 13:03:09 -0500 Subject: [PATCH 10/16] address nitpick --- cpp/src/dual_simplex/phase2.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index a0c266b381..938aaa2466 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -2512,7 +2512,6 @@ dual::status_t dual_phase2(i_t phase, // Keep basis state alive while the concurrent solve continues asynchronously. std::thread([bl = std::move(basic_list), nl = std::move(nonbasic_list), - sl = std::move(superbasic_list), f = std::move(ft)]() {}).detach(); } return result; From 8b896e6d894492dade103fbab8ce28e1d249ad5b Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 13:43:17 -0500 Subject: [PATCH 11/16] Remove calling destructor in a detached thread --- cpp/src/dual_simplex/phase2.cpp | 8 ------ cpp/src/dual_simplex/solve.cpp | 43 +-------------------------------- 2 files changed, 1 insertion(+), 50 deletions(-) diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 938aaa2466..10c200c74d 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -24,7 +24,6 @@ #include #include -#include // #define PHASE2_NVTX_RANGES @@ -2491,7 +2490,6 @@ dual::status_t dual_phase2(i_t phase, const i_t n = lp.num_cols; std::vector basic_list(m); std::vector nonbasic_list; - std::vector superbasic_list; auto ft = std::make_unique>(m, settings.refactor_frequency); const bool initialize_basis = true; dual::status_t result = dual_phase2_with_advanced_basis(phase, @@ -2508,12 +2506,6 @@ dual::status_t dual_phase2(i_t phase, iter, delta_y_steepest_edge, work_unit_context); - if (result == dual::status_t::CONCURRENT_LIMIT) { - // Keep basis state alive while the concurrent solve continues asynchronously. - std::thread([bl = std::move(basic_list), - nl = std::move(nonbasic_list), - f = std::move(ft)]() {}).detach(); - } return result; } diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 598a1fcaff..89b10e3f00 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -33,7 +33,6 @@ #include #include #include -#include namespace cuopt::linear_programming::dual_simplex { @@ -132,11 +131,6 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original vstatus, edge_norms, work_unit_context); - if (result == lp_status_t::CONCURRENT_LIMIT) { - std::thread([bl = std::move(basic_list), - nl = std::move(nonbasic_list), - f = std::move(ft)]() {}).detach(); - } return result; } @@ -231,15 +225,7 @@ lp_status_t solve_linear_program_with_advanced_basis( if (phase1_status == dual::status_t::WORK_LIMIT) { return lp_status_t::WORK_LIMIT; } if (phase1_status == dual::status_t::ITERATION_LIMIT) { return lp_status_t::ITERATION_LIMIT; } if (phase1_status == dual::status_t::CONCURRENT_LIMIT) { - // Keep phase-1 state alive while the concurrent solve continues asynchronously. original_solution.iterations = iter; - std::thread([plp = std::move(presolved_lp), - pi = std::move(presolve_info), - lpp = std::move(lp), - cs = std::move(column_scales), - p1 = std::move(phase1_problem), - p1v = std::move(phase1_vstatus), - p1s = std::move(phase1_solution)]() {}).detach(); return lp_status_t::CONCURRENT_LIMIT; } phase1_obj = phase1_solution.objective; @@ -329,16 +315,7 @@ lp_status_t solve_linear_program_with_advanced_basis( if (status == dual::status_t::WORK_LIMIT) { lp_status = lp_status_t::WORK_LIMIT; } if (status == dual::status_t::ITERATION_LIMIT) { lp_status = lp_status_t::ITERATION_LIMIT; } if (status == dual::status_t::CONCURRENT_LIMIT) { - // Preserve observed progress before returning the concurrent halt. original_solution.iterations = iter; - std::thread([sol = std::move(solution), - plp = std::move(presolved_lp), - pi = std::move(presolve_info), - lpp = std::move(lp), - cs = std::move(column_scales), - p1 = std::move(phase1_problem), - p1v = std::move(phase1_vstatus), - p1s = std::move(phase1_solution)]() {}).detach(); return lp_status_t::CONCURRENT_LIMIT; } if (status == dual::status_t::NUMERICAL) { lp_status = lp_status_t::NUMERICAL_ISSUES; } @@ -419,19 +396,7 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us barrier_solver_settings_t barrier_solver_settings; lp_status_t barrier_status = barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); - if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { - std::thread([s = std::move(barrier_solver), - b = std::move(barrier_lp), - p = std::move(presolved_lp), - o = std::move(original_lp), - bs = std::move(barrier_solution), - ls = std::move(lp_solution), - pi = std::move(presolve_info), - cs = std::move(column_scales), - ns = std::move(new_slacks), - di = std::move(dualize_info)]() {}).detach(); - return lp_status_t::CONCURRENT_LIMIT; - } + if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } if (barrier_status == lp_status_t::OPTIMAL) { #ifdef COMPUTE_SCALED_RESIDUALS std::vector scaled_residual = barrier_lp.rhs; @@ -728,12 +693,6 @@ lp_status_t solve_linear_program(const user_problem_t& user_problem, original_lp, start_time, settings, lp_solution, vstatus, edge_norms); if (status == lp_status_t::CONCURRENT_LIMIT) { solution.iterations = lp_solution.iterations; - std::thread([lp = std::move(original_lp), - ls = std::move(lp_solution), - vs = std::move(vstatus), - en = std::move(edge_norms), - ns = std::move(new_slacks), - di = std::move(dualize_info)]() {}).detach(); return lp_status_t::CONCURRENT_LIMIT; } uncrush_primal_solution(user_problem, original_lp, lp_solution.x, solution.x); From c576b118b4b9dc054366823e17d156ce0c03a8d8 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 14:05:42 -0500 Subject: [PATCH 12/16] No need of unique pointers --- cpp/src/dual_simplex/phase2.cpp | 6 ++---- cpp/src/dual_simplex/solve.cpp | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 10c200c74d..569f7cf540 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -23,8 +23,6 @@ #include -#include - // #define PHASE2_NVTX_RANGES #ifdef PHASE2_NVTX_RANGES @@ -2490,7 +2488,7 @@ dual::status_t dual_phase2(i_t phase, const i_t n = lp.num_cols; std::vector basic_list(m); std::vector nonbasic_list; - auto ft = std::make_unique>(m, settings.refactor_frequency); + basis_update_mpf_t ft(m, settings.refactor_frequency); const bool initialize_basis = true; dual::status_t result = dual_phase2_with_advanced_basis(phase, slack_basis, @@ -2499,7 +2497,7 @@ dual::status_t dual_phase2(i_t phase, lp, settings, vstatus, - *ft, + ft, basic_list, nonbasic_list, sol, diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 89b10e3f00..f315d35648 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -120,12 +120,12 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original assert(m <= n); std::vector basic_list(m); std::vector nonbasic_list; - auto ft = std::make_unique>(m, settings.refactor_frequency); + basis_update_mpf_t ft(m, settings.refactor_frequency); lp_status_t result = solve_linear_program_with_advanced_basis(original_lp, start_time, settings, original_solution, - *ft, + ft, basic_list, nonbasic_list, vstatus, From 591bfe153f9f1b42719278c2f78974cf8ad20e23 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 14:09:54 -0500 Subject: [PATCH 13/16] Remove another unique ptr reference --- cpp/src/dual_simplex/solve.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index f315d35648..d316eb5608 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -30,7 +30,6 @@ #include #include -#include #include #include @@ -391,11 +390,10 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us } } - std::unique_ptr> barrier_solver = - std::make_unique>(barrier_lp, presolve_info, barrier_settings); + barrier_solver_t barrier_solver(barrier_lp, presolve_info, barrier_settings); barrier_solver_settings_t barrier_solver_settings; lp_status_t barrier_status = - barrier_solver->solve(start_time, barrier_solver_settings, barrier_solution); + barrier_solver.solve(start_time, barrier_solver_settings, barrier_solution); if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } if (barrier_status == lp_status_t::OPTIMAL) { #ifdef COMPUTE_SCALED_RESIDUALS From 03dc5380eb4b2988ea0d509d271b3899cebe0cb6 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Mon, 13 Apr 2026 14:11:48 -0500 Subject: [PATCH 14/16] minimize diffs --- cpp/src/dual_simplex/phase2.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp index 569f7cf540..345f240677 100644 --- a/cpp/src/dual_simplex/phase2.cpp +++ b/cpp/src/dual_simplex/phase2.cpp @@ -2490,7 +2490,7 @@ dual::status_t dual_phase2(i_t phase, std::vector nonbasic_list; basis_update_mpf_t ft(m, settings.refactor_frequency); const bool initialize_basis = true; - dual::status_t result = dual_phase2_with_advanced_basis(phase, + return dual_phase2_with_advanced_basis(phase, slack_basis, initialize_basis, start_time, @@ -2504,7 +2504,6 @@ dual::status_t dual_phase2(i_t phase, iter, delta_y_steepest_edge, work_unit_context); - return result; } template From 3c4ed45d36e259423ee8989354c8d1e67165c54b Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Tue, 14 Apr 2026 11:36:24 -0500 Subject: [PATCH 15/16] check for optimality before concurrent limit --- cpp/src/dual_simplex/solve.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index d316eb5608..9851db058f 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -394,7 +394,6 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us barrier_solver_settings_t barrier_solver_settings; lp_status_t barrier_status = barrier_solver.solve(start_time, barrier_solver_settings, barrier_solution); - if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } if (barrier_status == lp_status_t::OPTIMAL) { #ifdef COMPUTE_SCALED_RESIDUALS std::vector scaled_residual = barrier_lp.rhs; @@ -589,6 +588,8 @@ lp_status_t solve_linear_program_with_barrier(const user_problem_t& us solution.iterations = barrier_solution.iterations; } + if (barrier_status == lp_status_t::CONCURRENT_LIMIT) { return lp_status_t::CONCURRENT_LIMIT; } + // If we aren't doing crossover, we're done if (!settings.crossover || barrier_lp.Q.n > 0) { return barrier_status; } From 194192bd88f0d54766cc297b2eb3802c69df8883 Mon Sep 17 00:00:00 2001 From: skainkaryam Date: Tue, 14 Apr 2026 12:47:19 -0500 Subject: [PATCH 16/16] Apply clang-format --- cpp/src/dual_simplex/solve.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/dual_simplex/solve.cpp b/cpp/src/dual_simplex/solve.cpp index 9851db058f..82d922eec3 100644 --- a/cpp/src/dual_simplex/solve.cpp +++ b/cpp/src/dual_simplex/solve.cpp @@ -121,15 +121,15 @@ lp_status_t solve_linear_program_advanced(const lp_problem_t& original std::vector nonbasic_list; basis_update_mpf_t ft(m, settings.refactor_frequency); lp_status_t result = solve_linear_program_with_advanced_basis(original_lp, - start_time, - settings, - original_solution, - ft, - basic_list, - nonbasic_list, - vstatus, - edge_norms, - work_unit_context); + start_time, + settings, + original_solution, + ft, + basic_list, + nonbasic_list, + vstatus, + edge_norms, + work_unit_context); return result; }