diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index d04979080a..e76a77dba6 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp 
b/cpp/src/branch_and_bound/pseudo_costs.cpp index db72bbb323..095f462382 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 0adbc3809f..afa3ee5fb7 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for 
(const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index a92ce8e8ef..51e0b29381 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = 
cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,21 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +686,23 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + + const f_t tmp = 
current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 012b987eea..d79cc6c3a2 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -35,7 +35,9 @@ #include #include +#include +#include #include #include @@ -1186,24 +1188,55 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; - - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; + + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + + auto stream = vec.stream(); + size_t n = vec.size(); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1406,11 +1439,25 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), 
"primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); @@ -1790,6 +1837,7 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1861,7 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1896,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + 
cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + has_restarted[i] = false; } } } @@ -1869,6 +1924,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2001,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 
0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2689,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2741,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2754,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2763,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 5723cf78c1..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,11 +34,11 @@ #include #include #include +#include #include #include #include #include -#include #include diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 329a1ec465..24ef29b243 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ 
-32,8 +32,6 @@ namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -45,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -80,7 +74,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +84,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +94,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - 
static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -159,9 +154,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +271,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,15 +332,15 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), - primal_step_size.data(), - 
dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + <<<1, 1, 0, stream_view_.value()>>>(this->view(), + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } template @@ -371,7 +366,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . A @ (x' - x) @@ -379,11 +374,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . 
(A_t @ y' - A_t @ y) @@ -406,7 +396,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +410,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -443,7 +433,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +443,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +451,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +461,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - 
dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +474,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +484,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +494,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git 
a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 464178e40b..9b01608b47 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), + 
temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..4ec5bff8c1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,100 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void 
ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -20,78 +22,13 @@ namespace cuopt::linear_programming::detail { template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && 
!odd_initialized) {
-        RAFT_CUDA_TRY(
-          cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal));
-      }
-    }
-#endif
-  }
+  ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false);
+  ~ping_pong_graph_t();
 
-  void end_capture(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      if (total_pdlp_iterations % 2 == 0 && !even_initialized) {
-        RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph));
-        RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph));
-        even_initialized = true;
-        RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph));
-      } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) {
-        RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph));
-        RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph));
-        odd_initialized = true;
-        RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph));
-      }
-    }
-#endif
-  }
-
-  void launch(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      if (total_pdlp_iterations % 2 == 0 && even_initialized) {
-        RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value()));
-      } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) {
-        RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value()));
-      }
-    }
-#endif
-  }
-
-  bool is_initialized(i_t total_pdlp_iterations)
-  {
-#ifndef CUPDLP_DEBUG_MODE
-    if (!is_legacy_batch_mode_) {
-      return (total_pdlp_iterations % 2 == 0 && even_initialized) ||
-             (total_pdlp_iterations % 2 == 1 && odd_initialized);
-    }
-#endif
-    return false;
-  }
+  void start_capture(i_t total_pdlp_iterations);
+  void end_capture(i_t total_pdlp_iterations);
+  void launch(i_t total_pdlp_iterations);
+  bool is_initialized(i_t total_pdlp_iterations);
 
  private:
   cudaGraph_t even_graph;
@@ -101,7 +38,9 @@
   rmm::cuda_stream_view stream_view_;
   bool even_initialized{false};
   bool odd_initialized{false};
-  // Temporary fix to
disable cuda graph in legacy batch mode
+  bool capture_even_active_{false};
+  bool capture_odd_active_{false};
   bool is_legacy_batch_mode_{false};
 };
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh
index d48ae21c1a..33625f7680 100644
--- a/cpp/src/pdlp/utils.cuh
+++ b/cpp/src/pdlp/utils.cuh
@@ -604,15 +604,15 @@ void inline my_inf_norm(const rmm::device_uvector<f_t>& input_vector,
                         f_t* result,
                         raft::handle_t const* handle_ptr)
 {
-  const f_t neutral = f_t(0.0);
-  thrust::device_ptr<f_t> result_ptr(result);
-
-  *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(),
-                                         input_vector.data(),
-                                         input_vector.data() + input_vector.size(),
-                                         abs_t<f_t>{},
-                                         neutral,
-                                         thrust::maximum<f_t>());
+  auto stream   = handle_ptr->get_stream();
+  auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t<f_t>{});
+  auto n        = input_vector.size();
+
+  void* d_temp      = nullptr;
+  size_t temp_bytes = 0;
+  cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream);
+  rmm::device_buffer temp_buf(temp_bytes, stream);
+  cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream);
 }
 
 template
diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst
index 51c6142c2b..bd1372f70e 100644
--- a/docs/cuopt/source/lp-qp-milp-settings.rst
+++ b/docs/cuopt/source/lp-qp-milp-settings.rst
@@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching.
 Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times.
 
 .. note:: The default value is ``-1`` (automatic).
+
+Batch PDLP Strong Branching
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root.
+When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling.
+Set this value to 0 to disable batched PDLP strong branching.
+Set this value to 1 to enable batched PDLP strong branching.
+
+.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem.
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
index dc470f3828..0eca50ba9b 100644
--- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
+++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py
@@ -30,6 +30,7 @@
     CUOPT_ELIMINATE_DENSE_COLUMNS,
     CUOPT_FOLDING,
     CUOPT_INFEASIBILITY_DETECTION,
+    CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING,
     CUOPT_MIP_CUT_PASSES,
     CUOPT_METHOD,
     CUOPT_ORDERING,
@@ -997,3 +998,35 @@
     assert problem.Status.name == "Optimal"
     assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
     assert problem.SolutionStats.num_nodes == 0
+
+
+def test_batch_pdlp_strong_branching():
+    # Minimize - 86*y1 - 4*y2 - 40*y3
+    # subject to 774*y1 + 76*y2 + 42*y3 <= 875
+    #            67*y1 + 27*y2 + 53*y3 <= 875
+    #            y1, y2, y3 in {0, 1}
+
+    problem = Problem()
+    y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1")
+    y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2")
+    y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3")
+
+    problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875)
+    problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875)
+
+    problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3)
+
+    settings = SolverSettings()
+    settings.set_parameter(CUOPT_PRESOLVE, 0)
+    settings.set_parameter(CUOPT_TIME_LIMIT, 10)
+    settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0)
+
+    problem.solve(settings)
+    assert
problem.Status.name == "Optimal"
+    assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
+
+    settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1)
+
+    problem.solve(settings)
+    assert problem.Status.name == "Optimal"
+    assert problem.ObjValue == pytest.approx(-126, abs=1e-3)
diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
index 8412c745b5..59ea62089d 100644
--- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
+++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py
@@ -450,6 +450,11 @@ class SolverConfig(BaseModel):
         description="Set True to run heuristics only, False to run "
         "heuristics and branch and bound for MILP",
     )
+    mip_batch_pdlp_strong_branching: Optional[int] = Field(
+        default=0,
+        description="Set 1 to enable batch PDLP strong branching "
+        "in the MIP solver, 0 to disable.",
+    )
     num_cpu_threads: Optional[int] = Field(
         default=None,
         description="Set the number of CPU threads to use for branch and bound.",  # noqa