From fc7aa0468c3e54c124c7932b2468e2e06024240d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 14:11:54 +0100 Subject: [PATCH 01/13] tmp --- .../linear_programming/cuopt/run_pdlp.cu | 2 +- compile.sh | 2 + .../initial_scaling.cu | 2 +- cpp/src/pdlp/pdhg.cu | 35 ++++- cpp/src/pdlp/pdlp.cu | 142 +++++++++++++++++- .../restart_strategy/pdlp_restart_strategy.cu | 6 +- cpp/src/pdlp/solve.cu | 3 + .../adaptive_step_size_strategy.cu | 108 +++++++++---- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 2 +- .../solver_settings/solver_settings.py | 38 +++++ .../linear_programming/data_definition.py | 5 + run_multiple.sh | 3 + test.py | 12 ++ 13 files changed, 318 insertions(+), 42 deletions(-) create mode 100755 compile.sh create mode 100755 run_multiple.sh create mode 100755 test.py diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 229c72a49b..c3d6ad42f4 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -107,7 +107,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + //settings.presolve = program.get("--presolve"); return settings; } diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000000..bedf3a7506 --- /dev/null +++ b/compile.sh @@ -0,0 +1,2 @@ +./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF +./build.sh cuopt cuopt_mps_parser \ No newline at end of file diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 031cd9c3b6..5a08c3bb53 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ 
b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for (const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 9a44bd31e3..c5efdcd722 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + 
cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,19 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +684,21 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + 
cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + + const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 67e001db29..e1ab866b5b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,10 @@ #include #include +#include #include +#include #include #include @@ -1406,10 +1408,27 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), "primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; + //printf("movement %lf\n", movement); + //printf("computed_interaction %lf\n", computed_interaction); + + 
cuopt_assert( + movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); #ifdef CUPDLP_DEBUG_MODE @@ -1790,6 +1809,68 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Validate reflected solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_reflected_primal().data() + + pdhg_solver_.get_reflected_primal().size(), + is_nan_or_inf{}), + "reflected_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_reflected_dual().data() + + pdhg_solver_.get_reflected_dual().size(), + is_nan_or_inf{}), + "reflected_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal/dual solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().data() + + pdhg_solver_.get_primal_solution().size(), + is_nan_or_inf{}), + "primal_solution contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().data() + + pdhg_solver_.get_dual_solution().size(), + is_nan_or_inf{}), + "dual_solution contains NaN or Inf in compute_fixed_error"); + + // Validate deltas have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + + 
pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + + pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal_weight and step_size have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + primal_weight_.data(), + primal_weight_.data() + primal_weight_.size(), + is_nan_or_inf{}), + "primal_weight_ contains NaN or Inf in compute_fixed_error"); + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + step_size_.data(), + step_size_.data() + step_size_.size(), + is_nan_or_inf{}), + "step_size_ contains NaN or Inf in compute_fixed_error"); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1894,49 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 + { + const auto bs = climber_strategies_.size(); + std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), + step_size_strategy_.get_norm_squared_delta_primal().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), + step_size_strategy_.get_norm_squared_delta_dual().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + 
RAFT_CUDA_TRY(cudaMemcpy( + h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY( + cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), + step_size_strategy_.get_interaction().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < bs; ++i) { + const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; + const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; + if (movement + comp_inter < f_t(0.0)) { + fprintf(stderr, + "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " + "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", + i, + (double)movement, + (double)comp_inter, + (double)(movement + comp_inter), + (double)h_nsq_dp[i], + (double)h_nsq_dd[i], + (double)h_pw[i], + (double)h_ss[i], + (double)h_inter[i]); + } + cuopt_assert(movement + comp_inter >= f_t(0.0), + "Host check: movement + computed_interaction must be >= 0"); + } + } + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1971,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + has_restarted[i] = false; } } } @@ -1869,6 +1999,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 
0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2076,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2764,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2816,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2829,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2838,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index bc12fb360f..a6304a8568 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -1995,14 +1995,14 @@ void 
pdlp_restart_strategy_t::solve_bound_constrained_trust_region( f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), - thrust::make_reverse_iterator(thrust::device_ptr(end)), - thrust::make_reverse_iterator(thrust::device_ptr(start)), + thrust::device_ptr(end), + thrust::device_ptr(start), -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::make_reverse_iterator(thrust::device_ptr(start))) { + if (highest_negInf_primal != thrust::device_ptr(start)) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a2766be98a..374c9ff513 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,6 +793,7 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { + std::cout << "Solving LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -841,6 +842,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + std::cout << "Solving batch PDLP" << std::endl; + for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 3c1b85aeac..47ba16a297 100644 --- 
a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -80,7 +82,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +92,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +102,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,7 +145,7 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), + <<>>(thrust::raw_pointer_cast(swap_pairs.data()), static_cast(swap_pairs.size()), make_span(interaction_), make_span(norm_squared_delta_primal_), @@ -159,9 +161,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - 
norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +278,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,7 +339,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), + <<<1, 1, 0, stream_view_.value()>>>(this->view(), primal_step_size.data(), dual_step_size.data(), pdhg_solver.get_d_total_pdhg_iterations().data()); @@ -345,7 +347,27 @@ void adaptive_step_size_strategy_t::compute_step_sizes( } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); +} + +template +__global__ void validate_interaction_and_movement_outputs( + raft::device_span norm_squared_delta_primal, + raft::device_span norm_squared_delta_dual, + raft::device_span interaction) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= 
norm_squared_delta_primal.size()) { return; } + cuopt_assert(!isnan(norm_squared_delta_primal[idx]), + "norm_squared_delta_primal is NaN after reduction"); + cuopt_assert(!isnan(norm_squared_delta_dual[idx]), + "norm_squared_delta_dual is NaN after reduction"); + cuopt_assert(!isnan(interaction[idx]), + "interaction is NaN after reduction"); + cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), + "norm_squared_delta_primal must be >= 0 after reduction"); + cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), + "norm_squared_delta_dual must be >= 0 after reduction"); } template @@ -382,7 +404,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // We need to make sure both dot products happens after previous operations (next_primal/dual) // Thus, we add another node in the main stream before starting the SpMVs - if (!batch_mode_) deltas_are_done_.record(stream_view_); + if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . 
(A @ x' - // A@x)) to save one SpMV @@ -406,7 +428,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +442,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -433,6 +455,31 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); + // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + tmp_primal.data(), + tmp_primal.data() + tmp_primal.size(), + is_nan_or_inf{}), + "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); + + // Validate delta_primal and delta_dual inputs have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal().data() + + current_saddle_point_state.get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_interaction_and_movement"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual().data() + + current_saddle_point_state.get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_interaction_and_movement"); + if (!batch_mode_) { // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( @@ -443,7 +490,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +500,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access + // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +510,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +520,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +533,7 @@ void 
adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +543,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +553,14 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); + + validate_interaction_and_movement_outputs + <<<1, climber_strategies_.size(), 0, stream_view_>>>( + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_), + make_span(interaction_)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..14180b5bfd 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -21,7 +21,7 @@ template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(true) { } diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 19db315349..4ec4a9aaf2 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,6 +207,44 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data + def 
set_mip_batch_pdlp_strong_branching(self, enable): + """ + Note: Only supported for MILP + + Toggle batch PDLP strong branching in the MIP solver. + + Parameters + ---------- + enable : bool + If True, enable batch PDLP strong branching (value 1). + If False, disable it (value 0). + + Examples + -------- + >>> settings.set_mip_batch_pdlp_strong_branching(True) + """ + self.set_parameter( + "mip_batch_pdlp_strong_branching", 1 if enable else 0 + ) + + def get_mip_batch_pdlp_strong_branching(self): + """ + Note: Only supported for MILP + + Get the current value of the batch PDLP strong branching setting. + + Returns + ------- + bool + True if batch PDLP strong branching is enabled, False otherwise. + + Examples + -------- + >>> settings.get_mip_batch_pdlp_strong_branching() + False + """ + return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) + def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 8412c745b5..59ea62089d 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -450,6 +450,11 @@ class SolverConfig(BaseModel): description="Set True to run heuristics only, False to run " "heuristics and branch and bound for MILP", ) + mip_batch_pdlp_strong_branching: Optional[int] = Field( + default=0, + description="Set 1 to enable batch PDLP strong branching " + "in the MIP solver, 0 to disable.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa diff --git a/run_multiple.sh b/run_multiple.sh new file mode 100755 index 0000000000..183b25b46e --- /dev/null +++ b/run_multiple.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + python test.py +done \ No newline at 
end of file diff --git a/test.py b/test.py new file mode 100755 index 0000000000..6cb236dae2 --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import cuopt_mps_parser +from cuopt.linear_programming import Solve, SolverSettings + +data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") + +settings = SolverSettings() +settings.set_mip_batch_pdlp_strong_branching(True) + +solution = Solve(data_model, settings) + +print(solution.get_termination_reason()) +print(solution.get_primal_objective()) From 1614bc14836050bbf30308c7c7edf140f352770e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 17:42:10 +0100 Subject: [PATCH 02/13] fix --- cpp/src/pdlp/pdlp.cu | 89 +++++++------------ .../restart_strategy/pdlp_restart_strategy.cu | 1 - .../convergence_information.cu | 66 ++++++++------ cpp/src/pdlp/utils.cuh | 20 +++-- 4 files changed, 82 insertions(+), 94 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e1ab866b5b..082299902d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1188,24 +1188,42 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { + auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto stream = vec.stream(); + auto n = static_cast(vec.size()); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; + cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1895,47 +1913,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& 
has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 - { - const auto bs = climber_strategies_.size(); - std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), - step_size_strategy_.get_norm_squared_delta_primal().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), - step_size_strategy_.get_norm_squared_delta_dual().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy( - h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), - step_size_strategy_.get_interaction().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < bs; ++i) { - const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; - const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; - if (movement + comp_inter < f_t(0.0)) { - fprintf(stderr, - "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " - "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", - i, - (double)movement, - (double)comp_inter, - (double)(movement + comp_inter), - (double)h_nsq_dp[i], - (double)h_nsq_dd[i], - (double)h_pw[i], - (double)h_ss[i], - (double)h_inter[i]); - } - cuopt_assert(movement + comp_inter >= f_t(0.0), - "Host check: movement + computed_interaction must be >= 0"); - } - } #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index a6304a8568..b2ed166a2d 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ 
b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 97e0e9c0e9..eec078f8d7 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); + 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index d48ae21c1a..9150ab8c51 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -604,15 +604,17 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, f_t* result, raft::handle_t const* handle_ptr) { - const f_t neutral = f_t(0.0); - 
thrust::device_ptr result_ptr(result); - - *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(), - input_vector.data(), - input_vector.data() + input_vector.size(), - abs_t{}, - neutral, - thrust::maximum()); + auto stream = handle_ptr->get_stream(); + auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); + auto n = static_cast(input_vector.size()); + + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max( + d_temp, temp_bytes, abs_iter, result, n, stream); + rmm::device_buffer temp_buf(temp_bytes, stream); + cub::DeviceReduce::Max( + temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From 4f3353191f6178ce02ffb3449daa9891ee8eb38e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:18:41 +0100 Subject: [PATCH 03/13] fix --- .../adaptive_step_size_strategy.cu | 25 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 123 ++++++++++++++++++ cpp/src/pdlp/utilities/ping_pong_graph.cuh | 87 ++----------- 3 files changed, 137 insertions(+), 98 deletions(-) create mode 100644 cpp/src/pdlp/utilities/ping_pong_graph.cu diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 47ba16a297..32e21cfbf6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -455,31 +455,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); - // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - tmp_primal.data(), - tmp_primal.data() + tmp_primal.size(), - is_nan_or_inf{}), - "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); - - // Validate delta_primal and delta_dual inputs have no NaN/Inf - 
cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_primal().data(), - current_saddle_point_state.get_delta_primal().data() + - current_saddle_point_state.get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_interaction_and_movement"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_dual().data(), - current_saddle_point_state.get_delta_dual().data() + - current_saddle_point_state.get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_interaction_and_movement"); - if (!batch_mode_) { // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..08045b47a1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,123 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +void ping_pong_graph_t::cancel_active_capture() +{ + CUOPT_LOG_ERROR( + "Canceling active capture in ping_pong_graph_t"); + if (capture_even_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + capture_even_active_ = false; + } + if (capture_odd_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + capture_odd_active_ = false; + } +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + // This should not happen, but in case a graph was capturing while destroying the object + if (capture_even_active_ || capture_odd_active_) { + cancel_active_capture(); + } + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef 
CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 14180b5bfd..9d6ead8cf7 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -17,83 +19,20 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // 
caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph -template + template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(true) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } - } -#endif - } - - void end_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); - even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); - odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - } - } -#endif - } + ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); + ~ping_pong_graph_t(); - void launch(i_t total_pdlp_iterations) 
- { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); - } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); - } - } -#endif - } - - bool is_initialized(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - return (total_pdlp_iterations % 2 == 0 && even_initialized) || - (total_pdlp_iterations % 2 == 1 && odd_initialized); - } -#endif - return false; - } + void start_capture(i_t total_pdlp_iterations); + void end_capture(i_t total_pdlp_iterations); + void launch(i_t total_pdlp_iterations); + bool is_initialized(i_t total_pdlp_iterations); private: + void cancel_active_capture(); + cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; @@ -101,7 +40,9 @@ class ping_pong_graph_t { rmm::cuda_stream_view stream_view_; bool even_initialized{false}; bool odd_initialized{false}; - // Temporary fix to disable cuda graph in legacy batch mode + bool capture_even_active_{false}; + bool capture_odd_active_{false}; bool is_legacy_batch_mode_{false}; }; + } // namespace cuopt::linear_programming::detail From e0a530ed449f7c904d651701aa066dd90a5edd54 Mon Sep 17 00:00:00 2001 From: Trevor McKay Date: Tue, 17 Feb 2026 13:49:33 -0500 Subject: [PATCH 04/13] workaround for thrust reverse iterator build error --- .../restart_strategy/pdlp_restart_strategy.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index b2ed166a2d..e42a05e1e6 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1990,18 +1991,18 @@ 
void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( threshold_.end(), std::numeric_limits::infinity()); // Easier / Cleaner than to do reverse iterator arithmetic - f_t* start = threshold_.data(); - f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; - auto highest_negInf_primal = - thrust::find(handle_ptr_->get_thrust_policy(), - thrust::device_ptr(end), - thrust::device_ptr(start), - -std::numeric_limits::infinity()); + f_t* start = threshold_.data(); + f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; + using rev_iter_t = thrust::reverse_iterator>; + auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), + rev_iter_t(thrust::device_ptr(end)), + rev_iter_t(thrust::device_ptr(start)), + -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::device_ptr(start)) { + if (highest_negInf_primal != rev_iter_t(thrust::device_ptr(start))) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); From e330718a2f40799de9a22615d1e99954bd821922 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:55:20 +0100 Subject: [PATCH 05/13] remove compile file --- compile.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 compile.sh diff --git a/compile.sh b/compile.sh deleted file mode 100755 index bedf3a7506..0000000000 --- a/compile.sh +++ /dev/null @@ -1,2 +0,0 @@ -./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF -./build.sh cuopt cuopt_mps_parser \ No newline at end of file From dce6d4fec3d5a2ae6d3313e4f1af2db4ce55ccea Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 15:56:09 +0100 Subject: [PATCH 06/13] fix --- .../linear_programming/cuopt/run_pdlp.cu | 20 ++++++--- cpp/src/branch_and_bound/pseudo_costs.cpp | 3 +- 
cpp/src/pdlp/CMakeLists.txt | 1 + cpp/src/pdlp/pdlp.cu | 3 -- .../adaptive_step_size_strategy.cu | 44 +------------------ .../adaptive_step_size_strategy.hpp | 9 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 16 +++---- .../solver_settings/solver_settings.py | 38 ---------------- 8 files changed, 26 insertions(+), 108 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index c3d6ad42f4..64897264c9 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. 
Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - //settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2ddc672750..e06f497bdc 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git 
a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 082299902d..eaafd1293e 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1440,9 +1440,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - //printf("movement %lf\n", movement); - //printf("computed_interaction %lf\n", computed_interaction); - cuopt_assert( movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 32e21cfbf6..d491106aaf 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,14 +28,10 @@ #include -#include - #include namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -47,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - 
dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -350,26 +342,6 @@ void adaptive_step_size_strategy_t::compute_step_sizes( RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } -template -__global__ void validate_interaction_and_movement_outputs( - raft::device_span norm_squared_delta_primal, - raft::device_span norm_squared_delta_dual, - raft::device_span interaction) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= norm_squared_delta_primal.size()) { return; } - cuopt_assert(!isnan(norm_squared_delta_primal[idx]), - "norm_squared_delta_primal is NaN after reduction"); - cuopt_assert(!isnan(norm_squared_delta_dual[idx]), - "norm_squared_delta_dual is NaN after reduction"); - cuopt_assert(!isnan(interaction[idx]), - "interaction is NaN after reduction"); - cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), - "norm_squared_delta_primal must be >= 0 after reduction"); - cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), - "norm_squared_delta_dual must be >= 0 after reduction"); -} - template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, @@ -393,7 +365,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . 
A @ (x' - x) @@ -401,11 +373,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . (A_t @ y' - A_t @ y) @@ -475,8 +442,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access - // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -529,13 +494,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( climber_strategies_.size(), dual_size_, stream_view_.value()); - - validate_interaction_and_movement_outputs - <<<1, climber_strategies_.size(), 0, stream_view_>>>( - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_), - make_span(interaction_)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - 
rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 08045b47a1..647672e535 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -58,11 +58,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,15 +76,15 @@ void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - 
RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); } @@ -98,9 +98,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 4ec4a9aaf2..19db315349 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,44 +207,6 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data - def set_mip_batch_pdlp_strong_branching(self, enable): - """ - Note: Only supported for MILP - - Toggle batch PDLP strong branching in the MIP solver. - - Parameters - ---------- - enable : bool - If True, enable batch PDLP strong branching (value 1). - If False, disable it (value 0). - - Examples - -------- - >>> settings.set_mip_batch_pdlp_strong_branching(True) - """ - self.set_parameter( - "mip_batch_pdlp_strong_branching", 1 if enable else 0 - ) - - def get_mip_batch_pdlp_strong_branching(self): - """ - Note: Only supported for MILP - - Get the current value of the batch PDLP strong branching setting. - - Returns - ------- - bool - True if batch PDLP strong branching is enabled, False otherwise. 
- - Examples - -------- - >>> settings.get_mip_batch_pdlp_strong_branching() - False - """ - return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) - def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP From 9c03faf0f2e5e5d44670507c8ac348ca3f142659 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:28:11 +0100 Subject: [PATCH 07/13] final cleanup --- cpp/src/pdlp/pdlp.cu | 60 ------------------------------------------- cpp/src/pdlp/solve.cu | 3 --- run_multiple.sh | 3 --- test.py | 12 --------- 4 files changed, 78 deletions(-) delete mode 100755 run_multiple.sh delete mode 100755 test.py diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eaafd1293e..4b4eed1f32 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1825,66 +1825,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Validate reflected solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_reflected_primal().data() + - pdhg_solver_.get_reflected_primal().size(), - is_nan_or_inf{}), - "reflected_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_reflected_dual().data() + - pdhg_solver_.get_reflected_dual().size(), - is_nan_or_inf{}), - "reflected_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal/dual solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data() + - pdhg_solver_.get_primal_solution().size(), - is_nan_or_inf{}), - "primal_solution contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - 
!thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_dual_solution().data(), - pdhg_solver_.get_dual_solution().data() + - pdhg_solver_.get_dual_solution().size(), - is_nan_or_inf{}), - "dual_solution contains NaN or Inf in compute_fixed_error"); - - // Validate deltas have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + - pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + - pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal_weight and step_size have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - primal_weight_.data(), - primal_weight_.data() + primal_weight_.size(), - is_nan_or_inf{}), - "primal_weight_ contains NaN or Inf in compute_fixed_error"); - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - step_size_.data(), - step_size_.data() + step_size_.size(), - is_nan_or_inf{}), - "step_size_ contains NaN or Inf in compute_fixed_error"); // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 374c9ff513..a2766be98a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,7 +793,6 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { - std::cout << "Solving 
LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -842,8 +841,6 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - std::cout << "Solving batch PDLP" << std::endl; - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/run_multiple.sh b/run_multiple.sh deleted file mode 100755 index 183b25b46e..0000000000 --- a/run_multiple.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - python test.py -done \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100755 index 6cb236dae2..0000000000 --- a/test.py +++ /dev/null @@ -1,12 +0,0 @@ -import cuopt_mps_parser -from cuopt.linear_programming import Solve, SolverSettings - -data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") - -settings = SolverSettings() -settings.set_mip_batch_pdlp_strong_branching(True) - -solution = Solve(data_model, settings) - -print(solution.get_termination_reason()) -print(solution.get_primal_objective()) From 6c2fe356a9ffc0994f0e7b177233504aa73c1c7d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:39:07 +0100 Subject: [PATCH 08/13] additional cleanup --- cpp/src/pdlp/pdhg.cu | 4 ++++ cpp/src/pdlp/utils.cuh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index c5efdcd722..286d6de5b5 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -652,6 +652,8 @@ struct primal_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); cuopt_assert(!isnan(primal_val), "primal_solution is NaN in
primal_reflected_projection"); cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); f_t reflected = primal_val - step_size * (obj_coef - aty_val); @@ -688,6 +690,8 @@ struct dual_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 9150ab8c51..0f2ed44c42 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -606,7 +606,7 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, { auto stream = handle_ptr->get_stream(); auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); - auto n = static_cast(input_vector.size()); + auto n = input_vector.size(); void* d_temp = nullptr; size_t temp_bytes = 0; From a43dc0c78b0d8bd833e8c0b05d8e6e46b0483703 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:43:06 +0100 Subject: [PATCH 09/13] address PR comments, add tests, update doc --- cpp/src/pdlp/pdlp.cu | 33 ++++++++------- .../convergence_information.cu | 8 ++-- cpp/src/pdlp/utilities/ping_pong_graph.cu | 41 +++++-------------- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 1 - docs/cuopt/source/lp-qp-milp-settings.rst | 10 +++++ .../linear_programming/test_python_API.py | 33 +++++++++++++++ 6 files changed, 75 insertions(+), 51 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu 
b/cpp/src/pdlp/pdlp.cu index 4b4eed1f32..eebfede7f9 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1192,8 +1192,10 @@ static void compute_stats(const rmm::device_uvector& vec, return x == 0 ? std::numeric_limits::max() : abs(x); }; + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + auto stream = vec.stream(); - auto n = static_cast(vec.size()); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1203,23 +1205,23 @@ static void compute_stats(const rmm::device_uvector& vec, auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); void* d_temp = nullptr; - size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; - cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, 
cuda::std::plus<>{}, f_t(0), stream); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); smallest = d_smallest.value(stream); largest = d_largest.value(stream); @@ -1444,7 +1446,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index eec078f8d7..269cb58f5d 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -404,19 +404,19 @@ void convergence_information_t::compute_convergence_information( relative_residual_t{settings.tolerances.relative_primal_tolerance}); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - stream_view_); + stream_view_)); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - cub::DeviceReduce::Max(temp_buf.data(), + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - 
stream_view_); + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 647672e535..5effbcdc48 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -20,32 +20,11 @@ ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, { } -template -void ping_pong_graph_t::cancel_active_capture() -{ - CUOPT_LOG_ERROR( - "Canceling active capture in ping_pong_graph_t"); - if (capture_even_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - capture_even_active_ = false; - } - if (capture_odd_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - capture_odd_active_ = false; - } -} - template ping_pong_graph_t::~ping_pong_graph_t() { #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { - // This should not happen, but in case a graph was capturing while destroying the object - if (capture_even_active_ || capture_odd_active_) { - cancel_active_capture(); - } if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } } @@ -58,11 +37,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,17 +55,17 @@ void 
ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); } } #endif @@ -98,9 +77,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 9d6ead8cf7..5113f804d6 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -31,7 +31,6 @@ class ping_pong_graph_t { 
bool is_initialized(i_t total_pdlp_iterations); private: - void cancel_active_capture(); cudaGraph_t even_graph; cudaGraph_t odd_graph; diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst index 51c6142c2b..bd1372f70e 100644 --- a/docs/cuopt/source/lp-qp-milp-settings.rst +++ b/docs/cuopt/source/lp-qp-milp-settings.rst @@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching. Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times. .. note:: The default value is ``-1`` (automatic). + +Batch PDLP Strong Branching +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root. +When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling. +Set this value to 0 to disable batched PDLP strong branching. +Set this value to 1 to enable batched PDLP strong branching. + +.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem. 
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py index dc470f3828..0eca50ba9b 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -30,6 +30,7 @@ CUOPT_ELIMINATE_DENSE_COLUMNS, CUOPT_FOLDING, CUOPT_INFEASIBILITY_DETECTION, + CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, CUOPT_MIP_CUT_PASSES, CUOPT_METHOD, CUOPT_ORDERING, @@ -997,3 +998,35 @@ def test_cuts(): assert problem.Status.name == "Optimal" assert problem.ObjValue == pytest.approx(-126, abs=1e-3) assert problem.SolutionStats.num_nodes == 0 + + +def test_batch_pdlp_strong_branching(): + # Minimize - 86*y1 - 4*y2 - 40*y3 + # subject to 774*y1 + 76*y2 + 42*y3 <= 875 + # 67*y1 + 27*y2 + 53*y3 <= 875 + # y1, y2, y3 in {0, 1} + + problem = Problem() + y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1") + y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2") + y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3") + + problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875) + problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875) + + problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3) + + settings = SolverSettings() + settings.set_parameter(CUOPT_PRESOLVE, 0) + settings.set_parameter(CUOPT_TIME_LIMIT, 10) + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) + + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) From c8b8b74bc6cc5f65df94f868a0334464e360c374 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:44:54 +0100 Subject: [PATCH 10/13] format --- cpp/src/pdlp/pdlp.cu | 36 ++++++++++------- 
.../adaptive_step_size_strategy.cu | 17 ++++---- .../convergence_information.cu | 40 +++++++++---------- cpp/src/pdlp/utilities/ping_pong_graph.cu | 6 +-- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 3 +- cpp/src/pdlp/utils.cuh | 10 ++--- 6 files changed, 58 insertions(+), 54 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eebfede7f9..72aead03d0 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,8 @@ #include #include -#include #include +#include #include #include @@ -1188,14 +1188,13 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? std::numeric_limits::max() : abs(x); }; cuopt_assert(vec.size() > 0, "Vector must not be empty"); auto stream = vec.stream(); - size_t n = vec.size(); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1206,8 +1205,14 @@ static void compute_stats(const rmm::device_uvector& vec, void* d_temp = nullptr; size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1216,8 +1221,14 @@ static void compute_stats(const rmm::device_uvector& vec, size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - 
temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1442,9 +1453,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert( - movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); @@ -1828,7 +1838,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1853,7 +1862,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d491106aaf..47e9a78a5e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -137,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -332,9 +333,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction <<<1, 1, 0, stream_view_.value()>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 269cb58f5d..1e9a69d130 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -405,18 +405,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -465,18 +465,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 5effbcdc48..4ec5bff8c1 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -37,12 +37,10 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + 
RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 5113f804d6..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -19,7 +19,7 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph - template +template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); @@ -31,7 +31,6 @@ class ping_pong_graph_t { bool is_initialized(i_t total_pdlp_iterations); private: - cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 0f2ed44c42..33625f7680 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -608,13 +608,11 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); auto n = input_vector.size(); - void* d_temp = nullptr; - size_t temp_bytes = 0; - cub::DeviceReduce::Max( - d_temp, temp_bytes, abs_iter, result, n, stream); + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream); rmm::device_buffer temp_buf(temp_bytes, stream); - cub::DeviceReduce::Max( - temp_buf.data(), temp_bytes, abs_iter, result, n, stream); + cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From b1be5bb2d0b729ddb685b722ad50c41f5a32aa58 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:45:01 
+0100 Subject: [PATCH 11/13] format --- benchmarks/linear_programming/cuopt/run_pdlp.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 64897264c9..e9b4f8296c 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -71,9 +71,9 @@ static void parse_arguments(argparse::ArgumentParser& program) "modes."); program.add_argument("--presolver") - .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") - .default_value("Default") - .choices("None", "Papilo", "PSLP", "Default"); + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } From d89af961b3d8af72d22f027114ac1921809c3ccc Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:53:59 +0100 Subject: [PATCH 12/13] style --- .../restart_strategy/pdlp_restart_strategy.cu | 827 +++++++++--------- 1 file changed, 415 insertions(+), 412 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index cf715e8a1d..0b1c109185 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,462 +2008,465 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + if (lowest_inf != end) { std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_.data(); // no - 1 to go before the first inf because 
end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the 
test_threshold that should minimize the objective value of the - // solution. - - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } + index_end_primal = + thrust::raw_pointer_cast(lowest_inf) - + threshold_ + .data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 
dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop + // { + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); + // } + } - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. 
- // Used unsorted_direction_full_ as the other one got sorted + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + // } } - // Compute the current lower 
bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr - // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); - - // } -} - -template -void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t& duality_gap, - rmm::device_uvector& primal_weight, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), 
duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} + template + void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t & duality_gap, + rmm::device_uvector & primal_weight, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } -template -void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view) -{ - raft::common::nvtx::range fun_scope("compute_primal_gradient"); + template + void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view) + { + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add 
problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); -} + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } -template -__global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) -{ - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; - - f_t subgradient_coefficient; - - if (dual_solution < 
f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { - subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = upper; - } else { - if (primal_product < lower) { + template + __global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) + { + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { subgradient_coefficient = lower; - } else if (primal_product > upper) { + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { subgradient_coefficient = upper; } else { - subgradient_coefficient = primal_product; + if (primal_product < lower) { + subgradient_coefficient = lower; + } else if (primal_product > upper) { + subgradient_coefficient = upper; + } else { + subgradient_coefficient = primal_product; + } } - } - subgradient[id] = subgradient_coefficient; -} + subgradient[id] = subgradient_coefficient; + } -template -void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t& duality_gap, - 
cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + template + void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); -} + // b - A*x + // is changed with the introduction of constraint upper and lower bounds + + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + 
cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); + } -template -void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + template + void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // 
problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + 
duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); -} + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); + } -template -void pdlp_restart_strategy_t::reset_internal() -{ - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); -} + template + void pdlp_restart_strategy_t::reset_internal() + { + candidate_is_avg_.set_value_to_zero_async(stream_view_); + 
restart_triggered_.set_value_to_zero_async(stream_view_); + } -template -typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() -{ - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; + template + typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() + { + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = 
raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), - shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{ + shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; -} + return v; + } -template -typename pdlp_restart_strategy_t::cupdlpx_restart_view_t -pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& 
dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) -{ - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; -} + template + typename pdlp_restart_strategy_t::cupdlpx_restart_view_t + pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) + { + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = 
make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; + } -template -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const -{ - return weighted_average_solution_.get_iterations_since_last_restart(); -} + template + i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const + { + return weighted_average_solution_.get_iterations_since_last_restart(); + } -template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) -{ - last_restart_was_average_ = value; -} + template + void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) + { + last_restart_was_average_ = value; + } -template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const -{ - return last_restart_was_average_; -} + template + bool pdlp_restart_strategy_t::get_last_restart_was_average() const + { + return last_restart_was_average_; + } #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2520,11 +2523,11 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT -INSTANTIATE(float) + INSTANTIATE(float) #endif #if 
MIP_INSTANTIATE_DOUBLE -INSTANTIATE(double) + INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From c7e3e222d063b98907efa2f5f81e31d84f231f10 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 15:03:34 +0100 Subject: [PATCH 13/13] put back changes in restart --- .../restart_strategy/pdlp_restart_strategy.cu | 831 +++++++++--------- 1 file changed, 416 insertions(+), 415 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 0b1c109185..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,465 +2008,466 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + testing_range_low_.set_value_async(index_start_primal, stream_view_); + } else // No negInf found, start is 0 + testing_range_low_.set_value_async(index_start_primal, stream_view_); + if (lowest_inf != end) { + cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*lowest_inf)) == std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_ - .data(); // no - 1 to go before the first inf because end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not 
contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the test_threshold that should minimize the objective value of the - // solution. 
- - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } - - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. - // Used unsorted_direction_full_ as the other one got sorted - // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); - // } - } - - // Compute the current lower bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr + index_end_primal = + 
thrust::raw_pointer_cast(lowest_inf) - + threshold_.data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + 
threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); // } } - template - void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t & duality_gap, - rmm::device_uvector & primal_weight, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - 
distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr + // { + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + + // } +} - template - void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view) - { - raft::common::nvtx::range fun_scope("compute_primal_gradient"); +template +void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t& duality_gap, + rmm::device_uvector& primal_weight, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for 
primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view) +{ + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); - - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - } - - template - __global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const 
typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) - { - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - f_t subgradient_coefficient; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); +} - if (dual_solution < f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { +template +__global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) +{ + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = 
duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { + subgradient_coefficient = lower; + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = upper; + } else { + if (primal_product < lower) { subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { + } else if (primal_product > upper) { subgradient_coefficient = upper; } else { - if (primal_product < lower) { - subgradient_coefficient = lower; - } else if (primal_product > upper) { - subgradient_coefficient = upper; - } else { - subgradient_coefficient = primal_product; - } + subgradient_coefficient = primal_product; } - - subgradient[id] = subgradient_coefficient; } - template - void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + subgradient[id] = subgradient_coefficient; +} + +template +void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - 
reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); - } + // b - A*x + // is changed with the introduction of constraint upper and lower bounds - template - void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - 
primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); +} + +template +void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - 
RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + 
dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); - } + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); +} - template - void pdlp_restart_strategy_t::reset_internal() - { - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); - } +template +void pdlp_restart_strategy_t::reset_internal() +{ + candidate_is_avg_.set_value_to_zero_async(stream_view_); + restart_triggered_.set_value_to_zero_async(stream_view_); +} - template - typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() - { - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; +template +typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() +{ + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + 
v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - 
v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{ - shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), + shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; - } + return v; +} - template - typename pdlp_restart_strategy_t::cupdlpx_restart_view_t - pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) - { - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - 
current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; - } +template +typename pdlp_restart_strategy_t::cupdlpx_restart_view_t +pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) +{ + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + 
v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; +} - template - i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const - { - return weighted_average_solution_.get_iterations_since_last_restart(); - } +template +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +{ + return weighted_average_solution_.get_iterations_since_last_restart(); +} - template - void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) - { - last_restart_was_average_ = value; - } +template +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +{ + last_restart_was_average_ = value; +} - template - bool pdlp_restart_strategy_t::get_last_restart_was_average() const - { - return last_restart_was_average_; - } +template +bool pdlp_restart_strategy_t::get_last_restart_was_average() const +{ + return last_restart_was_average_; +} #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2523,11 +2524,11 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT - INSTANTIATE(float) +INSTANTIATE(float) #endif #if MIP_INSTANTIATE_DOUBLE - INSTANTIATE(double) +INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail