From 8596529e2563ec8dc25bcf7691e2188e5c29db64 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Sat, 26 Jul 2025 07:32:06 -0700 Subject: [PATCH 1/4] init --- cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh index 53a76536b8..dea2ffe7c5 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -108,6 +108,7 @@ i_t create_heavy_item_block_segments(rmm::cuda_stream_view stream, // Inclusive scan so that each block can determine which item it belongs to item_block_segments.set_element_to_zero_async(0, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), calc_blocks_per_vertex_iter, calc_blocks_per_vertex_iter + heavy_id_count, From 509dfe638ce29e6fc52e2de9d3ead4e7d50ea102 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 30 Jul 2025 16:58:01 -0400 Subject: [PATCH 2/4] Manual graph creation for lb bounds update --- cpp/src/mip/presolve/bounds_presolve.cu | 1 + cpp/src/mip/presolve/bounds_presolve.cuh | 1 + .../presolve/load_balanced_bounds_presolve.cu | 167 ++++- .../load_balanced_bounds_presolve.cuh | 3 + .../load_balanced_bounds_presolve_helpers.cuh | 610 +++++++++++++++++- .../load_balanced_bounds_presolve_kernels.cuh | 122 +++- cpp/tests/mip/CMakeLists.txt | 3 + cpp/tests/mip/lb_test.cu | 188 ++++++ 8 files changed, 998 insertions(+), 97 deletions(-) create mode 100644 cpp/tests/mip/lb_test.cu diff --git a/cpp/src/mip/presolve/bounds_presolve.cu b/cpp/src/mip/presolve/bounds_presolve.cu index 45fee622eb..72440cd9a8 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cu +++ b/cpp/src/mip/presolve/bounds_presolve.cu @@ -202,6 +202,7 @@ termination_criterion_t bound_presolve_t::bound_update_loop(problem_t< } pb.handle_ptr->sync_stream(); calculate_infeasible_redundant_constraints(pb); + solve_iter = iter; return criteria; } diff --git a/cpp/src/mip/presolve/bounds_presolve.cuh b/cpp/src/mip/presolve/bounds_presolve.cuh index 9a25b05e9c..84853a7812 100644 --- a/cpp/src/mip/presolve/bounds_presolve.cuh +++ b/cpp/src/mip/presolve/bounds_presolve.cuh @@ -86,6 +86,7 @@ class bound_presolve_t { i_t infeas_constraints_count = 0; i_t redund_constraints_count = 0; probing_cache_t probing_cache; + i_t solve_iter; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu index 091f8a53b5..de8f481901 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu @@ -245,6 +245,7 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.cnst_bin_offsets, problem.offsets); + RAFT_CHECK_CUDA(stream_heavy_cnst); num_blocks_heavy_vars = create_heavy_item_block_segments(stream_heavy_vars, heavy_vars_vertex_ids, @@ -253,49 +254,34 @@ void load_balanced_bounds_presolve_t::setup( heavy_degree_cutoff, problem.vars_bin_offsets, problem.reverse_offsets); + RAFT_CHECK_CUDA(stream_heavy_vars); tmp_act.resize(2 * num_blocks_heavy_cnst, stream_heavy_cnst); tmp_bnd.resize(2 * num_blocks_heavy_vars, stream_heavy_vars); - std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); + std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = + sub_warp_meta(stream, warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4); - std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta( - streams.get_stream(), warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); + std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = + sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); - stream.synchronize(); - streams.sync_all_issued(); + // stream.synchronize(); + RAFT_CHECK_CUDA(stream); + streams.sync_test_all_issued(); if (!calc_slack_erase_inf_cnst_graph_created) { - bool erase_inf_cnst = true; - calc_slack_erase_inf_cnst_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_erase_inf_cnst_graph, - calc_slack_erase_inf_cnst_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(true); + calc_slack_erase_inf_cnst_graph_created = true; } if (!calc_slack_graph_created) { - bool erase_inf_cnst = false; - calc_slack_graph_created = build_graph( - streams, - handle_ptr, - calc_slack_graph, - calc_slack_exec, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); }, - [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); }); + create_constraint_slack_graph(false); + calc_slack_graph_created = true; } if (!upd_bnd_graph_created) { - upd_bnd_graph_created = build_graph( - streams, - handle_ptr, - upd_bnd_graph, - upd_bnd_exec, - [this]() { this->calculate_bounds_update_graph(true); }, - [this]() { this->calculate_bounds_update_graph(); }); + create_bounds_update_graph(); + upd_bnd_graph_created = true; } } @@ -368,6 +354,119 @@ void load_balanced_bounds_presolve_t::calculate_activity_graph(bool er dry_run); } +template +void load_balanced_bounds_presolve_t::create_bounds_update_graph() +{ + using f_t2 = typename type_2::type; + cudaGraph_t upd_graph; + cudaGraphCreate(&upd_graph, 0); + cudaGraphNode_t bounds_changed_node; + { + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + cudaGraphAddMemcpyNode(&bounds_changed_node, upd_graph, NULL, 0, &memcpyParams); + } + + auto bounds_update_view = get_bounds_update_view(*pb); + + create_update_bounds_heavy_vars(upd_graph, + bounds_changed_node, + bounds_update_view, + make_span_2(tmp_bnd), + heavy_vars_vertex_ids, + heavy_vars_pseudo_block_ids, + heavy_vars_block_segments, + pb->vars_bin_offsets, + heavy_degree_cutoff, + num_blocks_heavy_vars); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, bounds_update_view, pb->vars_bin_offsets, heavy_degree_cutoff); + RAFT_CUDA_TRY(cudaGetLastError()); + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + bounds_update_view, + is_vars_sub_warp_single_bin, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets, + pb->vars_bin_offsets); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphDebugDotPrint(upd_graph, "/home/aatish/debug_upd_graph", 0); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void load_balanced_bounds_presolve_t::create_constraint_slack_graph(bool erase_inf_cnst) +{ + using f_t2 = typename type_2::type; + cudaGraph_t cnst_slack_graph; + cudaGraphCreate(&cnst_slack_graph, 0); + + cudaGraphNode_t set_bounds_changed_node; + { + // TODO : Investigate why memset node is not captured manually + i_t* bounds_changed_ptr = bounds_changed.data(); + + cudaMemcpy3DParms memcpyParams = {0}; + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1); + memcpyParams.extent = make_cudaExtent(sizeof(i_t), 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + cudaGraphAddMemcpyNode(&set_bounds_changed_node, cnst_slack_graph, NULL, 0, &memcpyParams); + } + + auto activity_view = get_activity_view(*pb); + + create_activity_heavy_cnst(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + make_span_2(tmp_act), + heavy_cnst_vertex_ids, + heavy_cnst_pseudo_block_ids, + heavy_cnst_block_segments, + pb->cnst_bin_offsets, + heavy_degree_cutoff, + num_blocks_heavy_cnst, + erase_inf_cnst); + create_activity_per_block(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + pb->cnst_bin_offsets, + heavy_degree_cutoff, + erase_inf_cnst); + create_activity_sub_warp(cnst_slack_graph, + set_bounds_changed_node, + activity_view, + is_cnst_sub_warp_single_bin, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + pb->cnst_bin_offsets, + erase_inf_cnst); + cudaGraphDebugDotPrint(cnst_slack_graph, "/home/aatish/debug_cnst_slack_graph", 0); + if (erase_inf_cnst) { + cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0); + } else { + cudaGraphInstantiate(&calc_slack_exec, cnst_slack_graph, NULL, NULL, 0); + } +} + template void load_balanced_bounds_presolve_t::calculate_bounds_update_graph(bool dry_run) { @@ -401,12 +500,13 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack_iter( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { // writes nans to constraint activities that are infeasible //-> less expensive checks for update bounds step raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_erase_inf_cnst_exec, handle_ptr->get_stream()); - handle_ptr->sync_stream(); } infeas_cnst_slack_set_to_nan = true; RAFT_CHECK_CUDA(handle_ptr->get_stream()); @@ -416,6 +516,8 @@ template void load_balanced_bounds_presolve_t::calculate_constraint_slack( const raft::handle_t* handle_ptr) { + // h_bounds_changed is copied to bounds_changed in calc_slack_exec + h_bounds_changed = 0; { raft::common::nvtx::range scope("act_cuda_task_graph"); cudaGraphLaunch(calc_slack_exec, handle_ptr->get_stream()); @@ -428,13 +530,10 @@ template bool load_balanced_bounds_presolve_t::update_bounds_from_slack( const raft::handle_t* handle_ptr) { - i_t h_bounds_changed; - bounds_changed.set_value_to_zero_async(handle_ptr->get_stream()); - + // bounds_changed is copied to h_bounds_changed in upd_bnd_exec { raft::common::nvtx::range scope("upd_cuda_task_graph"); cudaGraphLaunch(upd_bnd_exec, handle_ptr->get_stream()); - h_bounds_changed = bounds_changed.value(handle_ptr->get_stream()); } RAFT_CHECK_CUDA(handle_ptr->get_stream()); constexpr i_t zero = 0; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh index 19aef04f8c..42736b3a0a 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh @@ -212,6 +212,8 @@ class load_balanced_bounds_presolve_t { activity_view_t get_activity_view(const load_balanced_problem_t& pb); bounds_update_view_t get_bounds_update_view(const load_balanced_problem_t& pb); + void create_bounds_update_graph(); + void create_constraint_slack_graph(bool erase_inf_cnst); rmm::cuda_stream main_stream; rmm::cuda_stream act_stream; @@ -221,6 +223,7 @@ class load_balanced_bounds_presolve_t { const load_balanced_problem_t* pb; rmm::device_scalar bounds_changed; + i_t h_bounds_changed; rmm::device_uvector cnst_slack; rmm::device_uvector vars_bnd; diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh index dea2ffe7c5..07a268d9aa 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -157,6 +157,7 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams, { if (num_blocks_heavy_cnst != 0) { auto heavy_cnst_stream = streams.get_stream(); + RAFT_CHECK_CUDA(heavy_cnst_stream); // TODO : Check heavy_cnst_block_segments size for profiling if (!dry_run) { auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); @@ -168,15 +169,18 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams, heavy_degree_cutoff, view, tmp_cnst_act); + RAFT_CHECK_CUDA(heavy_cnst_stream); auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; if (erase_inf_cnst) { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } else { finalize_calc_act_kernel <<>>( heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view); + RAFT_CHECK_CUDA(heavy_cnst_stream); } } } @@ -202,9 +206,11 @@ void calc_activity_per_block(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } else { lb_calc_act_block_kernel <<>>(cnst_id_beg, view); + RAFT_CHECK_CUDA(block_stream); } } } @@ -261,9 +267,11 @@ void calc_activity_sub_warp(managed_stream_pool& streams, if (erase_inf_cnst) { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } else { lb_calc_act_sub_warp_kernel <<>>(cnst_id_beg, cnst_id_end, view); + RAFT_CHECK_CUDA(sub_warp_thread); } } } @@ -304,10 +312,12 @@ void calc_activity_sub_warp(managed_stream_pool& streams, lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } else { lb_calc_act_sub_warp_kernel <<>>( view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets)); + RAFT_CHECK_CUDA(sub_warp_stream); } } } @@ -359,44 +369,310 @@ void calc_activity_sub_warp(managed_stream_pool& streams, } } -/// BOUNDS UPDATE +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 32; + auto cnst_per_block = block_dim / threads_per_constraint; + auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end); -template -void upd_bounds_heavy_vars(managed_stream_pool& streams, - bounds_update_view_t view, - raft::device_span tmp_vars_bnd, - const rmm::device_uvector& heavy_vars_vertex_ids, - const rmm::device_uvector& heavy_vars_pseudo_block_ids, - const rmm::device_uvector& heavy_vars_block_segments, - const std::vector& vars_bin_offsets, - i_t heavy_degree_cutoff, - i_t num_blocks_heavy_vars, - bool dry_run = false) + auto block_count = raft::ceildiv(cnst_id_end - cnst_id_beg, cnst_per_block); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + void* kernelArgs[] = {&cnst_id_beg, &cnst_id_end, &view}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t degree, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) { - if (num_blocks_heavy_vars != 0) { - auto heavy_vars_stream = streams.get_stream(); - // TODO : Check heavy_vars_block_segments size for profiling - if (!dry_run) { - auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); - lb_upd_bnd_heavy_kernel - <<>>( - heavy_vars_beg_id, - make_span(heavy_vars_vertex_ids), - make_span(heavy_vars_pseudo_block_ids), - heavy_degree_cutoff, - view, - tmp_vars_bnd); - auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id; - finalize_upd_bnd_kernel<<>>( - heavy_vars_beg_id, make_span(heavy_vars_block_segments), tmp_vars_bnd, view); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, degree, degree, cnst_bin_offsets, erase_inf_cnst); +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + bool erase_inf_cnst) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(cnst_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t act_sub_warp_node; + auto warp_cnst_offsets_span = make_span(warp_cnst_offsets); + auto warp_cnst_id_offsets_span = make_span(warp_cnst_id_offsets); + + void* kernelArgs[] = {&view, &warp_cnst_offsets_span, &warp_cnst_id_offsets_span}; + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_sub_warp_kernel; + } + + cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_sub_warp(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + bool is_cnst_sub_warp_single_bin, + i_t cnst_sub_warp_count, + rmm::device_uvector& warp_cnst_offsets, + rmm::device_uvector& warp_cnst_id_offsets, + const std::vector& cnst_bin_offsets, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 4, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 2, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, cnst_bin_offsets, erase_inf_cnst); + } else { + if (is_cnst_sub_warp_single_bin) { + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 64, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 32, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst); + create_activity_sub_warp( + act_graph, set_bounds_changed_node, view, 1, 4, cnst_bin_offsets, erase_inf_cnst); + } else { + create_activity_sub_warp(act_graph, + set_bounds_changed_node, + view, + cnst_sub_warp_count, + warp_cnst_offsets, + warp_cnst_id_offsets, + erase_inf_cnst); + } + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t degree_beg, + i_t degree_end, + bool erase_inf_cnst) +{ + static_assert(block_dim <= 1024, "Cannot launch kernel with more than 1024 threads"); + + auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end); + + auto block_count = cnst_id_end - cnst_id_beg; + if (block_count > 0) { + cudaGraphNode_t act_block_node; + void* kernelArgs[] = {&cnst_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; + } else { + kernelNodeParams.func = + (void*)lb_calc_act_block_kernel; } + + cudaGraphAddKernelNode(&act_block_node, act_graph, NULL, 0, &kernelNodeParams); + cudaGraphAddDependencies(act_graph, &act_block_node, &set_bounds_changed_node, 1); + } +} + +template +void create_activity_per_block(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + bool erase_inf_cnst) +{ + if (view.nnz < 10000) { + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 32, 32, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 64, 64, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 128, erase_inf_cnst); + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 256, 256, erase_inf_cnst); + } else { + //[1024, heavy_degree_cutoff/2] -> 1024 block size + create_activity_per_block(act_graph, + set_bounds_changed_node, + view, + cnst_bin_offsets, + 1024, + heavy_degree_cutoff / 2, + erase_inf_cnst); + //[512, 512] -> 128 block size + create_activity_per_block( + act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 512, erase_inf_cnst); } } +template +void create_activity_heavy_cnst(cudaGraph_t act_graph, + cudaGraphNode_t& set_bounds_changed_node, + activity_view_t view, + raft::device_span tmp_cnst_act, + const rmm::device_uvector& heavy_cnst_vertex_ids, + const rmm::device_uvector& heavy_cnst_pseudo_block_ids, + const rmm::device_uvector& heavy_cnst_block_segments, + const std::vector& cnst_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_cnst, + bool erase_inf_cnst, + bool dry_run = false) +{ + if (num_blocks_heavy_cnst != 0) { + cudaGraphNode_t act_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto heavy_cnst_vertex_ids_span = make_span(heavy_cnst_vertex_ids); + auto heavy_cnst_pseudo_block_ids_span = make_span(heavy_cnst_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_cnst_beg_id, + &heavy_cnst_vertex_ids_span, + &heavy_cnst_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_cnst_act}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_calc_act_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&act_heavy_node, act_graph, NULL, 0, &kernelNodeParams); + } + { + auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff); + auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id; + auto heavy_cnst_block_segments_span = make_span(heavy_cnst_block_segments); + + void* kernelArgs[] = { + &heavy_cnst_beg_id, &heavy_cnst_block_segments_span, &tmp_cnst_act, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.gridDim = dim3(num_heavy_cnst, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + if (erase_inf_cnst) { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } else { + kernelNodeParams.func = + (void*)finalize_calc_act_kernel; + } + + cudaGraphAddKernelNode(&finalize_heavy_node, act_graph, NULL, 0, &kernelNodeParams); + } + + cudaGraphAddDependencies(act_graph, &act_heavy_node, &finalize_heavy_node, 1); + cudaGraphAddDependencies(act_graph, &finalize_heavy_node, &set_bounds_changed_node, 1); + } +} + +/// BOUNDS UPDATE + template void upd_bounds_heavy_vars(managed_stream_pool& streams, bounds_update_view_t view, raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, const rmm::device_uvector& heavy_vars_block_segments, const std::vector& vars_bin_offsets, i_t heavy_degree_cutoff, @@ -411,7 +687,8 @@ void upd_bounds_heavy_vars(managed_stream_pool& streams, lb_upd_bnd_heavy_kernel <<>>( heavy_vars_beg_id, - make_span(heavy_vars_block_segments, 1, heavy_vars_block_segments.size()), + make_span(heavy_vars_vertex_ids), + make_span(heavy_vars_pseudo_block_ids), heavy_degree_cutoff, view, tmp_vars_bnd); @@ -556,4 +833,279 @@ void upd_bounds_sub_warp(managed_stream_pool& streams, } } } + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + // cudaGraphNode_t& set_bounds_changed_node, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree_beg, + i_t degree_end, + const std::vector& vars_bin_offsets) +{ + constexpr i_t block_dim = 32; + auto vars_per_block = block_dim / threads_per_variable; + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = raft::ceildiv(vars_id_end - vars_id_beg, vars_per_block); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + void* kernelArgs[] = {&vars_id_beg, &vars_id_end, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + // cudaGraphAddDependencies(upd_graph, &set_bounds_changed_node, &upd_bnd_sub_warp_node, 1); + cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + // cudaGraphNode_t& set_bounds_changed_node, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t degree, + const std::vector& vars_bin_offsets) +{ + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, degree, degree, vars_bin_offsets); +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + // cudaGraphNode_t& set_bounds_changed_node, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets) +{ + constexpr i_t block_dim = 256; + + auto block_count = raft::ceildiv(vars_sub_warp_count * 32, block_dim); + if (block_count != 0) { + cudaGraphNode_t upd_bnd_sub_warp_node; + + auto warp_vars_offsets_span = make_span(warp_vars_offsets); + auto warp_vars_id_offsets_span = make_span(warp_vars_id_offsets); + + void* kernelArgs[] = {&view, &warp_vars_offsets_span, &warp_vars_id_offsets_span}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_sub_warp_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_sub_warp(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + bool is_vars_sub_warp_single_bin, + i_t vars_sub_warp_count, + rmm::device_uvector& warp_vars_offsets, + rmm::device_uvector& warp_vars_id_offsets, + const std::vector& vars_bin_offsets) +{ + if (view.nnz < 10000) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 4, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 2, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, vars_bin_offsets); + } else { + if (is_vars_sub_warp_single_bin) { + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 64, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 32, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 16, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 8, vars_bin_offsets); + create_update_bounds_sub_warp( + upd_graph, bounds_changed_node, view, 1, 4, vars_bin_offsets); + } else { + create_update_bounds_sub_warp(upd_graph, + bounds_changed_node, + view, + vars_sub_warp_count, + warp_vars_offsets, + warp_vars_id_offsets); + } + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t degree_beg, + i_t degree_end) +{ + auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end); + + auto block_count = vars_id_end - vars_id_beg; + if (block_count > 0) { + cudaGraphNode_t upd_bnd_block_node; + + void* kernelArgs[] = {&vars_id_beg, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_block_kernel; + kernelNodeParams.gridDim = dim3(block_count, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_block_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + + cudaGraphAddDependencies(upd_graph, &upd_bnd_block_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + +template +void create_update_bounds_per_block(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff) +{ + if (view.nnz < 10000) { + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 32, 32); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 64, 64); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 128); + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 256, 256); + } else { + //[1024, heavy_degree_cutoff/2] -> 128 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 1024, heavy_degree_cutoff / 2); + //[64, 512] -> 32 block size + create_update_bounds_per_block( + upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 512); + } +} + +template +void create_update_bounds_heavy_vars(cudaGraph_t upd_graph, + cudaGraphNode_t& bounds_changed_node, + bounds_update_view_t view, + raft::device_span tmp_vars_bnd, + const rmm::device_uvector& heavy_vars_vertex_ids, + const rmm::device_uvector& heavy_vars_pseudo_block_ids, + const rmm::device_uvector& heavy_vars_block_segments, + const std::vector& vars_bin_offsets, + i_t heavy_degree_cutoff, + i_t num_blocks_heavy_vars) +{ + if (num_blocks_heavy_vars != 0) { + cudaGraphNode_t upd_bnd_heavy_node; + cudaGraphNode_t finalize_heavy_node; + // Add heavy kernel + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto heavy_vars_vertex_ids_span = make_span(heavy_vars_vertex_ids); + auto heavy_vars_pseudo_block_ids_span = make_span(heavy_vars_pseudo_block_ids); + i_t work_per_block = heavy_degree_cutoff; + + void* kernelArgs[] = {&heavy_vars_beg_id, + &heavy_vars_vertex_ids_span, + &heavy_vars_pseudo_block_ids_span, + &work_per_block, + &view, + &tmp_vars_bnd}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = + (void*)lb_upd_bnd_heavy_kernel; + kernelNodeParams.gridDim = dim3(num_blocks_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(block_dim, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&upd_bnd_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + // Add finalize + { + auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff); + auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id; + auto heavy_vars_block_segments_span = make_span(heavy_vars_block_segments); + + void* kernelArgs[] = { + &heavy_vars_beg_id, &heavy_vars_block_segments_span, &tmp_vars_bnd, &view}; + + cudaKernelNodeParams kernelNodeParams = {0}; + + kernelNodeParams.func = (void*)finalize_upd_bnd_kernel; + kernelNodeParams.gridDim = dim3(num_heavy_vars, 1, 1); + kernelNodeParams.blockDim = dim3(32, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + kernelNodeParams.kernelParams = (void**)kernelArgs; + kernelNodeParams.extra = NULL; + + cudaGraphAddKernelNode(&finalize_heavy_node, upd_graph, NULL, 0, &kernelNodeParams); + RAFT_CUDA_TRY(cudaGetLastError()); + } + cudaGraphAddDependencies(upd_graph, &upd_bnd_heavy_node, &finalize_heavy_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + cudaGraphAddDependencies(upd_graph, &finalize_heavy_node, &bounds_changed_node, 1); + RAFT_CUDA_TRY(cudaGetLastError()); + } +} + } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh index 328fa25b93..a8f4c75008 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh @@ -79,42 +79,23 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, activity_view_t view, raft::device_span tmp_cnst_act) { + // if (pseudo_block_ids.size() <= blockIdx.x) { + // printf("oob pseudo_block_id %d %d\n", blockIdx.x, int(pseudo_block_ids.size())); + // } + // if (ids.size() <= blockIdx.x) { + // printf("oob ids\n"); + // } + // if (tmp_cnst_act.size() <= blockIdx.x) { + // printf("oob tmp_cnst_act\n"); + // } auto idx = ids[blockIdx.x] + id_range_beg; auto pseudo_block_id = pseudo_block_ids[blockIdx.x]; - i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; - i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - auto act = calc_act(view, threadIdx.x, item_off_beg, item_off_end); - - act.x = BlockReduce(temp_storage).Sum(act.x); - __syncthreads(); - act.y = BlockReduce(temp_storage).Sum(act.y); - - // don't subtract constraint bounds yet - // to be done in post processing in finalize_calc_act_kernel - if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; } -} - -template -__global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, - raft::device_span item_block_segments, - i_t work_per_block, - activity_view_t view, - raft::device_span tmp_cnst_act) -{ - __shared__ i_t id_map; - __shared__ i_t pseudo_block_id; - if (threadIdx.x == 0) { - id_map = thrust::upper_bound( - thrust::seq, item_block_segments.begin(), item_block_segments.end(), blockIdx.x) - - item_block_segments.begin(); - pseudo_block_id = blockIdx.x - item_block_segments[id_map - 1]; - } - __syncthreads(); - auto idx = id_range_beg + id_map; + // if (view.offsets.size() <= idx) { + // printf("oob offset\n"); + // } + // if (view.offsets.size() <= idx + 1) { + // printf("oob offset + 1\n"); + // } i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); @@ -677,4 +658,77 @@ __global__ void lb_upd_bnd_sub_warp_kernel(bounds_update_view_t view, } } +#if 0 +template +__device__ void upd_bnd_block(i_t prior_blocks_in_seg, + i_t id_range_beg, bounds_update_view_t view) +{ + //i_t idx = id_range_beg + blockIdx.x; + i_t idx = id_beg_seg + prior_blocks_in_seg * (BDIM/PSEUDO_BDIM) + (threadIdx.x / PSEUDO_BDIM); + i_t var_idx = view.vars_reorg_ids[idx]; + // x is lb, y is ub + auto old_bounds = view.vars_bnd[var_idx]; + bool is_int = (view.vars_types[idx] == var_t::INTEGER); + i_t item_off_beg = view.offsets[idx]; + i_t item_off_end = view.offsets[idx + 1]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + // if it is a set variable then don't propagate the bound + // consider continuous vars as set if their bounds cross or equal + if (old_bounds.x + view.tolerances.integrality_tolerance >= old_bounds.y) { return; } + auto bounds = + update_bounds(view, threadIdx.x, item_off_beg, item_off_end, old_bounds); + + bounds.x = BlockReduce(temp_storage).Reduce(bounds.x, cuda::maximum()); + __syncthreads(); + bounds.y = BlockReduce(temp_storage).Reduce(bounds.y, cuda::minimum()); + + if (threadIdx.x == 0) { + write_updated_bounds(&view.vars_bnd[var_idx], is_int, view, bounds, old_bounds); + } +} + +template +__device__ void upd_bnd_sub_warp(bounds_update_view_t view, + raft::device_span warp_vars_offsets, + raft::device_span warp_vars_id_offsets) +{ + i_t id_warp_beg, id_range_end, threads_per_variable; + detect_range_sub_warp( + &id_warp_beg, &id_range_end, &threads_per_variable, warp_vars_offsets, warp_vars_id_offsets); + + if (threads_per_variable == 1) { + upd_bnd_sub_warp(id_warp_beg, id_range_end, view); + } else if (threads_per_variable == 2) { + upd_bnd_sub_warp(id_warp_beg, id_range_end, view); + } else if (threads_per_variable == 4) { + upd_bnd_sub_warp(id_warp_beg, id_range_end, view); + } else if (threads_per_variable == 8) { + upd_bnd_sub_warp(id_warp_beg, id_range_end, view); + } else if (threads_per_variable == 16) { + upd_bnd_sub_warp(id_warp_beg, id_range_end, view); + } +} + +template +__global__ void lb_upd_bnd_kernel(bounds_update_view_t view, + raft::device_span warp_vars_offsets, + raft::device_span warp_vars_id_offsets, + raft::device_span block_vars_offsets, + raft::device_span block_vars_id_offsets) +{ + if (blockIdx.x < sub_warp_blocks_end) { + upd_bnd_sub_warp(view, warp_vars_offsets, warp_vars_id_offsets); + } else if (blockIdx.x < block_vars_offsets[1]) { + upd_bnd_block<64, BDIM>(view, blockIdx.x - block_vars_offsets[0], block_vars_offsets[0], block_vars_offsets[1]); + } else if (blockIdx.x < block_vars_offsets[2]) { + upd_bnd_block<256, BDIM>(view, blockIdx.x - block_vars_offsets[1], block_vars_offsets[1], block_vars_offsets[2]); + } else { + upd_bnd_heavy<512>(heavy_vars_beg_id, heavy_vars_vertex_ids, heavy_vars_pseudo_block_ids, heavy_degree_cutoff, view, tmp_bnd); + } +} +#endif + } // namespace cuopt::linear_programming::detail diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt index b9fd249a56..18a4270ca4 100644 --- a/cpp/tests/mip/CMakeLists.txt +++ b/cpp/tests/mip/CMakeLists.txt @@ -27,6 +27,9 @@ ConfigureTest(ELIM_VAR_REMAP_TEST ConfigureTest(STANDARDIZATION_TEST ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu ) +ConfigureTest(LB_TEST + ${CMAKE_CURRENT_SOURCE_DIR}/lb_test.cu +) ConfigureTest(MULTI_PROBE_TEST ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu ) diff --git a/cpp/tests/mip/lb_test.cu b/cpp/tests/mip/lb_test.cu new file mode 100644 index 0000000000..5b5a4f1f05 --- /dev/null +++ b/cpp/tests/mip/lb_test.cu @@ -0,0 +1,188 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights + * reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linear_programming/utilities/pdlp_test_utilities.cuh" +#include "mip_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cuopt::linear_programming::test { + +inline auto make_async() { return std::make_shared(); } + +void init_handler(const raft::handle_t* handle_ptr) +{ + // Init cuBlas / cuSparse context here to avoid having it during solving time + RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode( + handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode( + handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream())); +} + +std::tuple, std::vector, std::vector> select_k_random( + detail::problem_t& problem, int sample_size) +{ + auto seed = std::random_device{}(); + std::cerr << "Tested with seed " << seed << "\n"; + problem.compute_n_integer_vars(); + auto v_lb = host_copy(problem.variable_lower_bounds); + auto v_ub = host_copy(problem.variable_upper_bounds); + auto int_var_id = host_copy(problem.integer_indices); + int_var_id.erase(std::remove_if(int_var_id.begin(), + int_var_id.end(), + [v_lb, v_ub](auto id) { + return !(std::isfinite(v_lb[id]) && std::isfinite(v_ub[id])); + }), + int_var_id.end()); + sample_size = std::min(sample_size, static_cast(int_var_id.size())); + std::vector random_int_vars; + std::mt19937 m{seed}; + std::sample( + int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m); + std::vector probe_0(sample_size); + std::vector probe_1(sample_size); + for (int i = 0; i < static_cast(random_int_vars.size()); ++i) { + if (i % 2) { + probe_0[i] = v_lb[random_int_vars[i]]; + probe_1[i] = v_ub[random_int_vars[i]]; + } else { + probe_1[i] = v_lb[random_int_vars[i]]; + probe_0[i] = v_ub[random_int_vars[i]]; + } + } + return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1)); +} + +std::pair>, std::vector>> +convert_probe_tuple(std::tuple, std::vector, std::vector>& probe) +{ + std::vector> probe_first; + std::vector> probe_second; + for (size_t i = 0; i < std::get<0>(probe).size(); ++i) { + probe_first.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<1>(probe)[i])); + probe_second.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<2>(probe)[i])); + } + return std::make_pair(std::move(probe_first), std::move(probe_second)); +} + +std::tuple, std::vector, std::vector, std::vector> +bounds_probe_results(detail::bound_presolve_t& bnd_prb_0, + detail::bound_presolve_t& bnd_prb_1, + detail::problem_t& problem, + const std::pair>, + std::vector>>& probe) +{ + auto& probe_first = std::get<0>(probe); + auto& probe_second = std::get<1>(probe); + rmm::device_uvector b_lb_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_0(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_lb_1(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub_1(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb_0.solve(problem, probe_first); + bnd_prb_0.set_updated_bounds(problem.handle_ptr, make_span(b_lb_0), make_span(b_ub_0)); + bnd_prb_1.solve(problem, probe_second); + bnd_prb_1.set_updated_bounds(problem.handle_ptr, make_span(b_lb_1), make_span(b_ub_1)); + + auto h_lb_0 = host_copy(b_lb_0); + auto h_ub_0 = host_copy(b_ub_0); + auto h_lb_1 = host_copy(b_lb_1); + auto h_ub_1 = host_copy(b_ub_1); + return std::make_tuple( + std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1)); +} + +void test_multi_probe(std::string path) +{ + auto memory_resource = make_async(); + rmm::mr::set_current_device_resource(memory_resource.get()); + const raft::handle_t handle_{}; + cuopt::mps_parser::mps_data_model_t mps_problem = + cuopt::mps_parser::parse_mps(path, false); + handle_.sync_stream(); + auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem); + problem_checking_t::check_problem_representation(op_problem); + detail::problem_t problem(op_problem); + mip_solver_settings_t default_settings{}; + detail::pdhg_solver_t pdhg_solver(problem.handle_ptr, problem); + detail::pdlp_initial_scaling_strategy_t scaling(&handle_, + problem, + 10, + 1.0, + pdhg_solver, + problem.reverse_coefficients, + problem.reverse_offsets, + problem.reverse_constraints, + true); + detail::mip_solver_t solver(problem, default_settings, scaling, cuopt::timer_t(0)); + detail::load_balanced_problem_t lb_problem(problem); + detail::load_balanced_bounds_presolve_t lb_prs(lb_problem, solver.context); + + detail::bound_presolve_t bnd_prb(solver.context); + + auto probe_tuple = select_k_random(problem, 100); + auto bounds_probe_vals = convert_probe_tuple(probe_tuple); + { + auto& probe_first = std::get<0>(bounds_probe_vals); + bnd_prb.solve(problem, probe_first); + rmm::device_uvector b_lb(problem.n_variables, problem.handle_ptr->get_stream()); + rmm::device_uvector b_ub(problem.n_variables, problem.handle_ptr->get_stream()); + bnd_prb.set_updated_bounds(problem.handle_ptr, make_span(b_lb), make_span(b_ub)); + + auto h_lb = host_copy(b_lb); + auto h_ub = host_copy(b_ub); + + lb_prs.solve(probe_first); + + auto bnds = host_copy(lb_prs.vars_bnd); + for (int i = 0; i < (int)h_lb.size(); ++i) { + EXPECT_DOUBLE_EQ(bnds[2 * i], h_lb[i]); + EXPECT_DOUBLE_EQ(bnds[2 * i + 1], h_ub[i]); + } + } +} + +TEST(presolve, multi_probe) +{ + std::vector test_instances = { + "mip/50v-10-free-bound.mps", "mip/neos5-free-bound.mps", "mip/neos5.mps"}; + for (const auto& test_instance : test_instances) { + auto path = make_path_absolute(test_instance); + test_multi_probe(path); + } +} + +} // namespace cuopt::linear_programming::test From 18b7202924a8b570e02bd9941cfbb4dcce9d5699 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 30 Jul 2025 17:06:58 -0400 Subject: [PATCH 3/4] cleanup --- .../presolve/load_balanced_bounds_presolve.cu | 3 - .../load_balanced_bounds_presolve_kernels.cuh | 92 +------------------ 2 files changed, 2 insertions(+), 93 deletions(-) diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu index 960913fe10..5259cae014 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu @@ -401,8 +401,6 @@ void load_balanced_bounds_presolve_t::create_bounds_update_graph() warp_vars_id_offsets, pb->vars_bin_offsets); RAFT_CUDA_TRY(cudaGetLastError()); - cudaGraphDebugDotPrint(upd_graph, "/home/aatish/debug_upd_graph", 0); - RAFT_CUDA_TRY(cudaGetLastError()); cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0); RAFT_CUDA_TRY(cudaGetLastError()); } @@ -459,7 +457,6 @@ void load_balanced_bounds_presolve_t::create_constraint_slack_graph(bo warp_cnst_id_offsets, pb->cnst_bin_offsets, erase_inf_cnst); - cudaGraphDebugDotPrint(cnst_slack_graph, "/home/aatish/debug_cnst_slack_graph", 0); if (erase_inf_cnst) { cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0); } else { diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh index a8f4c75008..10089664ad 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh @@ -79,25 +79,10 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg, activity_view_t view, raft::device_span tmp_cnst_act) { - // if (pseudo_block_ids.size() <= blockIdx.x) { - // printf("oob pseudo_block_id %d %d\n", blockIdx.x, int(pseudo_block_ids.size())); - // } - // if (ids.size() <= blockIdx.x) { - // printf("oob ids\n"); - // } - // if (tmp_cnst_act.size() <= blockIdx.x) { - // printf("oob tmp_cnst_act\n"); - // } auto idx = ids[blockIdx.x] + id_range_beg; auto pseudo_block_id = pseudo_block_ids[blockIdx.x]; - // if (view.offsets.size() <= idx) { - // printf("oob offset\n"); - // } - // if (view.offsets.size() <= idx + 1) { - // printf("oob offset + 1\n"); - // } - i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; - i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); + i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id; + i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]); typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -658,77 +643,4 @@ __global__ void lb_upd_bnd_sub_warp_kernel(bounds_update_view_t view, } } -#if 0 -template -__device__ void upd_bnd_block(i_t prior_blocks_in_seg, - i_t id_range_beg, bounds_update_view_t view) -{ - //i_t idx = id_range_beg + blockIdx.x; - i_t idx = id_beg_seg + prior_blocks_in_seg * (BDIM/PSEUDO_BDIM) + (threadIdx.x / PSEUDO_BDIM); - i_t var_idx = view.vars_reorg_ids[idx]; - // x is lb, y is ub - auto old_bounds = view.vars_bnd[var_idx]; - bool is_int = (view.vars_types[idx] == var_t::INTEGER); - i_t item_off_beg = view.offsets[idx]; - i_t item_off_end = view.offsets[idx + 1]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - // if it is a set variable then don't propagate the bound - // consider continuous vars as set if their bounds cross or equal - if (old_bounds.x + view.tolerances.integrality_tolerance >= old_bounds.y) { return; } - auto bounds = - update_bounds(view, threadIdx.x, item_off_beg, item_off_end, old_bounds); - - bounds.x = BlockReduce(temp_storage).Reduce(bounds.x, cuda::maximum()); - __syncthreads(); - bounds.y = BlockReduce(temp_storage).Reduce(bounds.y, cuda::minimum()); - - if (threadIdx.x == 0) { - write_updated_bounds(&view.vars_bnd[var_idx], is_int, view, bounds, old_bounds); - } -} - -template -__device__ void upd_bnd_sub_warp(bounds_update_view_t view, - raft::device_span warp_vars_offsets, - raft::device_span warp_vars_id_offsets) -{ - i_t id_warp_beg, id_range_end, threads_per_variable; - detect_range_sub_warp( - &id_warp_beg, &id_range_end, &threads_per_variable, warp_vars_offsets, warp_vars_id_offsets); - - if (threads_per_variable == 1) { - upd_bnd_sub_warp(id_warp_beg, id_range_end, view); - } else if (threads_per_variable == 2) { - upd_bnd_sub_warp(id_warp_beg, id_range_end, view); - } else if (threads_per_variable == 4) { - upd_bnd_sub_warp(id_warp_beg, id_range_end, view); - } else if (threads_per_variable == 8) { - upd_bnd_sub_warp(id_warp_beg, id_range_end, view); - } else if (threads_per_variable == 16) { - upd_bnd_sub_warp(id_warp_beg, id_range_end, view); - } -} - -template -__global__ void lb_upd_bnd_kernel(bounds_update_view_t view, - raft::device_span warp_vars_offsets, - raft::device_span warp_vars_id_offsets, - raft::device_span block_vars_offsets, - raft::device_span block_vars_id_offsets) -{ - if (blockIdx.x < sub_warp_blocks_end) { - upd_bnd_sub_warp(view, warp_vars_offsets, warp_vars_id_offsets); - } else if (blockIdx.x < block_vars_offsets[1]) { - upd_bnd_block<64, BDIM>(view, blockIdx.x - block_vars_offsets[0], block_vars_offsets[0], block_vars_offsets[1]); - } else if (blockIdx.x < block_vars_offsets[2]) { - upd_bnd_block<256, BDIM>(view, blockIdx.x - block_vars_offsets[1], block_vars_offsets[1], block_vars_offsets[2]); - } else { - upd_bnd_heavy<512>(heavy_vars_beg_id, heavy_vars_vertex_ids, heavy_vars_pseudo_block_ids, heavy_degree_cutoff, view, tmp_bnd); - } -} -#endif - } // namespace cuopt::linear_programming::detail From e6afa2e9dc8f92ba9a71b0728bbc5859f8c35099 Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Wed, 30 Jul 2025 21:51:41 -0400 Subject: [PATCH 4/4] pr review fixes --- cpp/src/mip/presolve/load_balanced_bounds_presolve.cu | 1 - .../mip/presolve/load_balanced_bounds_presolve_helpers.cuh | 4 ---- cpp/tests/mip/CMakeLists.txt | 2 +- cpp/tests/mip/{lb_test.cu => load_balancing_test.cu} | 2 +- 4 files changed, 2 insertions(+), 7 deletions(-) rename cpp/tests/mip/{lb_test.cu => load_balancing_test.cu} (98%) diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu index 5259cae014..4b65de9c21 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu @@ -265,7 +265,6 @@ void load_balanced_bounds_presolve_t::setup( std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4); - // stream.synchronize(); RAFT_CHECK_CUDA(stream); streams.sync_test_all_issued(); diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh index 07a268d9aa..7eb2b41a9d 100644 --- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh +++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh @@ -840,7 +840,6 @@ template void create_update_bounds_sub_warp(cudaGraph_t upd_graph, - // cudaGraphNode_t& set_bounds_changed_node, cudaGraphNode_t& bounds_changed_node, bounds_update_view_t view, i_t degree_beg, @@ -874,7 +873,6 @@ void create_update_bounds_sub_warp(cudaGraph_t upd_graph, cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams); RAFT_CUDA_TRY(cudaGetLastError()); - // cudaGraphAddDependencies(upd_graph, &set_bounds_changed_node, &upd_bnd_sub_warp_node, 1); cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1); RAFT_CUDA_TRY(cudaGetLastError()); } @@ -886,7 +884,6 @@ template void create_update_bounds_sub_warp(cudaGraph_t upd_graph, - // cudaGraphNode_t& set_bounds_changed_node, cudaGraphNode_t& bounds_changed_node, bounds_update_view_t view, i_t degree, @@ -898,7 +895,6 @@ void create_update_bounds_sub_warp(cudaGraph_t upd_graph, template void create_update_bounds_sub_warp(cudaGraph_t upd_graph, - // cudaGraphNode_t& set_bounds_changed_node, cudaGraphNode_t& bounds_changed_node, bounds_update_view_t view, i_t vars_sub_warp_count, diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt index 18a4270ca4..020c537f6a 100644 --- a/cpp/tests/mip/CMakeLists.txt +++ b/cpp/tests/mip/CMakeLists.txt @@ -28,7 +28,7 @@ ConfigureTest(STANDARDIZATION_TEST ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu ) ConfigureTest(LB_TEST - ${CMAKE_CURRENT_SOURCE_DIR}/lb_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/load_balancing_test.cu ) ConfigureTest(MULTI_PROBE_TEST ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu diff --git a/cpp/tests/mip/lb_test.cu b/cpp/tests/mip/load_balancing_test.cu similarity index 98% rename from cpp/tests/mip/lb_test.cu rename to cpp/tests/mip/load_balancing_test.cu index 5b5a4f1f05..deed9ea85a 100644 --- a/cpp/tests/mip/lb_test.cu +++ b/cpp/tests/mip/load_balancing_test.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights * reserved. SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License");