From 8596529e2563ec8dc25bcf7691e2188e5c29db64 Mon Sep 17 00:00:00 2001
From: Kumar Aatish <kaatish@nvidia.com>
Date: Sat, 26 Jul 2025 07:32:06 -0700
Subject: [PATCH 1/4] init

---
 cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
index 53a76536b8..dea2ffe7c5 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
@@ -108,6 +108,7 @@ i_t create_heavy_item_block_segments(rmm::cuda_stream_view stream,
 
   // Inclusive scan so that each block can determine which item it belongs to
   item_block_segments.set_element_to_zero_async(0, stream);
+
   thrust::inclusive_scan(rmm::exec_policy(stream),
                          calc_blocks_per_vertex_iter,
                          calc_blocks_per_vertex_iter + heavy_id_count,

From 509dfe638ce29e6fc52e2de9d3ead4e7d50ea102 Mon Sep 17 00:00:00 2001
From: Kumar Aatish <kaatish@nvidia.com>
Date: Wed, 30 Jul 2025 16:58:01 -0400
Subject: [PATCH 2/4] Manual graph creation for lb bounds update

---
 cpp/src/mip/presolve/bounds_presolve.cu       |   1 +
 cpp/src/mip/presolve/bounds_presolve.cuh      |   1 +
 .../presolve/load_balanced_bounds_presolve.cu | 167 ++++-
 .../load_balanced_bounds_presolve.cuh         |   3 +
 .../load_balanced_bounds_presolve_helpers.cuh | 610 +++++++++++++++++-
 .../load_balanced_bounds_presolve_kernels.cuh | 122 +++-
 cpp/tests/mip/CMakeLists.txt                  |   3 +
 cpp/tests/mip/lb_test.cu                      | 188 ++++++
 8 files changed, 998 insertions(+), 97 deletions(-)
 create mode 100644 cpp/tests/mip/lb_test.cu

diff --git a/cpp/src/mip/presolve/bounds_presolve.cu b/cpp/src/mip/presolve/bounds_presolve.cu
index 45fee622eb..72440cd9a8 100644
--- a/cpp/src/mip/presolve/bounds_presolve.cu
+++ b/cpp/src/mip/presolve/bounds_presolve.cu
@@ -202,6 +202,7 @@ termination_criterion_t bound_presolve_t<i_t, f_t>::bound_update_loop(problem_t<
   }
   pb.handle_ptr->sync_stream();
   calculate_infeasible_redundant_constraints(pb);
+  solve_iter = iter;
 
   return criteria;
 }
diff --git a/cpp/src/mip/presolve/bounds_presolve.cuh b/cpp/src/mip/presolve/bounds_presolve.cuh
index 9a25b05e9c..84853a7812 100644
--- a/cpp/src/mip/presolve/bounds_presolve.cuh
+++ b/cpp/src/mip/presolve/bounds_presolve.cuh
@@ -86,6 +86,7 @@ class bound_presolve_t {
   i_t infeas_constraints_count = 0;
   i_t redund_constraints_count = 0;
   probing_cache_t<i_t, f_t> probing_cache;
+  i_t solve_iter;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
index 091f8a53b5..de8f481901 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
@@ -245,6 +245,7 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::setup(
                                                            heavy_degree_cutoff,
                                                            problem.cnst_bin_offsets,
                                                            problem.offsets);
+  RAFT_CHECK_CUDA(stream_heavy_cnst);
 
   num_blocks_heavy_vars = create_heavy_item_block_segments(stream_heavy_vars,
                                                            heavy_vars_vertex_ids,
@@ -253,49 +254,34 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::setup(
                                                            heavy_degree_cutoff,
                                                            problem.vars_bin_offsets,
                                                            problem.reverse_offsets);
+  RAFT_CHECK_CUDA(stream_heavy_vars);
 
   tmp_act.resize(2 * num_blocks_heavy_cnst, stream_heavy_cnst);
   tmp_bnd.resize(2 * num_blocks_heavy_vars, stream_heavy_vars);
 
-  std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) = sub_warp_meta(
-    streams.get_stream(), warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4);
+  std::tie(is_cnst_sub_warp_single_bin, cnst_sub_warp_count) =
+    sub_warp_meta(stream, warp_cnst_offsets, warp_cnst_id_offsets, pb->cnst_bin_offsets, 4);
 
-  std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) = sub_warp_meta(
-    streams.get_stream(), warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4);
+  std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) =
+    sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4);
 
-  stream.synchronize();
-  streams.sync_all_issued();
+  // stream.synchronize();
+  RAFT_CHECK_CUDA(stream);
+  streams.sync_test_all_issued();
 
   if (!calc_slack_erase_inf_cnst_graph_created) {
-    bool erase_inf_cnst                     = true;
-    calc_slack_erase_inf_cnst_graph_created = build_graph(
-      streams,
-      handle_ptr,
-      calc_slack_erase_inf_cnst_graph,
-      calc_slack_erase_inf_cnst_exec,
-      [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); },
-      [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); });
+    create_constraint_slack_graph(true);
+    calc_slack_erase_inf_cnst_graph_created = true;
   }
 
   if (!calc_slack_graph_created) {
-    bool erase_inf_cnst      = false;
-    calc_slack_graph_created = build_graph(
-      streams,
-      handle_ptr,
-      calc_slack_graph,
-      calc_slack_exec,
-      [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst, true); },
-      [erase_inf_cnst, this]() { this->calculate_activity_graph(erase_inf_cnst); });
+    create_constraint_slack_graph(false);
+    calc_slack_graph_created = true;
   }
 
   if (!upd_bnd_graph_created) {
-    upd_bnd_graph_created = build_graph(
-      streams,
-      handle_ptr,
-      upd_bnd_graph,
-      upd_bnd_exec,
-      [this]() { this->calculate_bounds_update_graph(true); },
-      [this]() { this->calculate_bounds_update_graph(); });
+    create_bounds_update_graph();
+    upd_bnd_graph_created = true;
   }
 }
 
@@ -368,6 +354,119 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::calculate_activity_graph(bool er
                                          dry_run);
 }
 
+template <typename i_t, typename f_t>
+void load_balanced_bounds_presolve_t<i_t, f_t>::create_bounds_update_graph()
+{
+  using f_t2 = typename type_2<f_t>::type;
+  cudaGraph_t upd_graph;
+  cudaGraphCreate(&upd_graph, 0);
+  cudaGraphNode_t bounds_changed_node;
+  {
+    i_t* bounds_changed_ptr = bounds_changed.data();
+
+    cudaMemcpy3DParms memcpyParams = {0};
+    memcpyParams.srcArray          = NULL;
+    memcpyParams.srcPos            = make_cudaPos(0, 0, 0);
+    memcpyParams.srcPtr            = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1);
+    memcpyParams.dstArray          = NULL;
+    memcpyParams.dstPos            = make_cudaPos(0, 0, 0);
+    memcpyParams.dstPtr            = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1);
+    memcpyParams.extent            = make_cudaExtent(sizeof(i_t), 1, 1);
+    memcpyParams.kind              = cudaMemcpyDeviceToHost;
+    cudaGraphAddMemcpyNode(&bounds_changed_node, upd_graph, NULL, 0, &memcpyParams);
+  }
+
+  auto bounds_update_view = get_bounds_update_view(*pb);
+
+  create_update_bounds_heavy_vars<i_t, f_t, f_t2, 640>(upd_graph,
+                                                       bounds_changed_node,
+                                                       bounds_update_view,
+                                                       make_span_2(tmp_bnd),
+                                                       heavy_vars_vertex_ids,
+                                                       heavy_vars_pseudo_block_ids,
+                                                       heavy_vars_block_segments,
+                                                       pb->vars_bin_offsets,
+                                                       heavy_degree_cutoff,
+                                                       num_blocks_heavy_vars);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  create_update_bounds_per_block<i_t, f_t, f_t2>(
+    upd_graph, bounds_changed_node, bounds_update_view, pb->vars_bin_offsets, heavy_degree_cutoff);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  create_update_bounds_sub_warp<i_t, f_t, f_t2>(upd_graph,
+                                                bounds_changed_node,
+                                                bounds_update_view,
+                                                is_vars_sub_warp_single_bin,
+                                                vars_sub_warp_count,
+                                                warp_vars_offsets,
+                                                warp_vars_id_offsets,
+                                                pb->vars_bin_offsets);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  cudaGraphDebugDotPrint(upd_graph, "/home/aatish/debug_upd_graph", 0);
+  RAFT_CUDA_TRY(cudaGetLastError());
+  cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename i_t, typename f_t>
+void load_balanced_bounds_presolve_t<i_t, f_t>::create_constraint_slack_graph(bool erase_inf_cnst)
+{
+  using f_t2 = typename type_2<f_t>::type;
+  cudaGraph_t cnst_slack_graph;
+  cudaGraphCreate(&cnst_slack_graph, 0);
+
+  cudaGraphNode_t set_bounds_changed_node;
+  {
+    // TODO : Investigate why memset node is not captured manually
+    i_t* bounds_changed_ptr = bounds_changed.data();
+
+    cudaMemcpy3DParms memcpyParams = {0};
+    memcpyParams.srcArray          = NULL;
+    memcpyParams.srcPos            = make_cudaPos(0, 0, 0);
+    memcpyParams.srcPtr            = make_cudaPitchedPtr(&h_bounds_changed, sizeof(i_t), 1, 1);
+    memcpyParams.dstArray          = NULL;
+    memcpyParams.dstPos            = make_cudaPos(0, 0, 0);
+    memcpyParams.dstPtr            = make_cudaPitchedPtr(bounds_changed_ptr, sizeof(i_t), 1, 1);
+    memcpyParams.extent            = make_cudaExtent(sizeof(i_t), 1, 1);
+    memcpyParams.kind              = cudaMemcpyHostToDevice;
+    cudaGraphAddMemcpyNode(&set_bounds_changed_node, cnst_slack_graph, NULL, 0, &memcpyParams);
+  }
+
+  auto activity_view = get_activity_view(*pb);
+
+  create_activity_heavy_cnst<i_t, f_t, f_t2, 512>(cnst_slack_graph,
+                                                  set_bounds_changed_node,
+                                                  activity_view,
+                                                  make_span_2(tmp_act),
+                                                  heavy_cnst_vertex_ids,
+                                                  heavy_cnst_pseudo_block_ids,
+                                                  heavy_cnst_block_segments,
+                                                  pb->cnst_bin_offsets,
+                                                  heavy_degree_cutoff,
+                                                  num_blocks_heavy_cnst,
+                                                  erase_inf_cnst);
+  create_activity_per_block<i_t, f_t, f_t2>(cnst_slack_graph,
+                                            set_bounds_changed_node,
+                                            activity_view,
+                                            pb->cnst_bin_offsets,
+                                            heavy_degree_cutoff,
+                                            erase_inf_cnst);
+  create_activity_sub_warp<i_t, f_t, f_t2>(cnst_slack_graph,
+                                           set_bounds_changed_node,
+                                           activity_view,
+                                           is_cnst_sub_warp_single_bin,
+                                           cnst_sub_warp_count,
+                                           warp_cnst_offsets,
+                                           warp_cnst_id_offsets,
+                                           pb->cnst_bin_offsets,
+                                           erase_inf_cnst);
+  cudaGraphDebugDotPrint(cnst_slack_graph, "/home/aatish/debug_cnst_slack_graph", 0);
+  if (erase_inf_cnst) {
+    cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0);
+  } else {
+    cudaGraphInstantiate(&calc_slack_exec, cnst_slack_graph, NULL, NULL, 0);
+  }
+}
+
 template <typename i_t, typename f_t>
 void load_balanced_bounds_presolve_t<i_t, f_t>::calculate_bounds_update_graph(bool dry_run)
 {
@@ -401,12 +500,13 @@ template <typename i_t, typename f_t>
 void load_balanced_bounds_presolve_t<i_t, f_t>::calculate_constraint_slack_iter(
   const raft::handle_t* handle_ptr)
 {
+  // h_bounds_changed is copied to bounds_changed in calc_slack_exec
+  h_bounds_changed = 0;
   {
     // writes nans to constraint activities that are infeasible
     //-> less expensive checks for update bounds step
     raft::common::nvtx::range scope("act_cuda_task_graph");
     cudaGraphLaunch(calc_slack_erase_inf_cnst_exec, handle_ptr->get_stream());
-    handle_ptr->sync_stream();
   }
   infeas_cnst_slack_set_to_nan = true;
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
@@ -416,6 +516,8 @@ template <typename i_t, typename f_t>
 void load_balanced_bounds_presolve_t<i_t, f_t>::calculate_constraint_slack(
   const raft::handle_t* handle_ptr)
 {
+  // h_bounds_changed is copied to bounds_changed in calc_slack_exec
+  h_bounds_changed = 0;
   {
     raft::common::nvtx::range scope("act_cuda_task_graph");
     cudaGraphLaunch(calc_slack_exec, handle_ptr->get_stream());
@@ -428,13 +530,10 @@ template <typename i_t, typename f_t>
 bool load_balanced_bounds_presolve_t<i_t, f_t>::update_bounds_from_slack(
   const raft::handle_t* handle_ptr)
 {
-  i_t h_bounds_changed;
-  bounds_changed.set_value_to_zero_async(handle_ptr->get_stream());
-
+  // bounds_changed is copied to h_bounds_changed in upd_bnd_exec
   {
     raft::common::nvtx::range scope("upd_cuda_task_graph");
     cudaGraphLaunch(upd_bnd_exec, handle_ptr->get_stream());
-    h_bounds_changed = bounds_changed.value(handle_ptr->get_stream());
   }
   RAFT_CHECK_CUDA(handle_ptr->get_stream());
   constexpr i_t zero = 0;
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh
index 19aef04f8c..42736b3a0a 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cuh
@@ -212,6 +212,8 @@ class load_balanced_bounds_presolve_t {
 
   activity_view_t get_activity_view(const load_balanced_problem_t<i_t, f_t>& pb);
   bounds_update_view_t get_bounds_update_view(const load_balanced_problem_t<i_t, f_t>& pb);
+  void create_bounds_update_graph();
+  void create_constraint_slack_graph(bool erase_inf_cnst);
 
   rmm::cuda_stream main_stream;
   rmm::cuda_stream act_stream;
@@ -221,6 +223,7 @@ class load_balanced_bounds_presolve_t {
   const load_balanced_problem_t<i_t, f_t>* pb;
 
   rmm::device_scalar<i_t> bounds_changed;
+  i_t h_bounds_changed;
 
   rmm::device_uvector<f_t> cnst_slack;
   rmm::device_uvector<f_t> vars_bnd;
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
index dea2ffe7c5..07a268d9aa 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
@@ -157,6 +157,7 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams,
 {
   if (num_blocks_heavy_cnst != 0) {
     auto heavy_cnst_stream = streams.get_stream();
+    RAFT_CHECK_CUDA(heavy_cnst_stream);
     // TODO : Check heavy_cnst_block_segments size for profiling
     if (!dry_run) {
       auto heavy_cnst_beg_id = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff);
@@ -168,15 +169,18 @@ void calc_activity_heavy_cnst(managed_stream_pool& streams,
           heavy_degree_cutoff,
           view,
           tmp_cnst_act);
+      RAFT_CHECK_CUDA(heavy_cnst_stream);
       auto num_heavy_cnst = cnst_bin_offsets.back() - heavy_cnst_beg_id;
       if (erase_inf_cnst) {
         finalize_calc_act_kernel<true, i_t, f_t, f_t2>
           <<<num_heavy_cnst, 32, 0, heavy_cnst_stream>>>(
             heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view);
+        RAFT_CHECK_CUDA(heavy_cnst_stream);
       } else {
         finalize_calc_act_kernel<false, i_t, f_t, f_t2>
           <<<num_heavy_cnst, 32, 0, heavy_cnst_stream>>>(
             heavy_cnst_beg_id, make_span(heavy_cnst_block_segments), tmp_cnst_act, view);
+        RAFT_CHECK_CUDA(heavy_cnst_stream);
       }
     }
   }
@@ -202,9 +206,11 @@ void calc_activity_per_block(managed_stream_pool& streams,
       if (erase_inf_cnst) {
         lb_calc_act_block_kernel<true, i_t, f_t, f_t2, block_dim>
           <<<block_count, block_dim, 0, block_stream>>>(cnst_id_beg, view);
+        RAFT_CHECK_CUDA(block_stream);
       } else {
         lb_calc_act_block_kernel<false, i_t, f_t, f_t2, block_dim>
           <<<block_count, block_dim, 0, block_stream>>>(cnst_id_beg, view);
+        RAFT_CHECK_CUDA(block_stream);
       }
     }
   }
@@ -261,9 +267,11 @@ void calc_activity_sub_warp(managed_stream_pool& streams,
       if (erase_inf_cnst) {
         lb_calc_act_sub_warp_kernel<true, i_t, f_t, f_t2, block_dim, threads_per_constraint>
           <<<block_count, block_dim, 0, sub_warp_thread>>>(cnst_id_beg, cnst_id_end, view);
+        RAFT_CHECK_CUDA(sub_warp_thread);
       } else {
         lb_calc_act_sub_warp_kernel<false, i_t, f_t, f_t2, block_dim, threads_per_constraint>
           <<<block_count, block_dim, 0, sub_warp_thread>>>(cnst_id_beg, cnst_id_end, view);
+        RAFT_CHECK_CUDA(sub_warp_thread);
       }
     }
   }
@@ -304,10 +312,12 @@ void calc_activity_sub_warp(managed_stream_pool& streams,
         lb_calc_act_sub_warp_kernel<true, i_t, f_t, f_t2, block_dim>
           <<<block_count, block_dim, 0, sub_warp_stream>>>(
             view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets));
+        RAFT_CHECK_CUDA(sub_warp_stream);
       } else {
         lb_calc_act_sub_warp_kernel<false, i_t, f_t, f_t2, block_dim>
           <<<block_count, block_dim, 0, sub_warp_stream>>>(
             view, make_span(warp_cnst_offsets), make_span(warp_cnst_id_offsets));
+        RAFT_CHECK_CUDA(sub_warp_stream);
       }
     }
   }
@@ -359,44 +369,310 @@ void calc_activity_sub_warp(managed_stream_pool& streams,
   }
 }
 
-/// BOUNDS UPDATE
+template <typename i_t,
+          typename f_t,
+          typename f_t2,
+          i_t threads_per_constraint,
+          typename activity_view_t>
+void create_activity_sub_warp(cudaGraph_t act_graph,
+                              cudaGraphNode_t& set_bounds_changed_node,
+                              activity_view_t view,
+                              i_t degree_beg,
+                              i_t degree_end,
+                              const std::vector<i_t>& cnst_bin_offsets,
+                              bool erase_inf_cnst)
+{
+  constexpr i_t block_dim         = 32;
+  auto cnst_per_block             = block_dim / threads_per_constraint;
+  auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end);
 
-template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename bounds_update_view_t>
-void upd_bounds_heavy_vars(managed_stream_pool& streams,
-                           bounds_update_view_t view,
-                           raft::device_span<f_t2> tmp_vars_bnd,
-                           const rmm::device_uvector<i_t>& heavy_vars_vertex_ids,
-                           const rmm::device_uvector<i_t>& heavy_vars_pseudo_block_ids,
-                           const rmm::device_uvector<i_t>& heavy_vars_block_segments,
-                           const std::vector<i_t>& vars_bin_offsets,
-                           i_t heavy_degree_cutoff,
-                           i_t num_blocks_heavy_vars,
-                           bool dry_run = false)
+  auto block_count = raft::ceildiv<i_t>(cnst_id_end - cnst_id_beg, cnst_per_block);
+  if (block_count != 0) {
+    cudaGraphNode_t act_sub_warp_node;
+    void* kernelArgs[]                    = {&cnst_id_beg, &cnst_id_end, &view};
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+    if (erase_inf_cnst) {
+      kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel<true,
+                                                                 i_t,
+                                                                 f_t,
+                                                                 f_t2,
+                                                                 block_dim,
+                                                                 threads_per_constraint,
+                                                                 activity_view_t>;
+    } else {
+      kernelNodeParams.func = (void*)lb_calc_act_sub_warp_kernel<false,
+                                                                 i_t,
+                                                                 f_t,
+                                                                 f_t2,
+                                                                 block_dim,
+                                                                 threads_per_constraint,
+                                                                 activity_view_t>;
+    }
+
+    cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams);
+    cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1);
+  }
+}
+
+template <typename i_t,
+          typename f_t,
+          typename f_t2,
+          i_t threads_per_constraint,
+          typename activity_view_t>
+void create_activity_sub_warp(cudaGraph_t act_graph,
+                              cudaGraphNode_t& set_bounds_changed_node,
+                              activity_view_t view,
+                              i_t degree,
+                              const std::vector<i_t>& cnst_bin_offsets,
+                              bool erase_inf_cnst)
 {
-  if (num_blocks_heavy_vars != 0) {
-    auto heavy_vars_stream = streams.get_stream();
-    // TODO : Check heavy_vars_block_segments size for profiling
-    if (!dry_run) {
-      auto heavy_vars_beg_id = get_id_offset(vars_bin_offsets, heavy_degree_cutoff);
-      lb_upd_bnd_heavy_kernel<i_t, f_t, f_t2, block_dim>
-        <<<num_blocks_heavy_vars, block_dim, 0, heavy_vars_stream>>>(
-          heavy_vars_beg_id,
-          make_span(heavy_vars_vertex_ids),
-          make_span(heavy_vars_pseudo_block_ids),
-          heavy_degree_cutoff,
-          view,
-          tmp_vars_bnd);
-      auto num_heavy_vars = vars_bin_offsets.back() - heavy_vars_beg_id;
-      finalize_upd_bnd_kernel<i_t, f_t, f_t2><<<num_heavy_vars, 32, 0, heavy_vars_stream>>>(
-        heavy_vars_beg_id, make_span(heavy_vars_block_segments), tmp_vars_bnd, view);
+  create_activity_sub_warp<i_t, f_t, f_t2, threads_per_constraint>(
+    act_graph, set_bounds_changed_node, view, degree, degree, cnst_bin_offsets, erase_inf_cnst);
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename activity_view_t>
+void create_activity_sub_warp(cudaGraph_t act_graph,
+                              cudaGraphNode_t& set_bounds_changed_node,
+                              activity_view_t view,
+                              i_t cnst_sub_warp_count,
+                              rmm::device_uvector<i_t>& warp_cnst_offsets,
+                              rmm::device_uvector<i_t>& warp_cnst_id_offsets,
+                              bool erase_inf_cnst)
+{
+  constexpr i_t block_dim = 256;
+
+  auto block_count = raft::ceildiv<i_t>(cnst_sub_warp_count * 32, block_dim);
+  if (block_count != 0) {
+    cudaGraphNode_t act_sub_warp_node;
+    auto warp_cnst_offsets_span    = make_span(warp_cnst_offsets);
+    auto warp_cnst_id_offsets_span = make_span(warp_cnst_id_offsets);
+
+    void* kernelArgs[] = {&view, &warp_cnst_offsets_span, &warp_cnst_id_offsets_span};
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    if (erase_inf_cnst) {
+      kernelNodeParams.func =
+        (void*)lb_calc_act_sub_warp_kernel<true, i_t, f_t, f_t2, block_dim, activity_view_t>;
+    } else {
+      kernelNodeParams.func =
+        (void*)lb_calc_act_sub_warp_kernel<false, i_t, f_t, f_t2, block_dim, activity_view_t>;
+    }
+
+    cudaGraphAddKernelNode(&act_sub_warp_node, act_graph, NULL, 0, &kernelNodeParams);
+    cudaGraphAddDependencies(act_graph, &act_sub_warp_node, &set_bounds_changed_node, 1);
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename activity_view_t>
+void create_activity_sub_warp(cudaGraph_t act_graph,
+                              cudaGraphNode_t& set_bounds_changed_node,
+                              activity_view_t view,
+                              bool is_cnst_sub_warp_single_bin,
+                              i_t cnst_sub_warp_count,
+                              rmm::device_uvector<i_t>& warp_cnst_offsets,
+                              rmm::device_uvector<i_t>& warp_cnst_id_offsets,
+                              const std::vector<i_t>& cnst_bin_offsets,
+                              bool erase_inf_cnst)
+{
+  if (view.nnz < 10000) {
+    create_activity_sub_warp<i_t, f_t, f_t2, 16>(
+      act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst);
+    create_activity_sub_warp<i_t, f_t, f_t2, 8>(
+      act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst);
+    create_activity_sub_warp<i_t, f_t, f_t2, 4>(
+      act_graph, set_bounds_changed_node, view, 4, cnst_bin_offsets, erase_inf_cnst);
+    create_activity_sub_warp<i_t, f_t, f_t2, 2>(
+      act_graph, set_bounds_changed_node, view, 2, cnst_bin_offsets, erase_inf_cnst);
+    create_activity_sub_warp<i_t, f_t, f_t2, 1>(
+      act_graph, set_bounds_changed_node, view, 1, cnst_bin_offsets, erase_inf_cnst);
+  } else {
+    if (is_cnst_sub_warp_single_bin) {
+      create_activity_sub_warp<i_t, f_t, f_t2, 16>(
+        act_graph, set_bounds_changed_node, view, 64, cnst_bin_offsets, erase_inf_cnst);
+      create_activity_sub_warp<i_t, f_t, f_t2, 8>(
+        act_graph, set_bounds_changed_node, view, 32, cnst_bin_offsets, erase_inf_cnst);
+      create_activity_sub_warp<i_t, f_t, f_t2, 4>(
+        act_graph, set_bounds_changed_node, view, 16, cnst_bin_offsets, erase_inf_cnst);
+      create_activity_sub_warp<i_t, f_t, f_t2, 2>(
+        act_graph, set_bounds_changed_node, view, 8, cnst_bin_offsets, erase_inf_cnst);
+      create_activity_sub_warp<i_t, f_t, f_t2, 1>(
+        act_graph, set_bounds_changed_node, view, 1, 4, cnst_bin_offsets, erase_inf_cnst);
+    } else {
+      create_activity_sub_warp<i_t, f_t, f_t2>(act_graph,
+                                               set_bounds_changed_node,
+                                               view,
+                                               cnst_sub_warp_count,
+                                               warp_cnst_offsets,
+                                               warp_cnst_id_offsets,
+                                               erase_inf_cnst);
+    }
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename activity_view_t>
+void create_activity_per_block(cudaGraph_t act_graph,
+                               cudaGraphNode_t& set_bounds_changed_node,
+                               activity_view_t view,
+                               const std::vector<i_t>& cnst_bin_offsets,
+                               i_t degree_beg,
+                               i_t degree_end,
+                               bool erase_inf_cnst)
+{
+  static_assert(block_dim <= 1024, "Cannot launch kernel with more than 1024 threads");
+
+  auto [cnst_id_beg, cnst_id_end] = get_id_range(cnst_bin_offsets, degree_beg, degree_end);
+
+  auto block_count = cnst_id_end - cnst_id_beg;
+  if (block_count > 0) {
+    cudaGraphNode_t act_block_node;
+    void* kernelArgs[] = {&cnst_id_beg, &view};
+
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+    if (erase_inf_cnst) {
+      kernelNodeParams.func =
+        (void*)lb_calc_act_block_kernel<true, i_t, f_t, f_t2, block_dim, activity_view_t>;
+    } else {
+      kernelNodeParams.func =
+        (void*)lb_calc_act_block_kernel<false, i_t, f_t, f_t2, block_dim, activity_view_t>;
     }
+
+    cudaGraphAddKernelNode(&act_block_node, act_graph, NULL, 0, &kernelNodeParams);
+    cudaGraphAddDependencies(act_graph, &act_block_node, &set_bounds_changed_node, 1);
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename activity_view_t>
+void create_activity_per_block(cudaGraph_t act_graph,
+                               cudaGraphNode_t& set_bounds_changed_node,
+                               activity_view_t view,
+                               const std::vector<i_t>& cnst_bin_offsets,
+                               i_t heavy_degree_cutoff,
+                               bool erase_inf_cnst)
+{
+  if (view.nnz < 10000) {
+    create_activity_per_block<i_t, f_t, f_t2, 32>(
+      act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 32, 32, erase_inf_cnst);
+    create_activity_per_block<i_t, f_t, f_t2, 64>(
+      act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 64, 64, erase_inf_cnst);
+    create_activity_per_block<i_t, f_t, f_t2, 128>(
+      act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 128, erase_inf_cnst);
+    create_activity_per_block<i_t, f_t, f_t2, 256>(
+      act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 256, 256, erase_inf_cnst);
+  } else {
+    //[1024, heavy_degree_cutoff/2] -> 1024 block size
+    create_activity_per_block<i_t, f_t, f_t2, 1024>(act_graph,
+                                                    set_bounds_changed_node,
+                                                    view,
+                                                    cnst_bin_offsets,
+                                                    1024,
+                                                    heavy_degree_cutoff / 2,
+                                                    erase_inf_cnst);
+    //[512, 512] -> 128 block size
+    create_activity_per_block<i_t, f_t, f_t2, 128>(
+      act_graph, set_bounds_changed_node, view, cnst_bin_offsets, 128, 512, erase_inf_cnst);
   }
 }
 
+template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename activity_view_t>
+void create_activity_heavy_cnst(cudaGraph_t act_graph,
+                                cudaGraphNode_t& set_bounds_changed_node,
+                                activity_view_t view,
+                                raft::device_span<f_t2> tmp_cnst_act,
+                                const rmm::device_uvector<i_t>& heavy_cnst_vertex_ids,
+                                const rmm::device_uvector<i_t>& heavy_cnst_pseudo_block_ids,
+                                const rmm::device_uvector<i_t>& heavy_cnst_block_segments,
+                                const std::vector<i_t>& cnst_bin_offsets,
+                                i_t heavy_degree_cutoff,
+                                i_t num_blocks_heavy_cnst,
+                                bool erase_inf_cnst,
+                                bool dry_run = false)
+{
+  if (num_blocks_heavy_cnst != 0) {
+    cudaGraphNode_t act_heavy_node;
+    cudaGraphNode_t finalize_heavy_node;
+    // Add heavy kernel
+    {
+      auto heavy_cnst_beg_id                = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff);
+      auto heavy_cnst_vertex_ids_span       = make_span(heavy_cnst_vertex_ids);
+      auto heavy_cnst_pseudo_block_ids_span = make_span(heavy_cnst_pseudo_block_ids);
+      i_t work_per_block                    = heavy_degree_cutoff;
+
+      void* kernelArgs[] = {&heavy_cnst_beg_id,
+                            &heavy_cnst_vertex_ids_span,
+                            &heavy_cnst_pseudo_block_ids_span,
+                            &work_per_block,
+                            &view,
+                            &tmp_cnst_act};
+
+      cudaKernelNodeParams kernelNodeParams = {0};
+
+      kernelNodeParams.func =
+        (void*)lb_calc_act_heavy_kernel<i_t, f_t, f_t2, block_dim, activity_view_t>;
+      kernelNodeParams.gridDim        = dim3(num_blocks_heavy_cnst, 1, 1);
+      kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+      kernelNodeParams.sharedMemBytes = 0;
+      kernelNodeParams.kernelParams   = (void**)kernelArgs;
+      kernelNodeParams.extra          = NULL;
+
+      cudaGraphAddKernelNode(&act_heavy_node, act_graph, NULL, 0, &kernelNodeParams);
+    }
+    {
+      auto heavy_cnst_beg_id              = get_id_offset(cnst_bin_offsets, heavy_degree_cutoff);
+      auto num_heavy_cnst                 = cnst_bin_offsets.back() - heavy_cnst_beg_id;
+      auto heavy_cnst_block_segments_span = make_span(heavy_cnst_block_segments);
+
+      void* kernelArgs[] = {
+        &heavy_cnst_beg_id, &heavy_cnst_block_segments_span, &tmp_cnst_act, &view};
+
+      cudaKernelNodeParams kernelNodeParams = {0};
+
+      kernelNodeParams.gridDim        = dim3(num_heavy_cnst, 1, 1);
+      kernelNodeParams.blockDim       = dim3(32, 1, 1);
+      kernelNodeParams.sharedMemBytes = 0;
+      kernelNodeParams.kernelParams   = (void**)kernelArgs;
+      kernelNodeParams.extra          = NULL;
+      if (erase_inf_cnst) {
+        kernelNodeParams.func =
+          (void*)finalize_calc_act_kernel<true, i_t, f_t, f_t2, activity_view_t>;
+      } else {
+        kernelNodeParams.func =
+          (void*)finalize_calc_act_kernel<false, i_t, f_t, f_t2, activity_view_t>;
+      }
+
+      cudaGraphAddKernelNode(&finalize_heavy_node, act_graph, NULL, 0, &kernelNodeParams);
+    }
+
+    cudaGraphAddDependencies(act_graph, &act_heavy_node, &finalize_heavy_node, 1);
+    cudaGraphAddDependencies(act_graph, &finalize_heavy_node, &set_bounds_changed_node, 1);
+  }
+}
+
+/// BOUNDS UPDATE
+
 template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename bounds_update_view_t>
 void upd_bounds_heavy_vars(managed_stream_pool& streams,
                            bounds_update_view_t view,
                            raft::device_span<f_t2> tmp_vars_bnd,
+                           const rmm::device_uvector<i_t>& heavy_vars_vertex_ids,
+                           const rmm::device_uvector<i_t>& heavy_vars_pseudo_block_ids,
                            const rmm::device_uvector<i_t>& heavy_vars_block_segments,
                            const std::vector<i_t>& vars_bin_offsets,
                            i_t heavy_degree_cutoff,
@@ -411,7 +687,8 @@ void upd_bounds_heavy_vars(managed_stream_pool& streams,
       lb_upd_bnd_heavy_kernel<i_t, f_t, f_t2, block_dim>
         <<<num_blocks_heavy_vars, block_dim, 0, heavy_vars_stream>>>(
           heavy_vars_beg_id,
-          make_span(heavy_vars_block_segments, 1, heavy_vars_block_segments.size()),
+          make_span(heavy_vars_vertex_ids),
+          make_span(heavy_vars_pseudo_block_ids),
           heavy_degree_cutoff,
           view,
           tmp_vars_bnd);
@@ -556,4 +833,279 @@ void upd_bounds_sub_warp(managed_stream_pool& streams,
     }
   }
 }
+
+template <typename i_t,
+          typename f_t,
+          typename f_t2,
+          i_t threads_per_variable,
+          typename bounds_update_view_t>
+void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
+                                   // cudaGraphNode_t& set_bounds_changed_node,
+                                   cudaGraphNode_t& bounds_changed_node,
+                                   bounds_update_view_t view,
+                                   i_t degree_beg,
+                                   i_t degree_end,
+                                   const std::vector<i_t>& vars_bin_offsets)
+{
+  constexpr i_t block_dim         = 32;
+  auto vars_per_block             = block_dim / threads_per_variable;
+  auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end);
+
+  auto block_count = raft::ceildiv<i_t>(vars_id_end - vars_id_beg, vars_per_block);
+  if (block_count != 0) {
+    cudaGraphNode_t upd_bnd_sub_warp_node;
+
+    void* kernelArgs[] = {&vars_id_beg, &vars_id_end, &view};
+
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.func           = (void*)lb_upd_bnd_sub_warp_kernel<i_t,
+                                                                        f_t,
+                                                                        f_t2,
+                                                                        block_dim,
+                                                                        threads_per_variable,
+                                                                        bounds_update_view_t>;
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams);
+    RAFT_CUDA_TRY(cudaGetLastError());
+
+    // cudaGraphAddDependencies(upd_graph, &set_bounds_changed_node, &upd_bnd_sub_warp_node, 1);
+    cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+template <typename i_t,
+          typename f_t,
+          typename f_t2,
+          i_t threads_per_variable,
+          typename bounds_update_view_t>
+void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
+                                   // cudaGraphNode_t& set_bounds_changed_node,
+                                   cudaGraphNode_t& bounds_changed_node,
+                                   bounds_update_view_t view,
+                                   i_t degree,
+                                   const std::vector<i_t>& vars_bin_offsets)
+{
+  create_update_bounds_sub_warp<i_t, f_t, f_t2, threads_per_variable>(
+    upd_graph, bounds_changed_node, view, degree, degree, vars_bin_offsets);
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename bounds_update_view_t>
+void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
+                                   // cudaGraphNode_t& set_bounds_changed_node,
+                                   cudaGraphNode_t& bounds_changed_node,
+                                   bounds_update_view_t view,
+                                   i_t vars_sub_warp_count,
+                                   rmm::device_uvector<i_t>& warp_vars_offsets,
+                                   rmm::device_uvector<i_t>& warp_vars_id_offsets)
+{
+  constexpr i_t block_dim = 256;
+
+  auto block_count = raft::ceildiv<i_t>(vars_sub_warp_count * 32, block_dim);
+  if (block_count != 0) {
+    cudaGraphNode_t upd_bnd_sub_warp_node;
+
+    auto warp_vars_offsets_span    = make_span(warp_vars_offsets);
+    auto warp_vars_id_offsets_span = make_span(warp_vars_id_offsets);
+
+    void* kernelArgs[] = {&view, &warp_vars_offsets_span, &warp_vars_id_offsets_span};
+
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.func =
+      (void*)lb_upd_bnd_sub_warp_kernel<i_t, f_t, f_t2, block_dim, bounds_update_view_t>;
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams);
+    RAFT_CUDA_TRY(cudaGetLastError());
+
+    cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename bounds_update_view_t>
+void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
+                                   cudaGraphNode_t& bounds_changed_node,
+                                   bounds_update_view_t view,
+                                   bool is_vars_sub_warp_single_bin,
+                                   i_t vars_sub_warp_count,
+                                   rmm::device_uvector<i_t>& warp_vars_offsets,
+                                   rmm::device_uvector<i_t>& warp_vars_id_offsets,
+                                   const std::vector<i_t>& vars_bin_offsets)
+{
+  if (view.nnz < 10000) {
+    create_update_bounds_sub_warp<i_t, f_t, f_t2, 16>(
+      upd_graph, bounds_changed_node, view, 16, vars_bin_offsets);
+    create_update_bounds_sub_warp<i_t, f_t, f_t2, 8>(
+      upd_graph, bounds_changed_node, view, 8, vars_bin_offsets);
+    create_update_bounds_sub_warp<i_t, f_t, f_t2, 4>(
+      upd_graph, bounds_changed_node, view, 4, vars_bin_offsets);
+    create_update_bounds_sub_warp<i_t, f_t, f_t2, 2>(
+      upd_graph, bounds_changed_node, view, 2, vars_bin_offsets);
+    create_update_bounds_sub_warp<i_t, f_t, f_t2, 1>(
+      upd_graph, bounds_changed_node, view, 1, vars_bin_offsets);
+  } else {
+    if (is_vars_sub_warp_single_bin) {
+      create_update_bounds_sub_warp<i_t, f_t, f_t2, 16>(
+        upd_graph, bounds_changed_node, view, 64, vars_bin_offsets);
+      create_update_bounds_sub_warp<i_t, f_t, f_t2, 8>(
+        upd_graph, bounds_changed_node, view, 32, vars_bin_offsets);
+      create_update_bounds_sub_warp<i_t, f_t, f_t2, 4>(
+        upd_graph, bounds_changed_node, view, 16, vars_bin_offsets);
+      create_update_bounds_sub_warp<i_t, f_t, f_t2, 2>(
+        upd_graph, bounds_changed_node, view, 8, vars_bin_offsets);
+      create_update_bounds_sub_warp<i_t, f_t, f_t2, 1>(
+        upd_graph, bounds_changed_node, view, 1, 4, vars_bin_offsets);
+    } else {
+      create_update_bounds_sub_warp<i_t, f_t, f_t2>(upd_graph,
+                                                    bounds_changed_node,
+                                                    view,
+                                                    vars_sub_warp_count,
+                                                    warp_vars_offsets,
+                                                    warp_vars_id_offsets);
+    }
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename bounds_update_view_t>
+void create_update_bounds_per_block(cudaGraph_t upd_graph,
+                                    cudaGraphNode_t& bounds_changed_node,
+                                    bounds_update_view_t view,
+                                    const std::vector<i_t>& vars_bin_offsets,
+                                    i_t degree_beg,
+                                    i_t degree_end)
+{
+  auto [vars_id_beg, vars_id_end] = get_id_range(vars_bin_offsets, degree_beg, degree_end);
+
+  auto block_count = vars_id_end - vars_id_beg;
+  if (block_count > 0) {
+    cudaGraphNode_t upd_bnd_block_node;
+
+    void* kernelArgs[] = {&vars_id_beg, &view};
+
+    cudaKernelNodeParams kernelNodeParams = {0};
+
+    kernelNodeParams.func =
+      (void*)lb_upd_bnd_block_kernel<i_t, f_t, f_t2, block_dim, bounds_update_view_t>;
+    kernelNodeParams.gridDim        = dim3(block_count, 1, 1);
+    kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void**)kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    cudaGraphAddKernelNode(&upd_bnd_block_node, upd_graph, NULL, 0, &kernelNodeParams);
+    RAFT_CUDA_TRY(cudaGetLastError());
+
+    cudaGraphAddDependencies(upd_graph, &upd_bnd_block_node, &bounds_changed_node, 1);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, typename bounds_update_view_t>
+void create_update_bounds_per_block(cudaGraph_t upd_graph,
+                                    cudaGraphNode_t& bounds_changed_node,
+                                    bounds_update_view_t view,
+                                    const std::vector<i_t>& vars_bin_offsets,
+                                    i_t heavy_degree_cutoff)
+{
+  if (view.nnz < 10000) {
+    create_update_bounds_per_block<i_t, f_t, f_t2, 32>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 32, 32);
+    create_update_bounds_per_block<i_t, f_t, f_t2, 64>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 64, 64);
+    create_update_bounds_per_block<i_t, f_t, f_t2, 128>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 128);
+    create_update_bounds_per_block<i_t, f_t, f_t2, 256>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 256, 256);
+  } else {
+    //[1024, heavy_degree_cutoff/2] -> 128 block size
+    create_update_bounds_per_block<i_t, f_t, f_t2, 256>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 1024, heavy_degree_cutoff / 2);
+    //[64, 512] -> 32 block size
+    create_update_bounds_per_block<i_t, f_t, f_t2, 64>(
+      upd_graph, bounds_changed_node, view, vars_bin_offsets, 128, 512);
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, i_t block_dim, typename bounds_update_view_t>
+void create_update_bounds_heavy_vars(cudaGraph_t upd_graph,
+                                     cudaGraphNode_t& bounds_changed_node,
+                                     bounds_update_view_t view,
+                                     raft::device_span<f_t2> tmp_vars_bnd,
+                                     const rmm::device_uvector<i_t>& heavy_vars_vertex_ids,
+                                     const rmm::device_uvector<i_t>& heavy_vars_pseudo_block_ids,
+                                     const rmm::device_uvector<i_t>& heavy_vars_block_segments,
+                                     const std::vector<i_t>& vars_bin_offsets,
+                                     i_t heavy_degree_cutoff,
+                                     i_t num_blocks_heavy_vars)
+{
+  if (num_blocks_heavy_vars != 0) {
+    cudaGraphNode_t upd_bnd_heavy_node;
+    cudaGraphNode_t finalize_heavy_node;
+    // Add heavy kernel
+    {
+      auto heavy_vars_beg_id                = get_id_offset(vars_bin_offsets, heavy_degree_cutoff);
+      auto heavy_vars_vertex_ids_span       = make_span(heavy_vars_vertex_ids);
+      auto heavy_vars_pseudo_block_ids_span = make_span(heavy_vars_pseudo_block_ids);
+      i_t work_per_block                    = heavy_degree_cutoff;
+
+      void* kernelArgs[] = {&heavy_vars_beg_id,
+                            &heavy_vars_vertex_ids_span,
+                            &heavy_vars_pseudo_block_ids_span,
+                            &work_per_block,
+                            &view,
+                            &tmp_vars_bnd};
+
+      cudaKernelNodeParams kernelNodeParams = {0};
+
+      kernelNodeParams.func =
+        (void*)lb_upd_bnd_heavy_kernel<i_t, f_t, f_t2, block_dim, bounds_update_view_t>;
+      kernelNodeParams.gridDim        = dim3(num_blocks_heavy_vars, 1, 1);
+      kernelNodeParams.blockDim       = dim3(block_dim, 1, 1);
+      kernelNodeParams.sharedMemBytes = 0;
+      kernelNodeParams.kernelParams   = (void**)kernelArgs;
+      kernelNodeParams.extra          = NULL;
+
+      cudaGraphAddKernelNode(&upd_bnd_heavy_node, upd_graph, NULL, 0, &kernelNodeParams);
+      RAFT_CUDA_TRY(cudaGetLastError());
+    }
+    // Add finalize
+    {
+      auto heavy_vars_beg_id              = get_id_offset(vars_bin_offsets, heavy_degree_cutoff);
+      auto num_heavy_vars                 = vars_bin_offsets.back() - heavy_vars_beg_id;
+      auto heavy_vars_block_segments_span = make_span(heavy_vars_block_segments);
+
+      void* kernelArgs[] = {
+        &heavy_vars_beg_id, &heavy_vars_block_segments_span, &tmp_vars_bnd, &view};
+
+      cudaKernelNodeParams kernelNodeParams = {0};
+
+      kernelNodeParams.func = (void*)finalize_upd_bnd_kernel<i_t, f_t, f_t2, bounds_update_view_t>;
+      kernelNodeParams.gridDim        = dim3(num_heavy_vars, 1, 1);
+      kernelNodeParams.blockDim       = dim3(32, 1, 1);
+      kernelNodeParams.sharedMemBytes = 0;
+      kernelNodeParams.kernelParams   = (void**)kernelArgs;
+      kernelNodeParams.extra          = NULL;
+
+      cudaGraphAddKernelNode(&finalize_heavy_node, upd_graph, NULL, 0, &kernelNodeParams);
+      RAFT_CUDA_TRY(cudaGetLastError());
+    }
+    cudaGraphAddDependencies(upd_graph, &upd_bnd_heavy_node, &finalize_heavy_node, 1);
+    RAFT_CUDA_TRY(cudaGetLastError());
+    cudaGraphAddDependencies(upd_graph, &finalize_heavy_node, &bounds_changed_node, 1);
+    RAFT_CUDA_TRY(cudaGetLastError());
+  }
+}
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
index 328fa25b93..a8f4c75008 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
@@ -79,42 +79,23 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg,
                                          activity_view_t view,
                                          raft::device_span<f_t2> tmp_cnst_act)
 {
+  // if (pseudo_block_ids.size() <= blockIdx.x) {
+  //   printf("oob pseudo_block_id %d %d\n", blockIdx.x, int(pseudo_block_ids.size()));
+  // }
+  // if (ids.size() <= blockIdx.x) {
+  //   printf("oob ids\n");
+  // }
+  // if (tmp_cnst_act.size() <= blockIdx.x) {
+  //   printf("oob tmp_cnst_act\n");
+  // }
   auto idx             = ids[blockIdx.x] + id_range_beg;
   auto pseudo_block_id = pseudo_block_ids[blockIdx.x];
-  i_t item_off_beg     = view.offsets[idx] + work_per_block * pseudo_block_id;
-  i_t item_off_end     = min(item_off_beg + work_per_block, view.offsets[idx + 1]);
-
-  typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  auto act = calc_act<i_t, f_t, f_t2, BDIM>(view, threadIdx.x, item_off_beg, item_off_end);
-
-  act.x = BlockReduce(temp_storage).Sum(act.x);
-  __syncthreads();
-  act.y = BlockReduce(temp_storage).Sum(act.y);
-
-  // don't subtract constraint bounds yet
-  // to be done in post processing in finalize_calc_act_kernel
-  if (threadIdx.x == 0) { tmp_cnst_act[blockIdx.x] = act; }
-}
-
-template <typename i_t, typename f_t, typename f_t2, i_t BDIM, typename activity_view_t>
-__global__ void lb_calc_act_heavy_kernel(i_t id_range_beg,
-                                         raft::device_span<const i_t> item_block_segments,
-                                         i_t work_per_block,
-                                         activity_view_t view,
-                                         raft::device_span<f_t2> tmp_cnst_act)
-{
-  __shared__ i_t id_map;
-  __shared__ i_t pseudo_block_id;
-  if (threadIdx.x == 0) {
-    id_map = thrust::upper_bound(
-               thrust::seq, item_block_segments.begin(), item_block_segments.end(), blockIdx.x) -
-             item_block_segments.begin();
-    pseudo_block_id = blockIdx.x - item_block_segments[id_map - 1];
-  }
-  __syncthreads();
-  auto idx         = id_range_beg + id_map;
+  // if (view.offsets.size() <= idx) {
+  //   printf("oob offset\n");
+  // }
+  // if (view.offsets.size() <= idx + 1) {
+  //   printf("oob offset + 1\n");
+  // }
   i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id;
   i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]);
 
@@ -677,4 +658,77 @@ __global__ void lb_upd_bnd_sub_warp_kernel(bounds_update_view_t view,
   }
 }
 
+#if 0
+template <typename i_t, typename f_t, typename f_t2, i_t PSEUDO_BDIM, i_t BDIM, typename bounds_update_view_t>
+__device__ void upd_bnd_block(i_t prior_blocks_in_seg,
+                              i_t id_range_beg, bounds_update_view_t view)
+{
+  //i_t idx     = id_range_beg + blockIdx.x;
+  i_t idx     = id_beg_seg + prior_blocks_in_seg * (BDIM/PSEUDO_BDIM) + (threadIdx.x / PSEUDO_BDIM);
+  i_t var_idx = view.vars_reorg_ids[idx];
+  // x is lb, y is ub
+  auto old_bounds  = view.vars_bnd[var_idx];
+  bool is_int      = (view.vars_types[idx] == var_t::INTEGER);
+  i_t item_off_beg = view.offsets[idx];
+  i_t item_off_end = view.offsets[idx + 1];
+
+  typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // if it is a set variable then don't propagate the bound
+  // consider continuous vars as set if their bounds cross or equal
+  if (old_bounds.x + view.tolerances.integrality_tolerance >= old_bounds.y) { return; }
+  auto bounds =
+    update_bounds<i_t, f_t, f_t2, BDIM>(view, threadIdx.x, item_off_beg, item_off_end, old_bounds);
+
+  bounds.x = BlockReduce(temp_storage).Reduce(bounds.x, cuda::maximum());
+  __syncthreads();
+  bounds.y = BlockReduce(temp_storage).Reduce(bounds.y, cuda::minimum());
+
+  if (threadIdx.x == 0) {
+    write_updated_bounds(&view.vars_bnd[var_idx], is_int, view, bounds, old_bounds);
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, i_t BDIM, typename bounds_update_view_t>
+__device__ void upd_bnd_sub_warp(bounds_update_view_t view,
+                                           raft::device_span<i_t> warp_vars_offsets,
+                                           raft::device_span<i_t> warp_vars_id_offsets)
+{
+  i_t id_warp_beg, id_range_end, threads_per_variable;
+  detect_range_sub_warp<i_t>(
+    &id_warp_beg, &id_range_end, &threads_per_variable, warp_vars_offsets, warp_vars_id_offsets);
+
+  if (threads_per_variable == 1) {
+    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 1>(id_warp_beg, id_range_end, view);
+  } else if (threads_per_variable == 2) {
+    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 2>(id_warp_beg, id_range_end, view);
+  } else if (threads_per_variable == 4) {
+    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 4>(id_warp_beg, id_range_end, view);
+  } else if (threads_per_variable == 8) {
+    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 8>(id_warp_beg, id_range_end, view);
+  } else if (threads_per_variable == 16) {
+    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 16>(id_warp_beg, id_range_end, view);
+  }
+}
+
+template <typename i_t, typename f_t, typename f_t2, i_t BDIM, typename bounds_update_view_t>
+__global__ void lb_upd_bnd_kernel(bounds_update_view_t view,
+                                  raft::device_span<i_t> warp_vars_offsets,
+                                  raft::device_span<i_t> warp_vars_id_offsets,
+                                  raft::device_span<i_t> block_vars_offsets,
+                                  raft::device_span<i_t> block_vars_id_offsets)
+{
+  if (blockIdx.x < sub_warp_blocks_end) {
+    upd_bnd_sub_warp(view, warp_vars_offsets, warp_vars_id_offsets);
+  } else if (blockIdx.x < block_vars_offsets[1]) {
+    upd_bnd_block<64, BDIM>(view, blockIdx.x - block_vars_offsets[0], block_vars_offsets[0], block_vars_offsets[1]);
+  } else if (blockIdx.x < block_vars_offsets[2]) {
+    upd_bnd_block<256, BDIM>(view, blockIdx.x - block_vars_offsets[1], block_vars_offsets[1], block_vars_offsets[2]);
+  } else {
+    upd_bnd_heavy<512>(heavy_vars_beg_id, heavy_vars_vertex_ids, heavy_vars_pseudo_block_ids, heavy_degree_cutoff, view, tmp_bnd);
+  }
+}
+#endif
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt
index b9fd249a56..18a4270ca4 100644
--- a/cpp/tests/mip/CMakeLists.txt
+++ b/cpp/tests/mip/CMakeLists.txt
@@ -27,6 +27,9 @@ ConfigureTest(ELIM_VAR_REMAP_TEST
 ConfigureTest(STANDARDIZATION_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu
 )
+ConfigureTest(LB_TEST
+    ${CMAKE_CURRENT_SOURCE_DIR}/lb_test.cu
+)
 ConfigureTest(MULTI_PROBE_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu
 )
diff --git a/cpp/tests/mip/lb_test.cu b/cpp/tests/mip/lb_test.cu
new file mode 100644
index 0000000000..5b5a4f1f05
--- /dev/null
+++ b/cpp/tests/mip/lb_test.cu
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../linear_programming/utilities/pdlp_test_utilities.cuh"
+#include "mip_utils.cuh"
+
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <linear_programming/initial_scaling_strategy/initial_scaling.cuh>
+#include <linear_programming/utilities/problem_checking.cuh>
+#include <mip/presolve/bounds_presolve.cuh>
+#include <mip/presolve/load_balanced_bounds_presolve.cuh>
+#include <mip/problem/load_balanced_problem.cuh>
+#include <mps_parser/parser.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+#include <utilities/common_utils.hpp>
+#include <utilities/error.hpp>
+#include <utilities/timer.hpp>
+
+#include <rmm/mr/device/cuda_async_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::test {
+
+inline auto make_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
+
+void init_handler(const raft::handle_t* handle_ptr)
+{
+  // Init cuBlas / cuSparse context here to avoid having it during solving time
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublassetpointermode(
+    handle_ptr->get_cublas_handle(), CUBLAS_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsesetpointermode(
+    handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
+}
+
+std::tuple<std::vector<int>, std::vector<double>, std::vector<double>> select_k_random(
+  detail::problem_t<int, double>& problem, int sample_size)
+{
+  auto seed = std::random_device{}();
+  std::cerr << "Tested with seed " << seed << "\n";
+  problem.compute_n_integer_vars();
+  auto v_lb       = host_copy(problem.variable_lower_bounds);
+  auto v_ub       = host_copy(problem.variable_upper_bounds);
+  auto int_var_id = host_copy(problem.integer_indices);
+  int_var_id.erase(std::remove_if(int_var_id.begin(),
+                                  int_var_id.end(),
+                                  [v_lb, v_ub](auto id) {
+                                    return !(std::isfinite(v_lb[id]) && std::isfinite(v_ub[id]));
+                                  }),
+                   int_var_id.end());
+  sample_size = std::min(sample_size, static_cast<int>(int_var_id.size()));
+  std::vector<int> random_int_vars;
+  std::mt19937 m{seed};
+  std::sample(
+    int_var_id.begin(), int_var_id.end(), std::back_inserter(random_int_vars), sample_size, m);
+  std::vector<double> probe_0(sample_size);
+  std::vector<double> probe_1(sample_size);
+  for (int i = 0; i < static_cast<int>(random_int_vars.size()); ++i) {
+    if (i % 2) {
+      probe_0[i] = v_lb[random_int_vars[i]];
+      probe_1[i] = v_ub[random_int_vars[i]];
+    } else {
+      probe_1[i] = v_lb[random_int_vars[i]];
+      probe_0[i] = v_ub[random_int_vars[i]];
+    }
+  }
+  return std::make_tuple(std::move(random_int_vars), std::move(probe_0), std::move(probe_1));
+}
+
+std::pair<std::vector<thrust::pair<int, double>>, std::vector<thrust::pair<int, double>>>
+convert_probe_tuple(std::tuple<std::vector<int>, std::vector<double>, std::vector<double>>& probe)
+{
+  std::vector<thrust::pair<int, double>> probe_first;
+  std::vector<thrust::pair<int, double>> probe_second;
+  for (size_t i = 0; i < std::get<0>(probe).size(); ++i) {
+    probe_first.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<1>(probe)[i]));
+    probe_second.emplace_back(thrust::make_pair(std::get<0>(probe)[i], std::get<2>(probe)[i]));
+  }
+  return std::make_pair(std::move(probe_first), std::move(probe_second));
+}
+
+std::tuple<std::vector<double>, std::vector<double>, std::vector<double>, std::vector<double>>
+bounds_probe_results(detail::bound_presolve_t<int, double>& bnd_prb_0,
+                     detail::bound_presolve_t<int, double>& bnd_prb_1,
+                     detail::problem_t<int, double>& problem,
+                     const std::pair<std::vector<thrust::pair<int, double>>,
+                                     std::vector<thrust::pair<int, double>>>& probe)
+{
+  auto& probe_first  = std::get<0>(probe);
+  auto& probe_second = std::get<1>(probe);
+  rmm::device_uvector<double> b_lb_0(problem.n_variables, problem.handle_ptr->get_stream());
+  rmm::device_uvector<double> b_ub_0(problem.n_variables, problem.handle_ptr->get_stream());
+  rmm::device_uvector<double> b_lb_1(problem.n_variables, problem.handle_ptr->get_stream());
+  rmm::device_uvector<double> b_ub_1(problem.n_variables, problem.handle_ptr->get_stream());
+  bnd_prb_0.solve(problem, probe_first);
+  bnd_prb_0.set_updated_bounds(problem.handle_ptr, make_span(b_lb_0), make_span(b_ub_0));
+  bnd_prb_1.solve(problem, probe_second);
+  bnd_prb_1.set_updated_bounds(problem.handle_ptr, make_span(b_lb_1), make_span(b_ub_1));
+
+  auto h_lb_0 = host_copy(b_lb_0);
+  auto h_ub_0 = host_copy(b_ub_0);
+  auto h_lb_1 = host_copy(b_lb_1);
+  auto h_ub_1 = host_copy(b_ub_1);
+  return std::make_tuple(
+    std::move(h_lb_0), std::move(h_ub_0), std::move(h_lb_1), std::move(h_ub_1));
+}
+
+void test_multi_probe(std::string path)
+{
+  auto memory_resource = make_async();
+  rmm::mr::set_current_device_resource(memory_resource.get());
+  const raft::handle_t handle_{};
+  cuopt::mps_parser::mps_data_model_t<int, double> mps_problem =
+    cuopt::mps_parser::parse_mps<int, double>(path, false);
+  handle_.sync_stream();
+  auto op_problem = mps_data_model_to_optimization_problem(&handle_, mps_problem);
+  problem_checking_t<int, double>::check_problem_representation(op_problem);
+  detail::problem_t<int, double> problem(op_problem);
+  mip_solver_settings_t<int, double> default_settings{};
+  detail::pdhg_solver_t<int, double> pdhg_solver(problem.handle_ptr, problem);
+  detail::pdlp_initial_scaling_strategy_t<int, double> scaling(&handle_,
+                                                               problem,
+                                                               10,
+                                                               1.0,
+                                                               pdhg_solver,
+                                                               problem.reverse_coefficients,
+                                                               problem.reverse_offsets,
+                                                               problem.reverse_constraints,
+                                                               true);
+  detail::mip_solver_t<int, double> solver(problem, default_settings, scaling, cuopt::timer_t(0));
+  detail::load_balanced_problem_t<int, double> lb_problem(problem);
+  detail::load_balanced_bounds_presolve_t<int, double> lb_prs(lb_problem, solver.context);
+
+  detail::bound_presolve_t<int, double> bnd_prb(solver.context);
+
+  auto probe_tuple       = select_k_random(problem, 100);
+  auto bounds_probe_vals = convert_probe_tuple(probe_tuple);
+  {
+    auto& probe_first = std::get<0>(bounds_probe_vals);
+    bnd_prb.solve(problem, probe_first);
+    rmm::device_uvector<double> b_lb(problem.n_variables, problem.handle_ptr->get_stream());
+    rmm::device_uvector<double> b_ub(problem.n_variables, problem.handle_ptr->get_stream());
+    bnd_prb.set_updated_bounds(problem.handle_ptr, make_span(b_lb), make_span(b_ub));
+
+    auto h_lb = host_copy(b_lb);
+    auto h_ub = host_copy(b_ub);
+
+    lb_prs.solve(probe_first);
+
+    auto bnds = host_copy(lb_prs.vars_bnd);
+    for (int i = 0; i < (int)h_lb.size(); ++i) {
+      EXPECT_DOUBLE_EQ(bnds[2 * i], h_lb[i]);
+      EXPECT_DOUBLE_EQ(bnds[2 * i + 1], h_ub[i]);
+    }
+  }
+}
+
+TEST(presolve, multi_probe)
+{
+  std::vector<std::string> test_instances = {
+    "mip/50v-10-free-bound.mps", "mip/neos5-free-bound.mps", "mip/neos5.mps"};
+  for (const auto& test_instance : test_instances) {
+    auto path = make_path_absolute(test_instance);
+    test_multi_probe(path);
+  }
+}
+
+}  // namespace cuopt::linear_programming::test

From 18b7202924a8b570e02bd9941cfbb4dcce9d5699 Mon Sep 17 00:00:00 2001
From: Kumar Aatish <kaatish@nvidia.com>
Date: Wed, 30 Jul 2025 17:06:58 -0400
Subject: [PATCH 3/4] cleanup

---
 .../presolve/load_balanced_bounds_presolve.cu |  3 -
 .../load_balanced_bounds_presolve_kernels.cuh | 92 +------------------
 2 files changed, 2 insertions(+), 93 deletions(-)

diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
index 960913fe10..5259cae014 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
@@ -401,8 +401,6 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::create_bounds_update_graph()
                                                 warp_vars_id_offsets,
                                                 pb->vars_bin_offsets);
   RAFT_CUDA_TRY(cudaGetLastError());
-  cudaGraphDebugDotPrint(upd_graph, "/home/aatish/debug_upd_graph", 0);
-  RAFT_CUDA_TRY(cudaGetLastError());
   cudaGraphInstantiate(&upd_bnd_exec, upd_graph, NULL, NULL, 0);
   RAFT_CUDA_TRY(cudaGetLastError());
 }
@@ -459,7 +457,6 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::create_constraint_slack_graph(bo
                                            warp_cnst_id_offsets,
                                            pb->cnst_bin_offsets,
                                            erase_inf_cnst);
-  cudaGraphDebugDotPrint(cnst_slack_graph, "/home/aatish/debug_cnst_slack_graph", 0);
   if (erase_inf_cnst) {
     cudaGraphInstantiate(&calc_slack_erase_inf_cnst_exec, cnst_slack_graph, NULL, NULL, 0);
   } else {
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
index a8f4c75008..10089664ad 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_kernels.cuh
@@ -79,25 +79,10 @@ __global__ void lb_calc_act_heavy_kernel(i_t id_range_beg,
                                          activity_view_t view,
                                          raft::device_span<f_t2> tmp_cnst_act)
 {
-  // if (pseudo_block_ids.size() <= blockIdx.x) {
-  //   printf("oob pseudo_block_id %d %d\n", blockIdx.x, int(pseudo_block_ids.size()));
-  // }
-  // if (ids.size() <= blockIdx.x) {
-  //   printf("oob ids\n");
-  // }
-  // if (tmp_cnst_act.size() <= blockIdx.x) {
-  //   printf("oob tmp_cnst_act\n");
-  // }
   auto idx             = ids[blockIdx.x] + id_range_beg;
   auto pseudo_block_id = pseudo_block_ids[blockIdx.x];
-  // if (view.offsets.size() <= idx) {
-  //   printf("oob offset\n");
-  // }
-  // if (view.offsets.size() <= idx + 1) {
-  //   printf("oob offset + 1\n");
-  // }
-  i_t item_off_beg = view.offsets[idx] + work_per_block * pseudo_block_id;
-  i_t item_off_end = min(item_off_beg + work_per_block, view.offsets[idx + 1]);
+  i_t item_off_beg     = view.offsets[idx] + work_per_block * pseudo_block_id;
+  i_t item_off_end     = min(item_off_beg + work_per_block, view.offsets[idx + 1]);
 
   typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -658,77 +643,4 @@ __global__ void lb_upd_bnd_sub_warp_kernel(bounds_update_view_t view,
   }
 }
 
-#if 0
-template <typename i_t, typename f_t, typename f_t2, i_t PSEUDO_BDIM, i_t BDIM, typename bounds_update_view_t>
-__device__ void upd_bnd_block(i_t prior_blocks_in_seg,
-                              i_t id_range_beg, bounds_update_view_t view)
-{
-  //i_t idx     = id_range_beg + blockIdx.x;
-  i_t idx     = id_beg_seg + prior_blocks_in_seg * (BDIM/PSEUDO_BDIM) + (threadIdx.x / PSEUDO_BDIM);
-  i_t var_idx = view.vars_reorg_ids[idx];
-  // x is lb, y is ub
-  auto old_bounds  = view.vars_bnd[var_idx];
-  bool is_int      = (view.vars_types[idx] == var_t::INTEGER);
-  i_t item_off_beg = view.offsets[idx];
-  i_t item_off_end = view.offsets[idx + 1];
-
-  typedef cub::BlockReduce<f_t, BDIM> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  // if it is a set variable then don't propagate the bound
-  // consider continuous vars as set if their bounds cross or equal
-  if (old_bounds.x + view.tolerances.integrality_tolerance >= old_bounds.y) { return; }
-  auto bounds =
-    update_bounds<i_t, f_t, f_t2, BDIM>(view, threadIdx.x, item_off_beg, item_off_end, old_bounds);
-
-  bounds.x = BlockReduce(temp_storage).Reduce(bounds.x, cuda::maximum());
-  __syncthreads();
-  bounds.y = BlockReduce(temp_storage).Reduce(bounds.y, cuda::minimum());
-
-  if (threadIdx.x == 0) {
-    write_updated_bounds(&view.vars_bnd[var_idx], is_int, view, bounds, old_bounds);
-  }
-}
-
-template <typename i_t, typename f_t, typename f_t2, i_t BDIM, typename bounds_update_view_t>
-__device__ void upd_bnd_sub_warp(bounds_update_view_t view,
-                                           raft::device_span<i_t> warp_vars_offsets,
-                                           raft::device_span<i_t> warp_vars_id_offsets)
-{
-  i_t id_warp_beg, id_range_end, threads_per_variable;
-  detect_range_sub_warp<i_t>(
-    &id_warp_beg, &id_range_end, &threads_per_variable, warp_vars_offsets, warp_vars_id_offsets);
-
-  if (threads_per_variable == 1) {
-    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 1>(id_warp_beg, id_range_end, view);
-  } else if (threads_per_variable == 2) {
-    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 2>(id_warp_beg, id_range_end, view);
-  } else if (threads_per_variable == 4) {
-    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 4>(id_warp_beg, id_range_end, view);
-  } else if (threads_per_variable == 8) {
-    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 8>(id_warp_beg, id_range_end, view);
-  } else if (threads_per_variable == 16) {
-    upd_bnd_sub_warp<i_t, f_t, f_t2, BDIM, 16>(id_warp_beg, id_range_end, view);
-  }
-}
-
-template <typename i_t, typename f_t, typename f_t2, i_t BDIM, typename bounds_update_view_t>
-__global__ void lb_upd_bnd_kernel(bounds_update_view_t view,
-                                  raft::device_span<i_t> warp_vars_offsets,
-                                  raft::device_span<i_t> warp_vars_id_offsets,
-                                  raft::device_span<i_t> block_vars_offsets,
-                                  raft::device_span<i_t> block_vars_id_offsets)
-{
-  if (blockIdx.x < sub_warp_blocks_end) {
-    upd_bnd_sub_warp(view, warp_vars_offsets, warp_vars_id_offsets);
-  } else if (blockIdx.x < block_vars_offsets[1]) {
-    upd_bnd_block<64, BDIM>(view, blockIdx.x - block_vars_offsets[0], block_vars_offsets[0], block_vars_offsets[1]);
-  } else if (blockIdx.x < block_vars_offsets[2]) {
-    upd_bnd_block<256, BDIM>(view, blockIdx.x - block_vars_offsets[1], block_vars_offsets[1], block_vars_offsets[2]);
-  } else {
-    upd_bnd_heavy<512>(heavy_vars_beg_id, heavy_vars_vertex_ids, heavy_vars_pseudo_block_ids, heavy_degree_cutoff, view, tmp_bnd);
-  }
-}
-#endif
-
 }  // namespace cuopt::linear_programming::detail

From e6afa2e9dc8f92ba9a71b0728bbc5859f8c35099 Mon Sep 17 00:00:00 2001
From: Kumar Aatish <kaatish@nvidia.com>
Date: Wed, 30 Jul 2025 21:51:41 -0400
Subject: [PATCH 4/4] pr review fixes

---
 cpp/src/mip/presolve/load_balanced_bounds_presolve.cu         | 1 -
 .../mip/presolve/load_balanced_bounds_presolve_helpers.cuh    | 4 ----
 cpp/tests/mip/CMakeLists.txt                                  | 2 +-
 cpp/tests/mip/{lb_test.cu => load_balancing_test.cu}          | 2 +-
 4 files changed, 2 insertions(+), 7 deletions(-)
 rename cpp/tests/mip/{lb_test.cu => load_balancing_test.cu} (98%)

diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
index 5259cae014..4b65de9c21 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve.cu
@@ -265,7 +265,6 @@ void load_balanced_bounds_presolve_t<i_t, f_t>::setup(
   std::tie(is_vars_sub_warp_single_bin, vars_sub_warp_count) =
     sub_warp_meta(stream, warp_vars_offsets, warp_vars_id_offsets, pb->vars_bin_offsets, 4);
 
-  // stream.synchronize();
   RAFT_CHECK_CUDA(stream);
   streams.sync_test_all_issued();
 
diff --git a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
index 07a268d9aa..7eb2b41a9d 100644
--- a/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
+++ b/cpp/src/mip/presolve/load_balanced_bounds_presolve_helpers.cuh
@@ -840,7 +840,6 @@ template <typename i_t,
           i_t threads_per_variable,
           typename bounds_update_view_t>
 void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
-                                   // cudaGraphNode_t& set_bounds_changed_node,
                                    cudaGraphNode_t& bounds_changed_node,
                                    bounds_update_view_t view,
                                    i_t degree_beg,
@@ -874,7 +873,6 @@ void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
     cudaGraphAddKernelNode(&upd_bnd_sub_warp_node, upd_graph, NULL, 0, &kernelNodeParams);
     RAFT_CUDA_TRY(cudaGetLastError());
 
-    // cudaGraphAddDependencies(upd_graph, &set_bounds_changed_node, &upd_bnd_sub_warp_node, 1);
     cudaGraphAddDependencies(upd_graph, &upd_bnd_sub_warp_node, &bounds_changed_node, 1);
     RAFT_CUDA_TRY(cudaGetLastError());
   }
@@ -886,7 +884,6 @@ template <typename i_t,
           i_t threads_per_variable,
           typename bounds_update_view_t>
 void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
-                                   // cudaGraphNode_t& set_bounds_changed_node,
                                    cudaGraphNode_t& bounds_changed_node,
                                    bounds_update_view_t view,
                                    i_t degree,
@@ -898,7 +895,6 @@ void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
 
 template <typename i_t, typename f_t, typename f_t2, typename bounds_update_view_t>
 void create_update_bounds_sub_warp(cudaGraph_t upd_graph,
-                                   // cudaGraphNode_t& set_bounds_changed_node,
                                    cudaGraphNode_t& bounds_changed_node,
                                    bounds_update_view_t view,
                                    i_t vars_sub_warp_count,
diff --git a/cpp/tests/mip/CMakeLists.txt b/cpp/tests/mip/CMakeLists.txt
index 18a4270ca4..020c537f6a 100644
--- a/cpp/tests/mip/CMakeLists.txt
+++ b/cpp/tests/mip/CMakeLists.txt
@@ -28,7 +28,7 @@ ConfigureTest(STANDARDIZATION_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/bounds_standardization_test.cu
 )
 ConfigureTest(LB_TEST
-    ${CMAKE_CURRENT_SOURCE_DIR}/lb_test.cu
+    ${CMAKE_CURRENT_SOURCE_DIR}/load_balancing_test.cu
 )
 ConfigureTest(MULTI_PROBE_TEST
     ${CMAKE_CURRENT_SOURCE_DIR}/multi_probe_test.cu
diff --git a/cpp/tests/mip/lb_test.cu b/cpp/tests/mip/load_balancing_test.cu
similarity index 98%
rename from cpp/tests/mip/lb_test.cu
rename to cpp/tests/mip/load_balancing_test.cu
index 5b5a4f1f05..deed9ea85a 100644
--- a/cpp/tests/mip/lb_test.cu
+++ b/cpp/tests/mip/load_balancing_test.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
  * reserved. SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");