From e448efc90c354accd7678e28c5e2533b43b7afa0 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Jun 2025 12:22:31 +0000 Subject: [PATCH] fix batch graph capture issue caused by pinned memory allocator --- .../adaptive_step_size_strategy.cu | 9 ++-- .../adaptive_step_size_strategy.hpp | 13 ++--- .../utilities/cython_solve.cu | 4 +- cpp/src/utilities/unique_pinned_ptr.hpp | 47 +++++++++++++++++++ 4 files changed, 57 insertions(+), 16 deletions(-) create mode 100644 cpp/src/utilities/unique_pinned_ptr.hpp diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 3abfa669e5..e8355dfa1c 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( stream_view_(handle_ptr_->get_stream()), primal_weight_(primal_weight), step_size_(step_size), - valid_step_size_(1), interaction_{stream_view_}, movement_{stream_view_}, norm_squared_delta_primal_{stream_view_}, @@ -57,6 +57,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, graph(stream_view_) { + valid_step_size_ = make_unique_cuda_host_pinned(); } void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view) @@ -189,13 +190,13 @@ __global__ void compute_step_sizes_from_movement_and_interaction( template i_t adaptive_step_size_strategy_t::get_valid_step_size() const { - return valid_step_size_[0]; + return *valid_step_size_; } template void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) { - valid_step_size_[0] = valid; + *valid_step_size_ = valid; } template @@ -374,7 +375,7 @@ adaptive_step_size_strategy_t::view() v.primal_weight = 
primal_weight_->data(); v.step_size = step_size_->data(); - v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); + v.valid_step_size = valid_step_size_.get(); v.interaction = interaction_.data(); v.movement = movement_.data(); diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index d848429dc4..225aa2de07 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,10 +29,6 @@ #include #include -#include -#include -#include - namespace cuopt::linear_programming::detail { void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view); template @@ -99,11 +96,9 @@ class adaptive_step_size_strategy_t { // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step size and save the device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size - thrust::host_vector> - valid_step_size_; + // TODO: Replace with thrust::universal_host_pinned_vector once the bug is fixed: + // https://github.com/NVIDIA/cccl/issues/5027 + std::unique_ptr> valid_step_size_; rmm::device_scalar interaction_; rmm::device_scalar movement_; diff --git a/cpp/src/linear_programming/utilities/cython_solve.cu b/cpp/src/linear_programming/utilities/cython_solve.cu index 2b784beebe..8f3624850c 100644 --- a/cpp/src/linear_programming/utilities/cython_solve.cu +++ b/cpp/src/linear_programming/utilities/cython_solve.cu @@ -284,11 +284,9 @@ std::pair>, double> call_batch_solve( solver_settings->set_parameter(CUOPT_METHOD, CUOPT_METHOD_PDLP); } - // Use a default stream instead of a non-blocking to avoid invalid operations while some CUDA - // Graph might be capturing in another stream #pragma omp parallel for 
num_threads(max_thread)
   for (std::size_t i = 0; i < size; ++i)
-    list[i] = std::move(call_solve(data_models[i], solver_settings, cudaStreamDefault));
+    list[i] = std::move(call_solve(data_models[i], solver_settings, cudaStreamNonBlocking));
 
   auto end      = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_solver);
diff --git a/cpp/src/utilities/unique_pinned_ptr.hpp b/cpp/src/utilities/unique_pinned_ptr.hpp
new file mode 100644
index 0000000000..b27688eff0
--- /dev/null
+++ b/cpp/src/utilities/unique_pinned_ptr.hpp
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include <raft/util/cudart_utils.hpp>
+
+// This is a temporary solution to replace thrust::host_pinned_vector while this bug is not fixed:
+// https://github.com/NVIDIA/cccl/issues/5027
+
+namespace cuopt {
+
+// Deleter releasing CUDA pinned host memory via cudaFreeHost.
+// Must not throw: it is invoked from ~unique_ptr, possibly during stack unwinding,
+// where a thrown exception would call std::terminate. Hence the NO_THROW macro.
+template <typename T>
+struct cuda_host_deleter {
+  void operator()(T* ptr) const noexcept
+  {
+    if (ptr != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(ptr));
+  }
+};
+
+// Allocates a single zero-initialized T in CUDA pinned (page-locked) host memory and
+// returns a unique_ptr owning it. Throws on allocation failure (RAFT_CUDA_TRY).
+template <typename T>
+std::unique_ptr<T, cuda_host_deleter<T>> make_unique_cuda_host_pinned()
+{
+  // No constructor/destructor is ever run on *ptr, so restrict to trivial types.
+  static_assert(std::is_trivially_default_constructible_v<T> && std::is_trivially_destructible_v<T>,
+                "make_unique_cuda_host_pinned requires a trivial T");
+  T* ptr = nullptr;
+  RAFT_CUDA_TRY(cudaMallocHost(reinterpret_cast<void**>(&ptr), sizeof(T)));
+  new (ptr) T{};  // cudaMallocHost returns uninitialized memory; value-init for determinism
+  return std::unique_ptr<T, cuda_host_deleter<T>>(ptr);
+}
+
+}  // namespace cuopt