Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extend
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false")
endif()
list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror)
list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror --default-stream=per-thread)
list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend)
list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dual_simplex/barrier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -868,8 +868,8 @@ class iteration_data_t {
return chol->solve(d_b, d_x);
} else {
// TMP until this is ported to the GPU
dense_vector_t<i_t, f_t> b = host_copy(d_b);
dense_vector_t<i_t, f_t> x = host_copy(d_x);
dense_vector_t<i_t, f_t> b = host_copy(d_b, stream_view_);
dense_vector_t<i_t, f_t> x = host_copy(d_x, stream_view_);

i_t out = solve_adat(b, x);

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dual_simplex/cusparse_view.cu
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ void cusparse_view_t<i_t, f_t>::spmv(f_t alpha,
cusparseDnVecDescr_t x_cusparse = create_vector(d_x);
cusparseDnVecDescr_t y_cusparse = create_vector(d_y);
spmv(alpha, x_cusparse, beta, y_cusparse);
y = cuopt::host_copy<f_t, AllocatorB>(d_y);
y = cuopt::host_copy<f_t, AllocatorB>(d_y, handle_ptr_->get_stream());
}

template <typename i_t, typename f_t>
Expand Down Expand Up @@ -306,7 +306,7 @@ void cusparse_view_t<i_t, f_t>::transpose_spmv(f_t alpha,
cusparseDnVecDescr_t x_cusparse = create_vector(d_x);
cusparseDnVecDescr_t y_cusparse = create_vector(d_y);
transpose_spmv(alpha, x_cusparse, beta, y_cusparse);
y = cuopt::host_copy<f_t, AllocatorB>(d_y);
y = cuopt::host_copy<f_t, AllocatorB>(d_y, handle_ptr_->get_stream());
}

template <typename i_t, typename f_t>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dual_simplex/device_sparse_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ class device_csc_matrix_t {

// Inclusive cumulative sum to have the corresponding column for each entry
rmm::device_buffer d_temp_storage;
size_t temp_storage_bytes;
size_t temp_storage_bytes{0};
cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream);
d_temp_storage.resize(temp_storage_bytes, stream);
Expand Down
11 changes: 10 additions & 1 deletion cpp/src/linear_programming/solve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -670,8 +670,16 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
// Initialize the dual simplex structures before we run PDLP.
// Otherwise, CUDA API calls to the problem stream may occur in both threads and throw graph
// capture off
auto barrier_handle = raft::handle_t(*op_problem.get_handle_ptr());
detail::problem_t<i_t, f_t> d_barrier_problem(problem);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
detail::problem_t<i_t, f_t> d_barrier_problem(problem);
auto barrier_stream = rmm::cuda_stream_per_thread;
raft::resource::set_cuda_stream(barrier_handle, barrier_stream);
detail::problem_t<i_t, f_t> d_barrier_problem(problem, barrier_handle);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered that overload but I don't think it is needed. According to the rmm docs and the current problem_t implementation, passing a new stream will create a deep copy on that stream.

rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread;
d_barrier_problem.handle_ptr = &barrier_handle;
raft::resource::set_cuda_stream(barrier_handle, barrier_stream);
// Make sure allocations are done on the original stream
problem.handle_ptr->sync_stream();

dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
cuopt_problem_to_simplex_problem<i_t, f_t>(problem);
cuopt_problem_to_simplex_problem<i_t, f_t>(d_barrier_problem);
// Create a thread for dual simplex
std::unique_ptr<
std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>
Expand Down Expand Up @@ -700,6 +708,7 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
dual_simplex_thread.join();

// Wait for barrier thread to finish
barrier_handle.sync_stream();
barrier_thread.join();

// copy the dual simplex solution to the device
Expand Down
18 changes: 17 additions & 1 deletion cpp/src/utilities/copy_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ auto host_copy(rmm::device_uvector<T> const& device_vec)
*
* @tparam T
* @param device_vec
* @param stream_view
* @return auto
*/
template <typename T, typename Allocator>
Expand All @@ -191,6 +190,23 @@ auto host_copy(rmm::device_uvector<T> const& device_vec)
return host_vec;
}

/**
 * @brief Copy the contents of a device_uvector into a newly allocated host vector
 *
 * Blocks until the copy has completed on the given stream, so the returned
 * vector is safe to read immediately.
 *
 * @tparam T element type
 * @tparam Allocator host-side allocator for the returned std::vector
 * @param device_vec device vector to copy from
 * @param stream_view CUDA stream on which the copy is enqueued and synchronized
 * @return std::vector<T, Allocator> holding a host copy of device_vec
 */
template <typename T, typename Allocator>
auto host_copy(rmm::device_uvector<T> const& device_vec, rmm::cuda_stream_view stream_view)
{
  std::vector<T, Allocator> result(device_vec.size());
  raft::copy(result.data(), device_vec.data(), device_vec.size(), stream_view);
  // Ensure the async copy has landed before handing the buffer to the caller.
  stream_view.synchronize();
  return result;
}

/**
* @brief Simple utility function to copy device span to host
*
Expand Down