From cfdce3a74fe84664d9afbe2520ebe436b803c3a3 Mon Sep 17 00:00:00 2001 From: Hugo Linsenmaier Date: Mon, 6 Oct 2025 22:12:30 -0700 Subject: [PATCH] Explicitly create new stream --- cpp/CMakeLists.txt | 2 +- cpp/src/dual_simplex/barrier.cu | 4 ++-- cpp/src/dual_simplex/cusparse_view.cu | 4 ++-- cpp/src/dual_simplex/device_sparse_matrix.cuh | 2 +- cpp/src/linear_programming/solve.cu | 11 ++++++++++- cpp/src/utilities/copy_helpers.hpp | 18 +++++++++++++++++- 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e03d2581ff..fc2203f325 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -129,7 +129,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extend if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false") endif() -list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror) +list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror --default-stream=per-thread) list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend) list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all) if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0) diff --git a/cpp/src/dual_simplex/barrier.cu b/cpp/src/dual_simplex/barrier.cu index ac9e5ae9a9..62ccde07e4 100644 --- a/cpp/src/dual_simplex/barrier.cu +++ b/cpp/src/dual_simplex/barrier.cu @@ -868,8 +868,8 @@ class iteration_data_t { return chol->solve(d_b, d_x); } else { // TMP until this is ported to the GPU - dense_vector_t b = host_copy(d_b); - dense_vector_t x = host_copy(d_x); + dense_vector_t b = host_copy(d_b, stream_view_); + dense_vector_t x = host_copy(d_x, stream_view_); i_t out = solve_adat(b, x); diff --git a/cpp/src/dual_simplex/cusparse_view.cu 
b/cpp/src/dual_simplex/cusparse_view.cu index 653950c12e..8d22604734 100644 --- a/cpp/src/dual_simplex/cusparse_view.cu +++ b/cpp/src/dual_simplex/cusparse_view.cu @@ -263,7 +263,7 @@ void cusparse_view_t::spmv(f_t alpha, cusparseDnVecDescr_t x_cusparse = create_vector(d_x); cusparseDnVecDescr_t y_cusparse = create_vector(d_y); spmv(alpha, x_cusparse, beta, y_cusparse); - y = cuopt::host_copy(d_y); + y = cuopt::host_copy(d_y, handle_ptr_->get_stream()); } template @@ -306,7 +306,7 @@ void cusparse_view_t::transpose_spmv(f_t alpha, cusparseDnVecDescr_t x_cusparse = create_vector(d_x); cusparseDnVecDescr_t y_cusparse = create_vector(d_y); transpose_spmv(alpha, x_cusparse, beta, y_cusparse); - y = cuopt::host_copy(d_y); + y = cuopt::host_copy(d_y, handle_ptr_->get_stream()); } template diff --git a/cpp/src/dual_simplex/device_sparse_matrix.cuh b/cpp/src/dual_simplex/device_sparse_matrix.cuh index f347f956bb..00c198d3f9 100644 --- a/cpp/src/dual_simplex/device_sparse_matrix.cuh +++ b/cpp/src/dual_simplex/device_sparse_matrix.cuh @@ -184,7 +184,7 @@ class device_csc_matrix_t { // Inclusive cumulative sum to have the corresponding column for each entry rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes; + size_t temp_storage_bytes{0}; cub::DeviceScan::InclusiveSum( nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream); d_temp_storage.resize(temp_storage_bytes, stream); diff --git a/cpp/src/linear_programming/solve.cu b/cpp/src/linear_programming/solve.cu index 089b5cb9ac..d399f26479 100644 --- a/cpp/src/linear_programming/solve.cu +++ b/cpp/src/linear_programming/solve.cu @@ -670,8 +670,16 @@ optimization_problem_solution_t run_concurrent( // Initialize the dual simplex structures before we run PDLP. 
// Otherwise, CUDA API calls to the problem stream may occur in both threads and throw graph // capture off + auto barrier_handle = raft::handle_t(*op_problem.get_handle_ptr()); + detail::problem_t d_barrier_problem(problem); + rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread; + d_barrier_problem.handle_ptr = &barrier_handle; + raft::resource::set_cuda_stream(barrier_handle, barrier_stream); + // Make sure allocations are done on the original stream + problem.handle_ptr->sync_stream(); + dual_simplex::user_problem_t dual_simplex_problem = - cuopt_problem_to_simplex_problem(problem); + cuopt_problem_to_simplex_problem(d_barrier_problem); // Create a thread for dual simplex std::unique_ptr< std::tuple, dual_simplex::lp_status_t, f_t, f_t, f_t>> @@ -700,6 +708,7 @@ optimization_problem_solution_t run_concurrent( dual_simplex_thread.join(); // Wait for barrier thread to finish + barrier_handle.sync_stream(); barrier_thread.join(); // copy the dual simplex solution to the device diff --git a/cpp/src/utilities/copy_helpers.hpp b/cpp/src/utilities/copy_helpers.hpp index 3dcb33edca..b7112cdda8 100644 --- a/cpp/src/utilities/copy_helpers.hpp +++ b/cpp/src/utilities/copy_helpers.hpp @@ -179,7 +179,6 @@ auto host_copy(rmm::device_uvector const& device_vec) * * @tparam T * @param device_vec - * @param stream_view * @return auto */ template @@ -191,6 +190,23 @@ auto host_copy(rmm::device_uvector const& device_vec) return host_vec; } +/** + * @brief Simple utility function to copy device_uvector to host + * + * @tparam T + * @param device_vec + * @param stream_view + * @return auto + */ +template +auto host_copy(rmm::device_uvector const& device_vec, rmm::cuda_stream_view stream_view) +{ + std::vector host_vec(device_vec.size()); + raft::copy(host_vec.data(), device_vec.data(), device_vec.size(), stream_view); + stream_view.synchronize(); + return host_vec; +} + /** * @brief Simple utility function to copy device span to host *