Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --expt-extend
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -static-global-template-stub=false")
endif()
list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror)
list(APPEND CUOPT_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xcompiler=-Werror --default-stream=per-thread)
list(APPEND CUOPT_CUDA_FLAGS -Xcompiler=-Wall -Wno-error=non-template-friend)
list(APPEND CUOPT_CUDA_FLAGS -Xfatbin=-compress-all)
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 13.0)
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dual_simplex/barrier.cu
Original file line number Diff line number Diff line change
Expand Up @@ -868,8 +868,8 @@ class iteration_data_t {
return chol->solve(d_b, d_x);
} else {
// TMP until this is ported to the GPU
dense_vector_t<i_t, f_t> b = host_copy(d_b);
dense_vector_t<i_t, f_t> x = host_copy(d_x);
dense_vector_t<i_t, f_t> b = host_copy(d_b, stream_view_);
dense_vector_t<i_t, f_t> x = host_copy(d_x, stream_view_);

i_t out = solve_adat(b, x);

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dual_simplex/cusparse_view.cu
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ void cusparse_view_t<i_t, f_t>::spmv(f_t alpha,
cusparseDnVecDescr_t x_cusparse = create_vector(d_x);
cusparseDnVecDescr_t y_cusparse = create_vector(d_y);
spmv(alpha, x_cusparse, beta, y_cusparse);
y = cuopt::host_copy<f_t, AllocatorB>(d_y);
y = cuopt::host_copy<f_t, AllocatorB>(d_y, handle_ptr_->get_stream());
}

template <typename i_t, typename f_t>
Expand Down Expand Up @@ -306,7 +306,7 @@ void cusparse_view_t<i_t, f_t>::transpose_spmv(f_t alpha,
cusparseDnVecDescr_t x_cusparse = create_vector(d_x);
cusparseDnVecDescr_t y_cusparse = create_vector(d_y);
transpose_spmv(alpha, x_cusparse, beta, y_cusparse);
y = cuopt::host_copy<f_t, AllocatorB>(d_y);
y = cuopt::host_copy<f_t, AllocatorB>(d_y, handle_ptr_->get_stream());
}

template <typename i_t, typename f_t>
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dual_simplex/device_sparse_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ class device_csc_matrix_t {

// Inclusive cumulative sum to have the corresponding column for each entry
rmm::device_buffer d_temp_storage;
size_t temp_storage_bytes;
size_t temp_storage_bytes{0};
cub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes, col_index.data(), col_index.data(), col_index.size(), stream);
d_temp_storage.resize(temp_storage_bytes, stream);
Expand Down
11 changes: 10 additions & 1 deletion cpp/src/linear_programming/solve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -670,8 +670,16 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
// Initialize the dual simplex structures before we run PDLP.
// Otherwise, CUDA API calls to the problem stream may occur in both threads and throw graph
// capture off
auto barrier_handle = raft::handle_t(*op_problem.get_handle_ptr());
detail::problem_t<i_t, f_t> d_barrier_problem(problem);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
detail::problem_t<i_t, f_t> d_barrier_problem(problem);
auto barrier_stream = rmm::cuda_stream_per_thread;
raft::resource::set_cuda_stream(barrier_handle, barrier_stream);
detail::problem_t<i_t, f_t> d_barrier_problem(problem, barrier_handle);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered that overload but I don't think it is needed. According to the rmm docs and the current problem_t implementation, passing a new stream will create a deep copy on that stream.

rmm::cuda_stream_view barrier_stream = rmm::cuda_stream_per_thread;
d_barrier_problem.handle_ptr = &barrier_handle;
raft::resource::set_cuda_stream(barrier_handle, barrier_stream);
// Make sure allocations are done on the original stream
problem.handle_ptr->sync_stream();

dual_simplex::user_problem_t<i_t, f_t> dual_simplex_problem =
cuopt_problem_to_simplex_problem<i_t, f_t>(problem);
cuopt_problem_to_simplex_problem<i_t, f_t>(d_barrier_problem);
// Create a thread for dual simplex
std::unique_ptr<
std::tuple<dual_simplex::lp_solution_t<i_t, f_t>, dual_simplex::lp_status_t, f_t, f_t, f_t>>
Expand Down Expand Up @@ -700,6 +708,7 @@ optimization_problem_solution_t<i_t, f_t> run_concurrent(
dual_simplex_thread.join();

// Wait for barrier thread to finish
barrier_handle.sync_stream();
barrier_thread.join();

// copy the dual simplex solution to the device
Expand Down
18 changes: 17 additions & 1 deletion cpp/src/utilities/copy_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ auto host_copy(rmm::device_uvector<T> const& device_vec)
*
* @tparam T
* @param device_vec
* @param stream_view
* @return auto
*/
template <typename T, typename Allocator>
Expand All @@ -191,6 +190,23 @@ auto host_copy(rmm::device_uvector<T> const& device_vec)
return host_vec;
}

/**
 * @brief Copy the contents of a device_uvector into a newly allocated host vector
 *
 * Blocks until the copy has completed on the given stream, so the returned
 * vector is safe to read immediately.
 *
 * @tparam T element type
 * @tparam Allocator host-side allocator for the returned std::vector
 * @param device_vec device vector to copy from
 * @param stream_view CUDA stream on which the copy is enqueued and synchronized
 * @return std::vector<T, Allocator> holding a host copy of device_vec
 */
template <typename T, typename Allocator>
auto host_copy(rmm::device_uvector<T> const& device_vec, rmm::cuda_stream_view stream_view)
{
  std::vector<T, Allocator> result(device_vec.size());
  raft::copy(result.data(), device_vec.data(), device_vec.size(), stream_view);
  // Ensure the async copy has landed before handing the buffer to the caller.
  stream_view.synchronize();
  return result;
}

/**
* @brief Simple utility function to copy device span to host
*
Expand Down