From e448efc90c354accd7678e28c5e2533b43b7afa0 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Jun 2025 12:22:31 +0000 Subject: [PATCH] fix batch graph capture issue caused by pinned memory allocator --- .../adaptive_step_size_strategy.cu | 9 ++-- .../adaptive_step_size_strategy.hpp | 13 ++--- .../utilities/cython_solve.cu | 4 +- cpp/src/utilities/unique_pinned_ptr.hpp | 47 +++++++++++++++++++ 4 files changed, 57 insertions(+), 16 deletions(-) create mode 100644 cpp/src/utilities/unique_pinned_ptr.hpp diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu index 3abfa669e5..e8355dfa1c 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( stream_view_(handle_ptr_->get_stream()), primal_weight_(primal_weight), step_size_(step_size), - valid_step_size_(1), interaction_{stream_view_}, movement_{stream_view_}, norm_squared_delta_primal_{stream_view_}, @@ -57,6 +57,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( reusable_device_scalar_value_0_{f_t(0.0), stream_view_}, graph(stream_view_) { + valid_step_size_ = make_unique_cuda_host_pinned(); } void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view) @@ -189,13 +190,13 @@ __global__ void compute_step_sizes_from_movement_and_interaction( template i_t adaptive_step_size_strategy_t::get_valid_step_size() const { - return valid_step_size_[0]; + return *valid_step_size_; } template void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) { - valid_step_size_[0] = valid; + *valid_step_size_ = valid; } template @@ -374,7 +375,7 @@ adaptive_step_size_strategy_t::view() v.primal_weight = 
primal_weight_->data(); v.step_size = step_size_->data(); - v.valid_step_size = thrust::raw_pointer_cast(valid_step_size_.data()); + v.valid_step_size = valid_step_size_.get(); v.interaction = interaction_.data(); v.movement = movement_.data(); diff --git a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp index d848429dc4..225aa2de07 100644 --- a/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/linear_programming/step_size_strategy/adaptive_step_size_strategy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,10 +29,6 @@ #include #include -#include -#include -#include - namespace cuopt::linear_programming::detail { void set_adaptive_step_size_hyper_parameters(rmm::cuda_stream_view stream_view); template @@ -99,11 +96,9 @@ class adaptive_step_size_strategy_t { // Host pinned memory scalar written in kernel // Combines both numerical_issue and valid_step size and save the device/host memcpy // -1: Error ; 0: Invalid step size ; 1: Valid step size - thrust::host_vector> - valid_step_size_; + // TODO: Replace with thrust::universal_host_pinned_vector once the bug is fixed: + // https://github.com/NVIDIA/cccl/issues/5027 + std::unique_ptr> valid_step_size_; rmm::device_scalar interaction_; rmm::device_scalar movement_; diff --git a/cpp/src/linear_programming/utilities/cython_solve.cu b/cpp/src/linear_programming/utilities/cython_solve.cu index 2b784beebe..8f3624850c 100644 --- a/cpp/src/linear_programming/utilities/cython_solve.cu +++ b/cpp/src/linear_programming/utilities/cython_solve.cu @@ -284,11 +284,9 @@ std::pair>, double> call_batch_solve( solver_settings->set_parameter(CUOPT_METHOD, CUOPT_METHOD_PDLP); } - // Use a default stream instead of a non-blocking to avoid invalid operations while some CUDA - // Graph might be capturing in another stream #pragma omp parallel for 
num_threads(max_thread)
   for (std::size_t i = 0; i < size; ++i)
-    list[i] = std::move(call_solve(data_models[i], solver_settings, cudaStreamDefault));
+    list[i] = std::move(call_solve(data_models[i], solver_settings, cudaStreamNonBlocking));
 
   auto end      = std::chrono::high_resolution_clock::now();
   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_solver);
diff --git a/cpp/src/utilities/unique_pinned_ptr.hpp b/cpp/src/utilities/unique_pinned_ptr.hpp
new file mode 100644
index 0000000000..b27688eff0
--- /dev/null
+++ b/cpp/src/utilities/unique_pinned_ptr.hpp
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <new>
+#include <type_traits>
+
+#include <raft/util/cudart_utils.hpp>
+
+// This is a temporary solution to replace thrust::host_pinned_vector while this bug is not fixed:
+// https://github.com/NVIDIA/cccl/issues/5027
+
+namespace cuopt {
+
+// Deleter releasing CUDA pinned host memory via cudaFreeHost.
+// Must not throw: it is invoked from ~unique_ptr, possibly during stack unwinding,
+// where a thrown exception would call std::terminate. Hence the NO_THROW macro.
+template <typename T>
+struct cuda_host_deleter {
+  void operator()(T* ptr) const noexcept
+  {
+    if (ptr != nullptr) RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(ptr));
+  }
+};
+
+// Allocates a single zero-initialized T in CUDA pinned (page-locked) host memory and
+// returns a unique_ptr owning it. Throws on allocation failure (RAFT_CUDA_TRY).
+template <typename T>
+std::unique_ptr<T, cuda_host_deleter<T>> make_unique_cuda_host_pinned()
+{
+  // No constructor/destructor is ever run on *ptr, so restrict to trivial types.
+  static_assert(std::is_trivially_default_constructible_v<T> && std::is_trivially_destructible_v<T>,
+                "make_unique_cuda_host_pinned requires a trivial T");
+  T* ptr = nullptr;
+  RAFT_CUDA_TRY(cudaMallocHost(reinterpret_cast<void**>(&ptr), sizeof(T)));
+  new (ptr) T{};  // cudaMallocHost returns uninitialized memory; value-init for determinism
+  return std::unique_ptr<T, cuda_host_deleter<T>>(ptr);
+}
+
+}  // namespace cuopt