diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index b407f40b3e..dc1d93379f 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -14,17 +14,20 @@ #include #include #include -#include +#include #include -#include #include -#include #include +#include +#include #include "../core/exceptions.hpp" +#include + #include #include +#include #include extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) @@ -132,8 +135,8 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource(); - *ptr = mr->allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); + auto mr = rmm::mr::get_current_device_resource_ref(); + *ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); }); } @@ -141,51 +144,38 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource(); - mr->deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); + auto mr = rmm::mr::get_current_device_resource_ref(); + mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); }); } -thread_local std::shared_ptr< - rmm::mr::owning_wrapper, - rmm::mr::device_memory_resource>> - pool_mr; +thread_local cuda::mr::any_resource pool_upstream; +thread_local std::optional pool_mr; extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed) { return cuvs::core::translate_exceptions([=] { - // Upstream memory resource needs to be a cuda_memory_resource - auto cuda_mr = rmm::mr::get_current_device_resource(); - auto* cuda_mr_casted = dynamic_cast(cuda_mr); - if (cuda_mr_casted == nullptr) { - throw std::runtime_error("Current memory resource is not a cuda_memory_resource"); - } - auto initial_size = rmm::percent_of_free_device_memory(initial_pool_size_percent); auto max_size = rmm::percent_of_free_device_memory(max_pool_size_percent); - auto mr = std::shared_ptr(); if (managed) { - mr = std::static_pointer_cast( - std::make_shared()); + pool_upstream = rmm::mr::managed_memory_resource{}; } else { - mr = std::static_pointer_cast( - std::make_shared()); + pool_upstream = rmm::mr::cuda_memory_resource{}; } - pool_mr = - rmm::mr::make_owning_wrapper(mr, initial_size, max_size); + pool_mr.emplace(pool_upstream, initial_size, max_size); - rmm::mr::set_current_device_resource(pool_mr.get()); + rmm::mr::set_current_device_resource(*pool_mr); }); } extern "C" cuvsError_t cuvsRMMMemoryResourceReset() { return cuvs::core::translate_exceptions([=] { - rmm::mr::set_current_device_resource(rmm::mr::detail::initial_resource()); + rmm::mr::reset_current_device_resource(); pool_mr.reset(); }); } diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b4195323ba..5eb3f8d808 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,8 +1,9 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh source rapids-configure-sccache source rapids-date-string diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 65c8f29f8a..2489393edc 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-github cpp) @@ -18,6 +19,7 @@ RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" export RAPIDS_VERSION_MAJOR_MINOR rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key docs \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ diff --git a/ci/build_go.sh b/ci/build_go.sh index 80370048ff..127190491b 100755 --- a/ci/build_go.sh +++ b/ci/build_go.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-github cpp) @@ -14,6 +15,7 @@ rapids-logger "Configuring conda strict channel priority" conda config --set channel_priority strict rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key go \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ diff --git a/ci/build_java.sh b/ci/build_java.sh index daf814884d..c958657155 100755 --- a/ci/build_java.sh +++ b/ci/build_java.sh @@ -1,8 +1,9 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh source rapids-configure-sccache @@ -33,6 +34,7 @@ rapids-logger "Generate Java testing dependencies" ENV_YAML_DIR="$(mktemp -d)" rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key java \ --prepend-channel "${CPP_CHANNEL}" \ diff --git a/ci/build_python.sh b/ci/build_python.sh index c153bdd948..cce29bd41c 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh source rapids-configure-sccache source rapids-date-string diff --git a/ci/build_rust.sh b/ci/build_rust.sh index edce9447f2..166eac86cc 100755 --- a/ci/build_rust.sh +++ b/ci/build_rust.sh @@ -1,8 +1,9 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-github cpp) @@ -14,6 +15,7 @@ rapids-logger "Configuring conda strict channel priority" conda config --set channel_priority strict rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key rust \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ diff --git a/ci/build_standalone_c.sh b/ci/build_standalone_c.sh index 33123cd79c..1b09b35907 100755 --- a/ci/build_standalone_c.sh +++ b/ci/build_standalone_c.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh TOOLSET_VERSION=14 NINJA_VERSION=v1.13.1 diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh index 75be8a6dff..2cdcc5460a 100755 --- a/ci/build_wheel_cuvs.sh +++ b/ci/build_wheel_cuvs.sh @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail - source rapids-init-pip +source ./ci/use_wheels_from_prs.sh + package_dir="python/cuvs" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" diff --git a/ci/build_wheel_libcuvs.sh b/ci/build_wheel_libcuvs.sh index 6eb6bfa5ab..bd2694c31b 100755 --- a/ci/build_wheel_libcuvs.sh +++ b/ci/build_wheel_libcuvs.sh @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail - source rapids-init-pip +source ./ci/use_wheels_from_prs.sh + package_name="libcuvs" package_dir="python/libcuvs" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 5b7a393f31..3444658256 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh . /opt/conda/etc/profile.d/conda.sh @@ -13,6 +14,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-github cpp) rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key test_cpp \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" \ diff --git a/ci/test_python.sh b/ci/test_python.sh index ebb6ab2762..e12bcb03d2 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail +source ./ci/use_conda_packages_from_prs.sh . /opt/conda/etc/profile.d/conda.sh @@ -15,6 +16,7 @@ PYTHON_CHANNEL=$(rapids-download-from-github "$(rapids-package-name "conda_pytho rapids-logger "Generate Python testing dependencies" rapids-dependency-file-generator \ + "${RAPIDS_EXTRA_CONDA_CHANNEL_ARGS[@]}" \ --output conda \ --file-key test_python \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ diff --git a/ci/test_wheel_cuvs.sh b/ci/test_wheel_cuvs.sh index c0758da20b..dd85d0a886 100755 --- a/ci/test_wheel_cuvs.sh +++ b/ci/test_wheel_cuvs.sh @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail - source rapids-init-pip +source ./ci/use_wheels_from_prs.sh + # Delete system libnccl.so to ensure the wheel is used rm -rf /usr/lib64/libnccl* diff --git a/ci/use_conda_packages_from_prs.sh b/ci/use_conda_packages_from_prs.sh new file mode 100755 index 0000000000..97892ea0dd --- /dev/null +++ b/ci/use_conda_packages_from_prs.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +# download CI artifacts +LIBRMM_CHANNEL=$(rapids-get-pr-artifact rmm 2361 cpp conda) +RMM_CHANNEL=$(rapids-get-pr-artifact rmm 2361 python conda --stable) +LIBRAFT_CHANNEL=$(rapids-get-pr-artifact raft 2996 cpp conda) +RAFT_CHANNEL=$(rapids-get-pr-artifact raft 2996 python conda --stable) + +RAPIDS_PREPENDED_CONDA_CHANNELS=( + "${LIBRMM_CHANNEL}" + "${RMM_CHANNEL}" + "${LIBRAFT_CHANNEL}" + "${RAFT_CHANNEL}" +) +export RAPIDS_PREPENDED_CONDA_CHANNELS + +RAPIDS_EXTRA_CONDA_CHANNEL_ARGS=() +for _channel in "${RAPIDS_PREPENDED_CONDA_CHANNELS[@]}" +do + conda config --system --add channels "${_channel}" + RAPIDS_EXTRA_CONDA_CHANNEL_ARGS+=(--prepend-channel "${_channel}") +done diff --git a/ci/use_wheels_from_prs.sh b/ci/use_wheels_from_prs.sh new file mode 100755 index 0000000000..a49b027e4b --- /dev/null +++ b/ci/use_wheels_from_prs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +RAPIDS_PY_CUDA_SUFFIX=$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}") + +LIBRMM_WHEELHOUSE=$( + RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-get-pr-artifact rmm 2361 cpp wheel +) +RMM_WHEELHOUSE=$( + rapids-get-pr-artifact rmm 2361 python wheel --stable +) +LIBRAFT_WHEELHOUSE=$( + RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-get-pr-artifact raft 2996 cpp wheel +) +RAFT_WHEELHOUSE=$( + rapids-get-pr-artifact raft 2996 python wheel --stable --pkg_name pylibraft +) + +cat >> "${PIP_CONSTRAINT}" < #include -#include -#include +#include +#include + +#include #include @@ -17,37 +19,25 @@ namespace raft::mr { /** - * @brief `device_memory_resource` derived class that uses mmap to allocate memory. - * This class enables memory allocation using huge pages. + * @brief Memory resource that uses mmap to allocate memory with huge pages. * It is assumed that the allocated memory is directly accessible on device. This currently only * works on GH systems. * * TODO(tfeher): consider improving or removing this helper once we made progress with * https://github.com/rapidsai/raft/issues/1819 */ -class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { +class cuda_huge_page_resource { public: cuda_huge_page_resource() = default; - ~cuda_huge_page_resource() override = default; + ~cuda_huge_page_resource() = default; cuda_huge_page_resource(cuda_huge_page_resource const&) = default; cuda_huge_page_resource(cuda_huge_page_resource&&) = default; auto operator=(cuda_huge_page_resource const&) -> cuda_huge_page_resource& = default; auto operator=(cuda_huge_page_resource&&) -> cuda_huge_page_resource& = default; - private: - /** - * @brief Allocates memory of size at least `bytes` using cudaMalloc. - * - * The returned pointer has at least 256B alignment. - * - * @note Stream argument is ignored - * - * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled - * - * @param bytes The size, in bytes, of the allocation - * @return void* Pointer to the newly allocated memory - */ - auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override + void* allocate(cuda::stream_ref, + std::size_t bytes, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) { void* addr{nullptr}; addr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -60,36 +50,29 @@ class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { return addr; } - /** - * @brief Deallocate memory pointed to by \p p. - * - * @note Stream argument is ignored. - * - * @throws Nothing. - * - * @param p Pointer to be deallocated - */ - void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) noexcept override + void deallocate(cuda::stream_ref, + void* ptr, + std::size_t size, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept { if (munmap(ptr, size) == -1) { RAFT_LOG_ERROR("huge_page_resource::munmap failed"); } } - /** - * @brief Compare this resource to another. - * - * Two cuda_huge_page_resources always compare equal, because they can each - * deallocate memory allocated by the other. - * - * @throws Nothing. - * - * @param other The other resource to compare to - * @return true If the two resources are equivalent - * @return false If the two resources are not equal - */ - [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept - -> bool override + void* allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) + { + return allocate(cuda::stream_ref{cudaStream_t{nullptr}}, bytes, alignment); + } + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept { - return dynamic_cast(&other) != nullptr; + deallocate(cuda::stream_ref{cudaStream_t{nullptr}}, ptr, bytes, alignment); } + + bool operator==(cuda_huge_page_resource const&) const noexcept { return true; } + + friend void get_property(cuda_huge_page_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::resource_with); } // namespace raft::mr diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h index 83cb7303c8..537f0f0aa7 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -15,14 +15,16 @@ #include #include #include +#include #include #include #include -#include #include #include +#include #include +#include #include #include @@ -65,17 +67,16 @@ inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool */ class shared_raft_resources { public: - using pool_mr_type = rmm::mr::pool_memory_resource; - using mr_type = rmm::mr::failure_callback_resource_adaptor; + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor<>; using large_mr_type = rmm::mr::managed_memory_resource; shared_raft_resources() try - : orig_resource_{rmm::mr::get_current_device_resource()}, - pool_resource_(orig_resource_, 1024 * 1024 * 1024ull), - resource_(&pool_resource_, rmm_oom_callback, nullptr), + : pool_resource_(rmm::mr::get_current_device_resource_ref(), 1024 * 1024 * 1024ull), + resource_(pool_resource_, rmm_oom_callback, nullptr), large_mr_() { - rmm::mr::set_current_device_resource(&resource_); + orig_resource_ = rmm::mr::set_current_device_resource(resource_); } catch (const std::exception& e) { auto cuda_status = cudaGetLastError(); size_t free = 0; @@ -97,13 +98,10 @@ class shared_raft_resources { ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); } - auto get_large_memory_resource() noexcept - { - return static_cast(&large_mr_); - } + auto get_large_memory_resource() noexcept -> rmm::device_async_resource_ref { return large_mr_; } private: - rmm::mr::device_memory_resource* orig_resource_; + cuda::mr::any_resource orig_resource_; pool_mr_type pool_resource_; mr_type resource_; large_mr_type large_mr_; @@ -129,12 +127,8 @@ class configured_raft_resources { res_{std::make_unique( rmm::cuda_stream_view(get_stream_from_global_pool()))} { - // set the large workspace resource to the raft handle, but without the deleter - // (this resource is managed by the shared_res). raft::resource::set_large_workspace_resource( - *res_, - std::shared_ptr(shared_res_->get_large_memory_resource(), - raft::void_op{})); + *res_, raft::mr::device_resource{shared_res_->get_large_memory_resource()}); } /** Default constructor creates all resources anew. */ diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index 34fea2f82a..98dd94c2e1 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -166,9 +166,9 @@ class cuvs_cagra : public algo, public algo_gpu { inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { - case (AllocatorType::kHostPinned): return &mr_pinned_; - case (AllocatorType::kHostHugePage): return &mr_huge_page_; - default: return rmm::mr::get_current_device_resource(); + case (AllocatorType::kHostPinned): return mr_pinned_; + case (AllocatorType::kHostHugePage): return mr_huge_page_; + default: return rmm::mr::get_current_device_resource_ref(); } } }; diff --git a/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh b/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh index 6c7577065b..7bc37193a0 100644 --- a/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include namespace cuvs::neighbors { @@ -87,8 +88,7 @@ void naive_knn(raft::resources const& handle, uint32_t k, cuvs::distance::DistanceType type) { - rmm::mr::device_memory_resource* mr = nullptr; - auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1); @@ -116,8 +116,7 @@ void naive_knn(raft::resources const& handle, static_cast(k), dist_topk + offset * k, indices_topk + offset * k, - type != cuvs::distance::DistanceType::InnerProduct, - mr); + type != cuvs::distance::DistanceType::InnerProduct); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh index f5dc759725..aab18680ac 100644 --- a/cpp/src/cluster/detail/kmeans_balanced.cuh +++ b/cpp/src/cluster/detail/kmeans_balanced.cuh @@ -364,7 +364,7 @@ void compute_norm(const raft::resources& handle, raft::common::nvtx::range fun_scope("compute_norm"); auto stream = raft::resource::get_cuda_stream(handle); rmm::device_uvector mapped_dataset( - 0, stream, mr.value_or(raft::resource::get_workspace_resource(handle))); + 0, stream, mr.value_or(raft::resource::get_workspace_resource_ref(handle))); const MathT* dataset_ptr = nullptr; @@ -426,7 +426,7 @@ void predict(const raft::resources& handle, auto stream = raft::resource::get_cuda_stream(handle); raft::common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); - auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle)); + auto mem_res = mr.value_or(raft::resource::get_workspace_resource_ref(handle)); auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); rmm::device_uvector cur_dataset( @@ -1038,7 +1038,7 @@ void build_hierarchical(const raft::resources& handle, // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf. rmm::mr::managed_memory_resource managed_memory; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle); auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); @@ -1079,8 +1079,8 @@ void build_hierarchical(const raft::resources& handle, CounterT; // build coarse clusters (mesoclusters) - rmm::device_uvector mesocluster_labels_buf(n_rows, stream, &managed_memory); - rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); + rmm::device_uvector mesocluster_labels_buf(n_rows, stream, managed_memory); + rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, managed_memory); { rmm::device_uvector mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory); build_clusters(handle, @@ -1136,7 +1136,7 @@ void build_hierarchical(const raft::resources& handle, fine_clusters_nums_max, cluster_centers, mapping_op, - &managed_memory, + managed_memory, device_memory); RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); diff --git a/cpp/src/cluster/detail/kmeans_batched.cuh b/cpp/src/cluster/detail/kmeans_batched.cuh index e2fc8d334f..93888490a4 100644 --- a/cpp/src/cluster/detail/kmeans_batched.cuh +++ b/cpp/src/cluster/detail/kmeans_batched.cuh @@ -173,7 +173,7 @@ void accumulate_batch_centroids( cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto workspace = rmm::device_uvector( - batch_data.extent(0), stream, raft::resource::get_workspace_resource(handle)); + batch_data.extent(0), stream, raft::resource::get_workspace_resource_ref(handle)); cuvs::cluster::kmeans::detail::KeyValueIndexOp conversion_op; thrust::transform_iterator, diff --git a/cpp/src/cluster/kmeans_balanced.cuh b/cpp/src/cluster/kmeans_balanced.cuh index 0c0df03397..f3f52c2d8f 100644 --- a/cpp/src/cluster/kmeans_balanced.cuh +++ b/cpp/src/cluster/kmeans_balanced.cuh @@ -154,7 +154,7 @@ void predict(const raft::resources& handle, X.extent(0), labels.data_handle(), mapping_op, - raft::resource::get_workspace_resource(handle)); + raft::resource::get_workspace_resource_ref(handle)); } namespace helpers { @@ -305,7 +305,7 @@ void calc_centers_and_sizes(const raft::resources& handle, labels.data_handle(), reset_counters, mapping_op, - raft::resource::get_workspace_resource(handle)); + raft::resource::get_workspace_resource_ref(handle)); } } // namespace helpers diff --git a/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh b/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh index 6e60611df6..2bce856c6c 100644 --- a/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh +++ b/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -68,7 +68,7 @@ void build_clusters(const raft::resources& handle, labels.data_handle(), cluster_sizes.data_handle(), mapping_op, - raft::resource::get_workspace_resource(handle), + raft::resource::get_workspace_resource_ref(handle), X_norm.has_value() ? X_norm.value().data_handle() : nullptr); } diff --git a/cpp/src/distance/detail/masked_nn.cuh b/cpp/src/distance/detail/masked_nn.cuh index 315ecf9d7e..a3a187e8c4 100644 --- a/cpp/src/distance/detail/masked_nn.cuh +++ b/cpp/src/distance/detail/masked_nn.cuh @@ -246,9 +246,8 @@ void masked_l2_nn_impl(raft::resources const& handle, static_assert(P::Mblk == 64, "masked_l2_nn_impl only supports a policy with 64 rows per block."); // Get stream and workspace memory resource - rmm::mr::device_memory_resource* ws_mr = - dynamic_cast(raft::resource::get_workspace_resource(handle)); - auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_async_resource_ref ws_mr = raft::resource::get_workspace_resource_ref(handle); + auto stream = raft::resource::get_cuda_stream(handle); // Acquire temporary buffers and initialize to zero: // 1) Adjacency matrix bitfield diff --git a/cpp/src/neighbors/composite/index.cu b/cpp/src/neighbors/composite/index.cu index 57bd0816eb..7c02d0e43e 100644 --- a/cpp/src/neighbors/composite/index.cu +++ b/cpp/src/neighbors/composite/index.cu @@ -40,7 +40,7 @@ void composite_index::search( size_t buffer_size = num_queries * K * num_indices; auto main_stream = raft::resource::get_cuda_stream(handle); - auto tmp_res = raft::resource::get_workspace_resource(handle); + auto tmp_res = raft::resource::get_workspace_resource_ref(handle); rmm::device_uvector neighbors_buffer(buffer_size, main_stream, tmp_res); rmm::device_uvector distances_buffer(buffer_size, main_stream, tmp_res); diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 82bd6e755a..a7872f87a0 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -572,13 +572,14 @@ struct batch_load_iterator { * @param mr a custom memory resource for the intermediate buffer, if applicable. * @param prefetch enable prefetch feature in order to achieve kernel/copy overlapping. */ - batch_load_iterator(const T* source, - size_type n_rows, - size_type row_width, - size_type batch_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(), - bool prefetch = false) + batch_load_iterator( + const T* source, + size_type n_rows, + size_type row_width, + size_type batch_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(), + bool prefetch = false) : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr, prefetch)), cur_pos_(0), cur_prefetch_pos_(0) diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh index 71ec2457f6..f198cf957d 100644 --- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh +++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh @@ -70,7 +70,7 @@ void add_node_core( params.itopk_size = std::max(base_degree * 2lu, 256lu); // Memory space for rank-based neighbor list - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); auto neighbor_indices = raft::make_device_mdarray( handle, mr, raft::make_extents(max_search_batch_size, base_degree)); diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index 415ebc635c..dd2042bd12 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -1696,8 +1696,8 @@ void build_knn_graph( // If the workspace is smaller than desired, put the I/O buffers into the large workspace. rmm::device_async_resource_ref workspace_mr = - use_large_workspace ? raft::resource::get_large_workspace_resource(res) - : raft::resource::get_workspace_resource(res); + use_large_workspace ? raft::resource::get_large_workspace_resource_ref(res) + : raft::resource::get_workspace_resource_ref(res); RAFT_LOG_DEBUG( "IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u", @@ -2116,7 +2116,7 @@ auto iterative_build_graph( dev_query_view.extent(1), max_chunk_size, raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res)); + raft::resource::get_workspace_resource_ref(res)); for (const auto& batch : query_batch) { auto batch_dev_query_view = raft::make_device_matrix_view( batch.data(), batch.size(), dev_query_view.extent(1)); diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index 8546ad307e..d16759b2b0 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -530,7 +530,7 @@ void sort_knn_graph( const uint64_t input_graph_degree = knn_graph.extent(1); IdxT* const input_graph_ptr = knn_graph.data_handle(); - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto large_tmp_mr = raft::resource::get_large_workspace_resource_ref(res); auto d_input_graph = raft::make_device_mdarray( res, large_tmp_mr, raft::make_extents(graph_size, input_graph_degree)); @@ -1156,7 +1156,7 @@ void optimize( { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto large_tmp_mr = raft::resource::get_large_workspace_resource_ref(res); RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0), "Each input array is expected to have the same number of rows"); diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 68771fb895..68f22d7688 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -57,7 +57,7 @@ struct lightweight_uvector { if (new_size == size_) { return; } if (std::holds_alternative(res_)) { auto& h = std::get(res_); - res_ = rmm_res_type{raft::resource::get_workspace_resource(*h), + res_ = rmm_res_type{raft::resource::get_workspace_resource_ref(*h), raft::resource::get_cuda_stream(*h)}; } auto& [r, s] = std::get(res_); @@ -79,7 +79,7 @@ struct lightweight_uvector { if (new_size == size_) { return; } if (std::holds_alternative(res_)) { auto& h = std::get(res_); - res_ = rmm_res_type{raft::resource::get_workspace_resource(*h), stream}; + res_ = rmm_res_type{raft::resource::get_workspace_resource_ref(*h), stream}; } else { std::get(std::get(res_)) = stream; } diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 59b983b511..91d6619c7d 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -184,7 +184,7 @@ class device_matrix_view_from_host { // live on stack and not returned to a user. // The user may opt to set this resource to managed memory to allow large allocations. device_mem_.emplace(raft::make_device_mdarray( - res, raft::resource::get_large_workspace_resource(res), host_view.extents())); + res, raft::resource::get_large_workspace_resource_ref(res), host_view.extents())); raft::copy(res, device_mem_->view(), host_view); device_ptr = device_mem_->data_handle(); } @@ -270,7 +270,7 @@ void copy_with_padding( raft::resources const& res, raft::device_matrix& dst, raft::mdspan, raft::row_major, data_accessor> src, - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) { size_t padded_dim = raft::round_up_safe(src.extent(1) * sizeof(T), 16) / sizeof(T); diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index 7e62c32def..107bc39ee5 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -143,7 +143,7 @@ void batched_insert_vamana( auto query_ids = raft::make_device_vector(res, max_batchsize); auto query_list_ptr = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize + 1)); QueryCandidates* query_list = static_cast*>(query_list_ptr.data_handle()); @@ -151,11 +151,11 @@ void batched_insert_vamana( // Results of each batch of inserts during build - Memory is used by query_list structure auto visited_ids = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); auto visited_dists = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); // Assign memory to query_list structures and initialize @@ -167,14 +167,14 @@ void batched_insert_vamana( 1); auto topk_pq_mem = raft::make_device_mdarray>(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); int align_padding = raft::alignTo(dim, 16) - dim; auto s_coords_mem = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(min(maxBlocks, max(max_batchsize, reverse_batch)), dim + align_padding)); @@ -318,7 +318,7 @@ void batched_insert_vamana( // compute prefix sums of query_list sizes - TODO parallelize prefix sums // auto d_total_edges = raft::make_device_mdarray( - // res, raft::resource::get_workspace_resource(res), raft::make_extents(1)); + // res, raft::resource::get_workspace_resource_ref(res), raft::make_extents(1)); rmm::device_scalar d_total_edges(stream); prefix_sums_sizes<<<1, 1, 0, stream>>>(query_list, step_size, d_total_edges.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); @@ -329,16 +329,16 @@ void batched_insert_vamana( auto edge_dist_pair = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); auto edge_dest = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); auto edge_src = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); // Create reverse edge list @@ -367,7 +367,7 @@ void batched_insert_vamana( auto temp_sort_storage = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(temp_storage_bytes / sizeof(IdxT))); // Sort to group reverse edges by destination @@ -406,7 +406,7 @@ void batched_insert_vamana( auto temp_sort_storage = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(temp_storage_bytes / sizeof(IdxT))); // Sort to group reverse edges by destination @@ -451,16 +451,16 @@ void batched_insert_vamana( // Allocate reverse QueryCandidate list based on number of unique destinations auto reverse_list_ptr = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(reverse_batch)); auto rev_ids = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(reverse_batch, visited_size)); auto rev_dists = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(reverse_batch, visited_size)); QueryCandidates* reverse_list = @@ -662,7 +662,7 @@ index build( dim, max_batch_size, raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res))) { + raft::resource::get_workspace_resource_ref(res))) { // perform rotation auto dataset_rotated = raft::make_device_matrix(res, batch.size(), dim); if constexpr (std::is_same_v) { diff --git a/cpp/src/neighbors/detail/vpq_dataset.cuh b/cpp/src/neighbors/detail/vpq_dataset.cuh index cbe06f5ca4..a186ec6188 100644 --- a/cpp/src/neighbors/detail/vpq_dataset.cuh +++ b/cpp/src/neighbors/detail/vpq_dataset.cuh @@ -508,7 +508,7 @@ void process_and_fill_codes( dim, max_batch_size, stream, - rmm::mr::get_current_device_resource())) { + rmm::mr::get_current_device_resource_ref())) { auto batch_view = raft::make_device_matrix_view(batch.data(), ix_t(batch.size()), dim); auto batch_labels_view = raft::make_device_vector_view(nullptr, 0); if (inline_vq_labels) { @@ -901,7 +901,7 @@ void process_and_fill_codes_subspaces( dim, max_batch_size, copy_stream, - raft::resource::get_workspace_resource(res), + raft::resource::get_workspace_resource_ref(res), enable_prefetch_stream); vec_batches.prefetch_next_batch(); for (const auto& batch : vec_batches) { diff --git a/cpp/src/neighbors/ivf_common.cu b/cpp/src/neighbors/ivf_common.cu index 9fb7b59b0f..b87a14f7c3 100644 --- a/cpp/src/neighbors/ivf_common.cu +++ b/cpp/src/neighbors/ivf_common.cu @@ -79,7 +79,7 @@ void sort_cluster_sizes_descending(uint32_t* input, uint32_t* output, uint32_t n_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* tmp_res) + rmm::device_async_resource_ref tmp_res) { int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; diff --git a/cpp/src/neighbors/ivf_common.cuh b/cpp/src/neighbors/ivf_common.cuh index 80aac970dd..e466a13fd7 100644 --- a/cpp/src/neighbors/ivf_common.cuh +++ b/cpp/src/neighbors/ivf_common.cuh @@ -20,7 +20,7 @@ void sort_cluster_sizes_descending(uint32_t* input, uint32_t* output, uint32_t n_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* tmp_res); + rmm::device_async_resource_ref tmp_res); /** * Default value returned by `search` when the `n_probes` is too small and top-k is too large. @@ -256,7 +256,7 @@ template void recompute_internal_state(const raft::resources& res, Index& index) { auto stream = raft::resource::get_cuda_stream(res); - auto tmp_res = raft::resource::get_workspace_resource(res); + auto tmp_res = raft::resource::get_workspace_resource_ref(res); rmm::device_uvector sorted_sizes(index.n_lists(), stream, tmp_res); // Actualize the list pointers diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index 229c703505..35005b4279 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -180,8 +180,10 @@ void extend(raft::resources const& handle, RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, "You must pass data indices when the index is non-empty."); - auto new_labels = raft::make_device_mdarray( - handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents(n_rows)); + auto new_labels = + raft::make_device_mdarray(handle, + raft::resource::get_large_workspace_resource_ref(handle), + raft::make_extents(n_rows)); cuvs::cluster::kmeans::balanced_params kmeans_params; kmeans_params.metric = index->metric(); auto orig_centroids_view = @@ -206,7 +208,7 @@ void extend(raft::resources const& handle, index->dim(), max_batch_size, copy_stream, - raft::resource::get_workspace_resource(handle), + raft::resource::get_workspace_resource_ref(handle), enable_prefetch); vec_batches.prefetch_next_batch(); @@ -225,7 +227,7 @@ void extend(raft::resources const& handle, auto* list_sizes_ptr = index->list_sizes().data_handle(); auto old_list_sizes_dev = raft::make_device_mdarray( - handle, raft::resource::get_workspace_resource(handle), raft::make_extents(n_lists)); + handle, raft::resource::get_workspace_resource_ref(handle), raft::make_extents(n_lists)); raft::copy(handle, old_list_sizes_dev.view(), raft::make_device_vector_view(list_sizes_ptr, n_lists)); @@ -294,8 +296,12 @@ void extend(raft::resources const& handle, raft::make_device_vector_view(list_sizes_ptr, n_lists), raft::make_device_vector_view(old_list_sizes_dev.data_handle(), n_lists)); - utils::batch_load_iterator vec_indices( - new_indices, n_rows, 1, max_batch_size, stream, raft::resource::get_workspace_resource(handle)); + utils::batch_load_iterator vec_indices(new_indices, + n_rows, + 1, + max_batch_size, + stream, + raft::resource::get_workspace_resource_ref(handle)); vec_batches.reset(); vec_batches.prefetch_next_batch(); utils::batch_load_iterator idx_batch = vec_indices.begin(); @@ -410,7 +416,7 @@ inline auto build(raft::resources const& handle, 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); auto n_rows_train = n_rows / trainset_ratio; rmm::device_uvector trainset( - n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle)); + n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource_ref(handle)); // TODO: a proper sampling raft::copy_matrix(trainset.data(), index.dim(), @@ -469,7 +475,7 @@ inline void fill_refinement_index(raft::resources const& handle, "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries)); rmm::device_uvector new_labels( - n_queries * n_candidates, stream, raft::resource::get_workspace_resource(handle)); + n_queries * n_candidates, stream, raft::resource::get_workspace_resource_ref(handle)); auto new_labels_view = raft::make_device_vector_view(new_labels.data(), n_queries * n_candidates); raft::linalg::map_offset( diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh index f42ffdc837..960d48c818 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh @@ -355,21 +355,22 @@ inline void search_with_filtering(raft::resources const& handle, for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = raft::min(max_queries, n_queries - offset_q); - search_impl(handle, - index, - effective_metric, - params, - queries + offset_q * index.dim(), - queries_batch, - offset_q, - k, - n_probes, - max_samples, - cuvs::distance::is_min_close(effective_metric), - neighbors + offset_q * k, - distances + offset_q * k, - raft::resource::get_workspace_resource(handle), - sample_filter); + search_impl( + handle, + index, + effective_metric, + params, + queries + offset_q * index.dim(), + queries_batch, + offset_q, + k, + n_probes, + max_samples, + cuvs::distance::is_min_close(effective_metric), + neighbors + offset_q * k, + distances + offset_q * k, + raft::resource::get_workspace_resource_ref(handle), + sample_filter); } } diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index 05c0176faa..d7eb46c5a3 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -83,7 +83,7 @@ void select_residuals(raft::resources const& handle, const float* center, // [dim] const T* dataset, // [.., dim] const IdxT* row_ids, // [n_rows] - rmm::mr::device_memory_resource* device_memory + rmm::device_async_resource_ref device_memory ) { @@ -326,7 +326,7 @@ void train_per_subset(raft::resources const& handle, uint32_t max_train_points_per_pq_code) { auto stream = raft::resource::get_cuda_stream(handle); - auto device_memory = raft::resource::get_workspace_resource(handle); + auto device_memory = raft::resource::get_workspace_resource_ref(handle); rmm::device_uvector pq_centers_tmp(impl->pq_centers().size(), stream, device_memory); // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. @@ -408,7 +408,7 @@ void train_per_cluster(raft::resources const& handle, uint32_t max_train_points_per_pq_code) { auto stream = raft::resource::get_cuda_stream(handle); - auto device_memory = raft::resource::get_workspace_resource(handle); + auto device_memory = raft::resource::get_workspace_resource_ref(handle); // NB: Managed memory is used for small arrays accessed from both device and host. There's no // performance reasoning behind this, just avoiding the boilerplate of explicit copies. rmm::mr::managed_memory_resource managed_memory; @@ -588,7 +588,7 @@ void reconstruct_list_data(raft::resources const& res, auto tmp = raft::make_device_mdarray(res, - raft::resource::get_workspace_resource(res), + raft::resource::get_workspace_resource_ref(res), raft::make_extents(n_rows, index.rot_dim())); constexpr uint32_t kBlockSize = 256; @@ -615,7 +615,7 @@ void reconstruct_list_data(raft::resources const& res, float* out_float_ptr = nullptr; rmm::device_uvector out_float_buf( - 0, raft::resource::get_cuda_stream(res), raft::resource::get_workspace_resource(res)); + 0, raft::resource::get_cuda_stream(res), raft::resource::get_workspace_resource_ref(res)); if constexpr (std::is_same_v) { out_float_ptr = out_vectors.data_handle(); } else { @@ -698,7 +698,7 @@ void encode_list_data(raft::resources const& res, auto n_rows = new_vectors.extent(0); if (n_rows == 0) { return; } - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto new_vectors_residual = raft::make_device_mdarray( res, mr, raft::make_extents(n_rows, index->rot_dim())); @@ -989,9 +989,9 @@ void extend(raft::resources const& handle, std::is_same_v, "Unsupported data type"); - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle); rmm::device_async_resource_ref large_memory = - raft::resource::get_large_workspace_resource(handle); + raft::resource::get_large_workspace_resource_ref(handle); // Try to allocate an index with the same parameters and the projected new size // (which can be slightly larger than index->size() + n_rows, due to padding for interleaved). @@ -1255,13 +1255,14 @@ auto build(raft::resources const& handle, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, impl->n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = + raft::resource::get_workspace_resource_ref(handle); // If the trainset is small enough to comfortably fit into device memory, put it there. // Otherwise, use the managed memory. constexpr size_t kTolerableRatio = 4; rmm::device_async_resource_ref big_memory_resource = - raft::resource::get_large_workspace_resource(handle); + raft::resource::get_large_workspace_resource_ref(handle); if (sizeof(float) * n_rows_train * impl->dim() * kTolerableRatio < raft::resource::get_workspace_free_bytes(handle)) { big_memory_resource = device_memory; diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh index 598cf37245..08fcd1f09a 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh @@ -69,7 +69,7 @@ void select_clusters(raft::resources const& handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const float* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -179,7 +179,7 @@ void select_clusters(raft::resources const& handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const int8_t* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -267,7 +267,7 @@ void select_clusters(raft::resources const& handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const half* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -440,7 +440,7 @@ void ivfpq_search_worker(raft::resources const& handle, topK, index.dim()); auto stream = raft::resource::get_cuda_stream(handle); - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); auto topk_len = manage_local_topk ? n_probes * topK : max_samples; @@ -927,7 +927,7 @@ inline void search(raft::resources const& handle, max_samples = ms; } - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); // Maximum number of query vectors to search at the same time. // Number of queries in the outer loop, which includes query transform and coarse search. diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh index ba92c53231..1d55692488 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh @@ -51,7 +51,7 @@ void transform_batch(raft::resources const& res, raft::device_matrix_view output_dataset) { IdxT n_rows = dataset.extent(0); - rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource_ref(res); // Compute the labels for each vector cuvs::cluster::kmeans::balanced_params kmeans_params; @@ -115,7 +115,7 @@ void transform(raft::resources const& res, raft::common::nvtx::range fun_scope( "ivf_pq::transform(n_rows = %u, dim = %u)", n_rows, dataset.extent(1)); - rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource_ref(res); // The cluster centers in the index are stored padded, which is not acceptable by // the kmeans_balanced::predict. Thus, we need the restructuring raft::copy. @@ -138,7 +138,7 @@ void transform(raft::resources const& res, } constexpr size_t max_batch_size = 65536; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(res); utils::batch_load_iterator vec_batches(dataset.data_handle(), n_rows, diff --git a/cpp/src/neighbors/scann/detail/scann_avq.cuh b/cpp/src/neighbors/scann/detail/scann_avq.cuh index 6c3bb045e4..0e138ce1a3 100644 --- a/cpp/src/neighbors/scann/detail/scann_avq.cuh +++ b/cpp/src/neighbors/scann/detail/scann_avq.cuh @@ -59,7 +59,7 @@ void compute_cluster_offsets(raft::resources const& dev_resources, { cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); // Histrogram to compute cluster sizes int num_levels = cluster_sizes.extent(0) + 1; @@ -138,7 +138,7 @@ void sum_reduce_vector(raft::resources const& dev_resources, { cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); size_t temp_storage_bytes = 0; @@ -166,7 +166,7 @@ void cholesky_solver(raft::resources const& dev_resources, cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); cusolverDnHandle_t cusolverH = raft::resource::get_cusolver_dn_handle(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); // RAFT_CUSOLVER_TRY(cusolverDnSetStream(cusolverH, stream)); diff --git a/cpp/src/neighbors/scann/detail/scann_build.cuh b/cpp/src/neighbors/scann/detail/scann_build.cuh index 37f74f29bb..bfe38209f7 100644 --- a/cpp/src/neighbors/scann/detail/scann_build.cuh +++ b/cpp/src/neighbors/scann/detail/scann_build.cuh @@ -87,7 +87,7 @@ index build( raft::device_vector_view labels_view = idx.labels(); // setup batching for kmeans prediction + quantization - auto* device_memory = raft::resource::get_workspace_resource(res); + auto device_memory = raft::resource::get_workspace_resource_ref(res); constexpr size_t kReasonableMaxBatchSize = 65536; size_t max_batch_size = std::min(dataset.extent(0), kReasonableMaxBatchSize); diff --git a/cpp/src/preprocessing/quantize/detail/binary.cuh b/cpp/src/preprocessing/quantize/detail/binary.cuh index fe8288a7cb..6695b8b126 100644 --- a/cpp/src/preprocessing/quantize/detail/binary.cuh +++ b/cpp/src/preprocessing/quantize/detail/binary.cuh @@ -146,7 +146,7 @@ void mean_f16_in_f32(raft::resources const& res, const size_t dataset_size, cudaStream_t cuda_stream) { - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto f32_result_vec = raft::make_device_mdarray(res, mr, raft::make_extents(dataset_dim)); raft::matrix::fill(res, f32_result_vec.view(), float(0)); @@ -212,7 +212,7 @@ auto train(raft::resources const& res, static_cast(dataset_dim)); raft::random::RngState rng(29837lu); - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto sampled_dataset_chunk = raft::make_device_mdarray( res, mr, raft::make_extents(num_samples, max_dim_chunk)); auto transposed_sampled_dataset_chunk = raft::make_device_mdarray( @@ -331,7 +331,7 @@ auto train(raft::resources const& res, raft::make_host_vector_view(host_threshold_vec.data(), (int64_t)dataset_dim)); } else { - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto casted_vec = raft::make_device_mdarray( res, mr, raft::make_extents(dataset_dim)); raft::copy(res, @@ -425,7 +425,7 @@ void transform(raft::resources const& res, raft::make_device_vector_view(quantizer.threshold.data_handle(), (int64_t)dataset_dim)); } else { - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto casted_vec = raft::make_device_mdarray( res, mr, raft::make_extents(dataset_dim)); raft::linalg::map(res, diff --git a/cpp/src/preprocessing/quantize/detail/pq.cuh b/cpp/src/preprocessing/quantize/detail/pq.cuh index 77fb0ac4f9..74fa5fae2a 100644 --- a/cpp/src/preprocessing/quantize/detail/pq.cuh +++ b/cpp/src/preprocessing/quantize/detail/pq.cuh @@ -91,7 +91,7 @@ auto train_pq_subspaces( auto trainset_ptr = !vq_centers.empty() ? pq_trainset.data_handle() : dataset.data_handle(); auto sub_labels = raft::make_device_vector(res, 0); auto pq_cluster_sizes = raft::make_device_vector(res, 0); - auto device_memory = raft::resource::get_workspace_resource(res); + auto device_memory = raft::resource::get_workspace_resource_ref(res); if (params.pq_kmeans_type == cuvs::cluster::kmeans::kmeans_type::KMeansBalanced) { sub_labels = raft::make_device_mdarray( res, device_memory, raft::make_extents(n_rows_train)); diff --git a/cpp/tests/neighbors/ann_ivf_pq.cuh b/cpp/tests/neighbors/ann_ivf_pq.cuh index 4ce9c96077..033f0af9c2 100644 --- a/cpp/tests/neighbors/ann_ivf_pq.cuh +++ b/cpp/tests/neighbors/ann_ivf_pq.cuh @@ -103,7 +103,7 @@ void compare_vectors_l2( auto dim = a.extent(1); rmm::mr::managed_memory_resource managed_memory; auto dist = - raft::make_device_mdarray(res, &managed_memory, raft::make_extents(n_rows)); + raft::make_device_mdarray(res, managed_memory, raft::make_extents(n_rows)); raft::linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) { cuvs::spatial::knn::detail::utils::mapping f{}; double d = 0.0f; diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index 8a908c0187..cbc95d7bb7 100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -16,7 +16,6 @@ #include #include -#include #include "naive_knn.cuh" diff --git a/cpp/tests/neighbors/naive_knn.cuh b/cpp/tests/neighbors/naive_knn.cuh index 058cf8571d..d21e982902 100644 --- a/cpp/tests/neighbors/naive_knn.cuh +++ b/cpp/tests/neighbors/naive_knn.cuh @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include namespace cuvs::neighbors { @@ -103,7 +104,7 @@ void naive_knn(raft::resources const& handle, uint32_t k, cuvs::distance::DistanceType type) { - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1); @@ -131,8 +132,7 @@ void naive_knn(raft::resources const& handle, static_cast(k), dist_topk + offset * k, indices_topk + offset * k, - cuvs::distance::is_min_close(type), - mr); + cuvs::distance::is_min_close(type)); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/tests/preprocessing/product_quantization.cu b/cpp/tests/preprocessing/product_quantization.cu index f1b84854f1..f637f446af 100644 --- a/cpp/tests/preprocessing/product_quantization.cu +++ b/cpp/tests/preprocessing/product_quantization.cu @@ -53,7 +53,7 @@ void compare_vectors_l2(const raft::resources& res, auto dim = a.extent(1); rmm::mr::managed_memory_resource managed_memory; auto dist = - raft::make_device_mdarray(res, &managed_memory, raft::make_extents(n_rows)); + raft::make_device_mdarray(res, managed_memory, raft::make_extents(n_rows)); raft::linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) { double d = 0.0f; for (uint32_t j = 0; j < dim; j++) {