diff --git a/README.md b/README.md index 77377fe2bbc..4bdbcd00280 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ As of Release 0.18 - including 0.18 nightly | Community | | | | | | EgoNet | Single-GPU | | | | Leiden | Single-GPU | | -| | Louvain | Multi-GPU | | +| | Louvain | Multi-GPU | [C++ README](cpp/src/community/README.md#Louvain) | | | Ensemble Clustering for Graphs | Single-GPU | | | | Spectral-Clustering - Balanced Cut | Single-GPU | | | | Spectral-Clustering - Modularity | Single-GPU | | @@ -71,16 +71,16 @@ As of Release 0.18 - including 0.18 nightly | Linear Assignment| | | | | | Hungarian | Single-GPU | [README](cpp/src/linear_assignment/README-hungarian.md) | | Link Analysis| | | | -| | Pagerank | Multi-GPU | | -| | Personal Pagerank | Multi-GPU | | +| | Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Pagerank) | +| | Personal Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank) | | | HITS | Single-GPU | leverages Gunrock | | Link Prediction | | | | | | Jaccard Similarity | Single-GPU | | | | Weighted Jaccard Similarity | Single-GPU | | | | Overlap Similarity | Single-GPU | | | Traversal | | | | -| | Breadth First Search (BFS) | Multi-GPU | with cutoff support | -| | Single Source Shortest Path (SSSP) | Multi-GPU | | +| | Breadth First Search (BFS) | Multi-GPU | with cutoff support
[C++ README](cpp/src/traversal/README.md#BFS) | +| | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) | | | Traveling Salesperson Problem (TSP) | Single-GPU | | | Structure | | | | | | Renumbering | Single-GPU | multiple columns, any data type | diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index d69448cda4e..8d12b10a640 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -26,6 +26,7 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Use Ninja to build export CMAKE_GENERATOR="Ninja" +export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" ################################################################################ # SETUP - Check environment @@ -58,18 +59,20 @@ conda config --set ssl_verify False gpuci_logger "Build conda pkg for libcugraph" if [ "$BUILD_LIBCUGRAPH" == '1' ]; then if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - conda build conda/recipes/libcugraph + gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph else - conda build --dirty --no-remove-work-dir conda/recipes/libcugraph + gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libcugraph + mkdir -p ${CONDA_BLD_DIR}/libcugraph/work + cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcugraph/work fi fi gpuci_logger "Build conda pkg for cugraph" if [ "$BUILD_CUGRAPH" == "1" ]; then if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then - conda build conda/recipes/cugraph --python=$PYTHON + gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON else - conda build conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON + gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/.conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON fi fi diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 0fca82216c3..50e4c25b90b 100644 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
# # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh @@ -29,8 +30,8 @@ fi gpuci_logger "Get conda file output locations" -export LIBCUGRAPH_FILE=`conda build conda/recipes/libcugraph --output` -export CUGRAPH_FILE=`conda build conda/recipes/cugraph --python=$PYTHON --output` +export LIBCUGRAPH_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph --output` +export CUGRAPH_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON --output` ################################################################################ # UPLOAD - Conda packages diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 7242b4a11f5..30dc7373e15 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -16,6 +16,7 @@ function hasArg { export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} export CUDA_REL=${CUDA_VERSION%.*} +export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cugraph/cpu/.conda-bld/ function cleanup { gpuci_logger "Removing datasets and temp files" @@ -101,11 +102,11 @@ else chrpath -d libcugraph.so patchelf --replace-needed `patchelf --print-needed libcugraph.so | grep faiss` libfaiss.so libcugraph.so - CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"` + CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcugraph*.tar.bz2"` CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install echo "Installing $CONDA_FILE" - conda install -c $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ "$CONDA_FILE" + conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" echo "Build cugraph..." $WORKSPACE/build.sh cugraph diff --git a/ci/test.sh b/ci/test.sh index 58cbb950f73..31660cd15ec 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -66,13 +66,17 @@ fi # EXITCODE for the script. set +e -echo "C++ gtests for cuGraph..." -for gt in tests/*_TEST; do - test_name=$(basename $gt) - echo "Running gtest $test_name" - ${gt} ${GTEST_FILTER} ${GTEST_ARGS} - echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE" -done +if (python ${CUGRAPH_ROOT}/ci/utils/is_pascal.py); then + echo "WARNING: skipping C++ tests on Pascal GPU arch." +else + echo "C++ gtests for cuGraph..." + for gt in tests/*_TEST; do + test_name=$(basename $gt) + echo "Running gtest $test_name" + ${gt} ${GTEST_FILTER} ${GTEST_ARGS} + echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE" + done +fi echo "Python pytest for cuGraph..." cd ${CUGRAPH_ROOT}/python diff --git a/ci/utils/is_pascal.py b/ci/utils/is_pascal.py new file mode 100644 index 00000000000..e55a3153a12 --- /dev/null +++ b/ci/utils/is_pascal.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import sys +import glob + +from numba import cuda + +# FIXME: consolidate this code with ci/gpu/notebook_list.py + +# +# Not strictly true... however what we mean is +# Pascal or earlier +# +pascal = False + +device = cuda.get_current_device() +# check for the attribute using both pre and post numba 0.53 names +cc = getattr(device, 'COMPUTE_CAPABILITY', None) or \ + getattr(device, 'compute_capability') +if (cc[0] < 7): + pascal = True + +# Return zero (success) if pascal is True +if pascal: + sys.exit(0) +else: + sys.exit(1) diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml index f26c3dd45d9..a138f5e80df 100644 --- a/conda/environments/cugraph_dev_cuda10.1.yml +++ b/conda/environments/cugraph_dev_cuda10.1.yml @@ -18,7 +18,7 @@ dependencies: - ucx-py=0.19* - ucx-proc=*=gpu - scipy -- networkx +- networkx>=2.5.1 - python-louvain - cudatoolkit=10.1 - clang=8.0.1 diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml index 2848cc49dc7..d53fefc086a 100644 --- a/conda/environments/cugraph_dev_cuda10.2.yml +++ b/conda/environments/cugraph_dev_cuda10.2.yml @@ -18,7 +18,7 @@ dependencies: - ucx-py=0.19* - ucx-proc=*=gpu - scipy -- networkx +- networkx>=2.5.1 - python-louvain - cudatoolkit=10.2 - clang=8.0.1 diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml index 82e8b409d13..771b175aa92 100644 --- a/conda/environments/cugraph_dev_cuda11.0.yml +++ b/conda/environments/cugraph_dev_cuda11.0.yml @@ -18,7 +18,7 @@ dependencies: - ucx-py=0.19* - ucx-proc=*=gpu - scipy -- networkx +- networkx>=2.5.1 - python-louvain - cudatoolkit=11.0 - clang=8.0.1 diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index e714b61d774..1ef64ddbe72 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -10,7 +10,7 @@ package: version: {{ version }} source: - path: ../../.. + git_url: ../../.. 
build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 26a8f98e265..0388a76d729 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -18,6 +18,10 @@ cmake_minimum_required(VERSION 3.18...3.18 FATAL_ERROR) project(CUGRAPH VERSION 0.19.0 LANGUAGES C CXX CUDA) +# Write the version header +include(cmake/Modules/Version.cmake) +write_version() + ################################################################################################### # - build type ------------------------------------------------------------------------------------ @@ -112,6 +116,7 @@ set(FAISS_GPU_ARCHS "${FAISS_GPU_ARCHS} -gencode arch=compute_${ptx},code=comput set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas --disable-warnings") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all") # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking @@ -271,7 +276,7 @@ message("set LIBCUDACXX_INCLUDE_DIR to: ${LIBCUDACXX_INCLUDE_DIR}") FetchContent_Declare( cuhornet GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git - GIT_TAG 9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad + GIT_TAG e58d0ecdbc270fc28867d66c965787a62a7a882c GIT_SHALLOW true SOURCE_SUBDIR hornet ) @@ -297,7 +302,7 @@ else(DEFINED ENV{RAFT_PATH}) FetchContent_Declare( raft GIT_REPOSITORY https://github.com/rapidsai/raft.git - GIT_TAG 6455e05b3889db2b495cf3189b33c2b07bfbebf2 + GIT_TAG f0cd81fb49638eaddc9bf18998cc894f292bc293 SOURCE_SUBDIR raft ) @@ -397,6 +402,7 @@ endif(BUILD_STATIC_FAISS) add_library(cugraph SHARED src/utilities/spmv_1D.cu src/utilities/cython.cu + src/utilities/path_retrieval.cu src/structure/graph.cu src/linear_assignment/hungarian.cu src/link_analysis/gunrock_hits.cpp @@ -416,6 +422,7 @@ add_library(cugraph SHARED src/community/triangles_counting.cu src/community/extract_subgraph_by_vertex.cu src/community/egonet.cu + src/sampling/random_walks.cu src/cores/core_number.cu src/traversal/two_hop_neighbors.cu src/components/connectivity.cu @@ -426,6 +433,7 @@ add_library(cugraph SHARED src/experimental/graph_view.cu src/experimental/coarsen_graph.cu src/experimental/renumber_edgelist.cu + src/experimental/renumber_utils.cu src/experimental/relabel.cu src/experimental/induced_subgraph.cu src/experimental/bfs.cu @@ -560,6 +568,9 @@ install(TARGETS cugraph LIBRARY install(DIRECTORY include/ DESTINATION include/cugraph) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp + DESTINATION include/cugraph) + install(DIRECTORY ${RAFT_DIR}/cpp/include/raft/ DESTINATION include/cugraph/raft) ################################################################################################### diff --git a/cpp/cmake/Modules/Version.cmake b/cpp/cmake/Modules/Version.cmake new file mode 100644 index 00000000000..15046784175 --- /dev/null +++ b/cpp/cmake/Modules/Version.cmake @@ -0,0 +1,18 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# Generate version_config.hpp from the version found in CMakeLists.txt
+function(write_version)
+  message(STATUS "CUGRAPH VERSION: ${CUGRAPH_VERSION}")
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_config.hpp.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp @ONLY)
+endfunction(write_version)
diff --git a/cpp/cmake/version_config.hpp.in b/cpp/cmake/version_config.hpp.in
new file mode 100644
index 00000000000..c669d1b97f3
--- /dev/null
+++ b/cpp/cmake/version_config.hpp.in
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#define CUGRAPH_VERSION_MAJOR @CUGRAPH_VERSION_MAJOR@
+#define CUGRAPH_VERSION_MINOR @CUGRAPH_VERSION_MINOR@
+#define CUGRAPH_VERSION_PATCH @CUGRAPH_VERSION_PATCH@
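The two files above wire a generated version header into the build. As a quick illustration of what the generated header provides (a sketch only, not part of the patch; the include path assumes the `install(FILES ...)` destination added to `cpp/CMakeLists.txt` in this PR):

```cpp
// Hypothetical consumer of the generated version header.
#include <cugraph/version_config.hpp>

#include <cstdio>

int main()
{
  // The three macros are filled in from the project() version in CMakeLists.txt.
  std::printf("libcugraph %d.%d.%d\n",
              CUGRAPH_VERSION_MAJOR,
              CUGRAPH_VERSION_MINOR,
              CUGRAPH_VERSION_PATCH);
  return 0;
}
```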
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index c3a4f3ec985..0b45b799357 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -14,10 +14,14 @@
  * limitations under the License.
  */
 #pragma once
+
+#include <dendrogram.hpp>
 #include <graph.hpp>
 #include <internals.hpp>
+
 #include <experimental/graph.hpp>
 #include <experimental/graph_view.hpp>
+
 #include <raft/handle.hpp>
 
 namespace cugraph {
@@ -218,7 +222,7 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
  * @param[out] route Device array containing the returned route.
  *
  */
-float traveling_salesperson(raft::handle_t &handle,
+float traveling_salesperson(raft::handle_t const &handle,
                             int const *vtx_ptr,
                             float const *x_pos,
                             float const *y_pos,
@@ -612,7 +616,7 @@ weight_t hungarian(raft::handle_t const &handle,
  *
  * @throws cugraph::logic_error when an error occurs.
  *
- * @tparam graph_t Type of graph
+ * @tparam graph_view_t Type of graph
  *
  * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
  * @param[in] graph input graph object (CSR)
@@ -629,13 +633,74 @@ weight_t hungarian(raft::handle_t const &handle,
  *    2) modularity of the returned clustering
  *
  */
-template <typename graph_t>
-std::pair<size_t, typename graph_t::weight_type> louvain(
+template <typename graph_view_t>
+std::pair<size_t, typename graph_view_t::weight_type> louvain(
   raft::handle_t const &handle,
-  graph_t const &graph,
-  typename graph_t::vertex_type *clustering,
-  size_t max_level = 100,
-  typename graph_t::weight_type resolution = typename graph_t::weight_type{1});
+  graph_view_t const &graph_view,
+  typename graph_view_t::vertex_type *clustering,
+  size_t max_level = 100,
+  typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1});
+
+/**
+ * @brief Louvain implementation, returning dendrogram
+ *
+ * Compute a clustering of the graph by maximizing modularity
+ *
+ * Computed using the Louvain method described in:
+ *
+ *    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
+ *    community hierarchies in large networks, J Stat Mech P10008 (2008),
+ *    http://arxiv.org/abs/0803.0476
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam graph_view_t Type of graph
+ *
+ * @param[in] handle Library handle (RAFT)
+ * @param[in] graph_view Input graph view object (CSR)
+ * @param[in] max_level (optional) maximum number of levels to run (default 100)
+ * @param[in] resolution (optional) The value of the resolution parameter to use.
+ *                       Called gamma in the modularity formula, this changes the size
+ *                       of the communities. Higher resolutions lead to more, smaller
+ *                       communities; lower resolutions lead to fewer, larger
+ *                       communities. (default 1)
+ *
+ * @return a pair containing:
+ *           1) unique pointer to dendrogram
+ *           2) modularity of the returned clustering
+ *
+ */
+template <typename graph_view_t>
+std::pair<std::unique_ptr<Dendrogram<typename graph_view_t::vertex_type>>,
+          typename graph_view_t::weight_type>
+louvain(raft::handle_t const &handle,
+        graph_view_t const &graph_view,
+        size_t max_level = 100,
+        typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1});
+
+/**
+ * @brief Flatten a Dendrogram at a particular level
+ *
+ * A Dendrogram represents a hierarchical clustering/partitioning of
+ * a graph. This function will flatten the hierarchical clustering into
+ * a label for each vertex representing the final cluster/partition to
+ * which it is assigned
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam graph_view_t Type of graph
+ *
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ * the multi-GPU implementation will be used.
+ * @param[in] graph input graph object
+ * @param[in] dendrogram input dendrogram object
+ * @param[out] clustering Pointer to device array where the clustering should be stored
+ *
+ */
+template <typename graph_view_t>
+void flatten_dendrogram(raft::handle_t const &handle,
+                        graph_view_t const &graph_view,
+                        Dendrogram<typename graph_view_t::vertex_type> const &dendrogram,
+                        typename graph_view_t::vertex_type *clustering);
 
 /**
  * @brief Leiden implementation
@@ -1187,5 +1252,33 @@ extract_ego(raft::handle_t const &handle,
             vertex_t *source_vertex,
             vertex_t n_subgraphs,
             vertex_t radius);
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Uniform distribution is assumed for the random engine.
+ *
+ * @tparam graph_t Type of graph/view (typically, graph_view_t).
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph (view) object to generate the RWs on.
+ * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
+ * @param num_paths Number of paths (i.e., the number of starting vertices).
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>> Triplet of coalesced RW paths, with corresponding edge weights for
+ * each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed to
+ * the Python layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes
+ * is represented as a 1D array.
+ */
+template <typename graph_t, typename index_t>
+std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
+           rmm::device_uvector<typename graph_t::weight_type>,
+           rmm::device_uvector<index_t>>
+random_walks(raft::handle_t const &handle,
+             graph_t const &graph,
+             typename graph_t::vertex_type const *ptr_d_start,
+             index_t num_paths,
+             index_t max_depth);
 }  // namespace experimental
 }  // namespace cugraph
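For context, the new Louvain API splits clustering into a dendrogram-building step and a flattening step. A minimal sketch of driving it (illustrative only, not part of the patch; `handle` and a single-GPU `graph_view`, plus its `vertex_t`/`weight_t` types, are assumed to already exist):

```cpp
// Run Louvain, keeping the full hierarchy instead of only the final labels.
auto result     = cugraph::louvain(handle, graph_view, size_t{100}, weight_t{1});
auto dendrogram = std::move(result.first);   // std::unique_ptr<Dendrogram<vertex_t>>
auto modularity = result.second;

// Flatten the hierarchy into one cluster label per vertex.
rmm::device_uvector<vertex_t> clustering(graph_view.get_number_of_local_vertices(),
                                         handle.get_stream());
cugraph::flatten_dendrogram(handle, graph_view, *dendrogram, clustering.data());
```

Keeping the dendrogram around lets callers inspect or re-flatten intermediate levels without re-running the clustering.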
diff --git a/cpp/src/community/dendrogram.cuh b/cpp/include/dendrogram.hpp
similarity index 53%
rename from cpp/src/community/dendrogram.cuh
rename to cpp/include/dendrogram.hpp
index 414f5f3854d..aa0802e80b3 100644
--- a/cpp/src/community/dendrogram.cuh
+++ b/cpp/include/dendrogram.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <memory>
 #include <vector>
@@ -25,30 +25,26 @@ namespace cugraph {
 template <typename vertex_t>
 class Dendrogram {
  public:
-  void add_level(vertex_t num_verts,
-                 cudaStream_t stream = 0,
+  void add_level(vertex_t first_index,
+                 vertex_t num_verts,
+                 cudaStream_t stream,
                  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
   {
-    level_ptr_.push_back(
-      std::make_unique<rmm::device_buffer>(num_verts * sizeof(vertex_t), stream, mr));
-    level_size_.push_back(num_verts);
+    level_ptr_.push_back(std::make_unique<rmm::device_uvector<vertex_t>>(num_verts, stream, mr));
+    level_first_index_.push_back(first_index);
   }
 
-  size_t current_level() const { return level_size_.size() - 1; }
+  size_t current_level() const { return level_ptr_.size() - 1; }
 
-  size_t num_levels() const { return level_size_.size(); }
+  size_t num_levels() const { return level_ptr_.size(); }
 
-  vertex_t const *get_level_ptr_nocheck(size_t level) const
-  {
-    return static_cast<vertex_t const *>(level_ptr_[level]->data());
-  }
+  vertex_t const *get_level_ptr_nocheck(size_t level) const { return level_ptr_[level]->data(); }
 
-  vertex_t *get_level_ptr_nocheck(size_t level)
-  {
-    return static_cast<vertex_t *>(level_ptr_[level]->data());
-  }
+  vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); }
 
-  vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; }
+  size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); }
+
+  vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; }
 
   vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); }
 
@@ -58,11 +54,16 @@ class Dendrogram {
 
   vertex_t *current_level_end() { return current_level_begin() + current_level_size(); }
 
-  vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); }
+  size_t current_level_size() const { return get_level_size_nocheck(current_level()); }
+
+  vertex_t current_level_first_index() const
+  {
+    return get_level_first_index_nocheck(current_level());
+  }
 
  private:
-  std::vector<vertex_t> level_size_;
-  std::vector<std::unique_ptr<rmm::device_buffer>> level_ptr_;
+  std::vector<vertex_t> level_first_index_;
+  std::vector<std::unique_ptr<rmm::device_uvector<vertex_t>>> level_ptr_;
 };
 
 }  // namespace cugraph
diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh
index 084d68b8ba4..d79788e59ce 100644
--- a/cpp/include/experimental/detail/graph_utils.cuh
+++ 
b/cpp/include/experimental/detail/graph_utils.cuh @@ -56,65 +56,32 @@ rmm::device_uvector compute_major_degrees( rmm::device_uvector degrees(0, handle.get_stream()); vertex_t max_num_local_degrees{0}; - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx); max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size); - if (i == (partition.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank)) { - degrees.resize(vertex_partition_size, handle.get_stream()); - } + if (i == col_comm_rank) { degrees.resize(vertex_partition_size, handle.get_stream()); } } local_degrees.resize(max_num_local_degrees, handle.get_stream()); - for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size); - ++i) { - auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(i * row_comm_size + row_comm_rank) - : static_cast(col_comm_rank * row_comm_size + i); + for (int i = 0; i < col_comm_size; ++i) { + auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank); vertex_t major_first{}; vertex_t major_last{}; std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); - auto p_offsets = - partition.is_hypergraph_partitioned() - ? adj_matrix_partition_offsets[i] - : adj_matrix_partition_offsets[0] + - (major_first - partition.get_vertex_partition_first(col_comm_rank * row_comm_size)); + auto p_offsets = adj_matrix_partition_offsets[i]; thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(major_last - major_first), local_degrees.data(), [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; }); - if (partition.is_hypergraph_partitioned()) { - col_comm.reduce(local_degrees.data(), - i == col_comm_rank ? degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - row_comm.reduce(local_degrees.data(), - i == row_comm_rank ? degrees.data() : static_cast(nullptr), - static_cast(major_last - major_first), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + col_comm.reduce(local_degrees.data(), + i == col_comm_rank ? degrees.data() : static_cast(nullptr), + static_cast(major_last - major_first), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } - raft::comms::status_t status{}; - if (partition.is_hypergraph_partitioned()) { - status = - col_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. - } else { - status = - row_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become - // out-of-scope once this function returns. 
- } - CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); - return degrees; } @@ -170,7 +137,6 @@ struct compute_gpu_id_from_vertex_t { template struct compute_gpu_id_from_edge_t { - bool hypergraph_partitioned{false}; int comm_size{0}; int row_comm_size{0}; int col_comm_size{0}; @@ -180,12 +146,22 @@ struct compute_gpu_id_from_edge_t { cuco::detail::MurmurHash3_32 hash_func{}; auto major_comm_rank = static_cast(hash_func(major) % comm_size); auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); - if (hypergraph_partitioned) { - return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); - } else { - return (major_comm_rank - (major_comm_rank % row_comm_size)) + - (minor_comm_rank / col_comm_size); - } + return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); + } +}; + +template +struct compute_partition_id_from_edge_t { + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + cuco::detail::MurmurHash3_32 hash_func{}; + auto major_comm_rank = static_cast(hash_func(major) % comm_size); + auto minor_comm_rank = static_cast(hash_func(minor) % comm_size); + return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size; } }; diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp index 6a10256e6f4..27f766b8593 100644 --- a/cpp/include/experimental/graph.hpp +++ b/cpp/include/experimental/graph.hpp @@ -88,12 +88,12 @@ class graph_tget_number_of_vertices(), this->get_number_of_edges(), this->get_graph_properties(), - vertex_partition_segment_offsets_.size() > 0, + adj_matrix_partition_segment_offsets_.size() > 0, false); } @@ -105,9 +105,10 @@ class graph_t partition_{}; std::vector - vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on - // vertex degree, relevant only if - // sorted_by_global_degree_within_vertex_partition is true + adj_matrix_partition_segment_offsets_{}; // segment offsets within the vertex partition based + // on vertex degree, relevant only if + // sorted_by_global_degree_within_vertex_partition is + // true }; // single-GPU version @@ -188,6 +189,20 @@ template struct invalid_edge_id : invalid_idx { }; +template +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) +{ + return (v >= 0) && (v < num_vertices); +} + +template +__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_t num_vertices, vertex_t v) +{ + return v < num_vertices; +} + } // namespace experimental } // namespace cugraph diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp index 7b4bb466b97..b48dc6da136 100644 --- a/cpp/include/experimental/graph_functions.hpp +++ b/cpp/include/experimental/graph_functions.hpp @@ -17,13 +17,13 @@ #include #include -#include #include #include #include #include +#include namespace cugraph { namespace experimental { @@ -40,19 +40,24 @@ namespace experimental { * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. 
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -63,10 +68,9 @@ template <typename vertex_t, typename edge_t, bool multi_gpu>
 std::enable_if_t<multi_gpu,
                  std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
 renumber_edgelist(raft::handle_t const& handle,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);
 
 /**
@@ -115,19 +119,24 @@ std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
  * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
  * to work (vertices should be pre-shuffled).
 * @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
  * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
  * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
  * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
  * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -140,10 +149,9 @@ std::enable_if_t<multi_gpu,
 renumber_edgelist(raft::handle_t const& handle,
                   vertex_t* local_vertices,
                   vertex_t num_local_vertices,
-                  vertex_t* edgelist_major_vertices /* [INOUT] */,
-                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
-                  edge_t num_edgelist_edges,
-                  bool is_hypergraph_partitioned,
+                  std::vector<vertex_t*> const& edgelist_major_vertices /* [INOUT] */,
+                  std::vector<vertex_t*> const& edgelist_minor_vertices /* [INOUT] */,
+                  std::vector<edge_t> const& edgelist_edge_counts,
                   bool do_expensive_check = false);
 
 /**
@@ -181,6 +189,104 @@ std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
                   edge_t num_edgelist_edges,
                   bool do_expensive_check = false);
 
+/**
+ * @brief Renumber external vertices to internal vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the vertices to be renumbered. The input external vertices are
+ * renumbered to internal vertices in-place.
+ * @param num_vertices Number of vertices to be renumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void renumber_ext_vertices(raft::handle_t const& handle,
+                           vertex_t* vertices /* [INOUT] */,
+                           size_t num_vertices,
+                           vertex_t const* renumber_map_labels,
+                           vertex_t local_int_vertex_first,
+                           vertex_t local_int_vertex_last,
+                           bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber local internal vertices to external vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element
+ * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t>
+void unrenumber_local_int_vertices(
+  raft::handle_t const& handle,
+  vertex_t* vertices /* [INOUT] */,
+  size_t num_vertices,
+  vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */,
+  vertex_t local_int_vertex_first,
+  vertex_t local_int_vertex_last,
+  bool do_expensive_check = false);
+
+// FIXME: We may add unrenumber_int_rows(or cols) as this will require communication only within a
+// sub-communicator and potentially be more efficient.
+/**
+ * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
+ * provided @p renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id<vertex_t>::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template <typename vertex_t, bool multi_gpu>
+void unrenumber_int_vertices(raft::handle_t const& handle,
+                             vertex_t* vertices /* [INOUT] */,
+                             size_t num_vertices,
+                             vertex_t const* renumber_map_labels,
+                             vertex_t local_int_vertex_first,
+                             vertex_t local_int_vertex_last,
+                             std::vector<vertex_t>& vertex_partition_lasts,
+                             bool do_expensive_check = false);
+
 /**
  * @brief Compute the coarsened graph.
  *
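To make the intended call pattern of the new helpers concrete, here is an illustrative single-GPU round trip (a sketch under assumptions, not code from this patch): `handle`, a device array `d_vertices` of external vertex IDs, and the `renumber_map_labels` produced by `renumber_edgelist` (both `rmm::device_uvector<int32_t>`) are assumed to already exist.

```cpp
using vertex_t = int32_t;

// external -> internal, in place
cugraph::experimental::renumber_ext_vertices<vertex_t, false>(
  handle, d_vertices.data(), d_vertices.size(),
  renumber_map_labels.data(), vertex_t{0},
  static_cast<vertex_t>(renumber_map_labels.size()));

// internal -> external, the inverse of the call above
cugraph::experimental::unrenumber_local_int_vertices<vertex_t>(
  handle, d_vertices.data(), d_vertices.size(),
  renumber_map_labels.data(), vertex_t{0},
  static_cast<vertex_t>(renumber_map_labels.size()));
```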
diff --git a/cpp/include/experimental/graph_generator.hpp b/cpp/include/experimental/graph_generator.hpp
index b8495ed7581..bc7337944f3 100644
--- a/cpp/include/experimental/graph_generator.hpp
+++ b/cpp/include/experimental/graph_generator.hpp
@@ -72,7 +72,7 @@ template <typename vertex_t>
 std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> generate_rmat_edgelist(
   raft::handle_t const& handle,
   size_t scale,
-  size_t edge_factor = 16,
+  size_t num_edges,
   double a = 0.57,
   double b = 0.19,
   double c = 0.19,
@@ -80,5 +80,58 @@ std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> generat
   bool clip_and_flip = false,
   bool scramble_vertex_ids = false);
 
+enum class generator_distribution_t { POWER_LAW = 0, UNIFORM };
+
+/**
+ * @brief generate multiple edge lists using the R-mat graph generator.
+ *
+ * This function allows multi-edges and self-loops similar to the Graph 500 reference
+ * implementation.
+ *
+ * @p scramble_vertex_ids needs to be set to `true` to generate a graph conforming to the Graph 500
+ * specification (note that scrambling does not affect cuGraph's graph construction performance, so
+ * this is generally unnecessary). If `edge_factor` is given (e.g. Graph 500), set @p num_edges to
+ * (size_t{1} << @p scale) * `edge_factor`. To generate an undirected graph, set @p b == @p c and @p
+ * clip_and_flip = true. All the resulting edges will be placed in the lower triangular part
+ * (including the diagonal) of the graph adjacency matrix.
+ *
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param n_edgelists Number of edge lists (graphs) to generate
+ * @param min_scale Scale factor to set the minimum number of vertices in the graph.
+ * @param max_scale Scale factor to set the maximum number of vertices in the graph.
+ * @param edge_factor Average number of edges per vertex to generate.
+ * @param size_distribution Distribution of the graph sizes, impacts the scale parameter of the
+ * R-MAT generator
+ * @param edge_distribution Edges distribution for each graph, impacts how R-MAT parameters a, b, c,
+ * d are set.
+ * @param seed Seed value for the random number generator.
+ * @param clip_and_flip Flag controlling whether to generate edges only in the lower triangular part
+ * (including the diagonal) of the graph adjacency matrix (if set to `true`) or not (if set to
+ * `false`).
+ * @param scramble_vertex_ids Flag controlling whether to scramble vertex ID bits (if set to `true`)
+ * or not (if set to `false`); scrambling vertex ID bits breaks correlation between vertex ID values
+ * and vertex degrees. The scramble code here follows the algorithm in the Graph 500 reference
+ * implementation version 3.0.0.
+ * @return A vector of std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> of
+ * size @p n_edgelists, each vector element being a tuple of rmm::device_uvector objects for edge
+ * source vertex IDs and edge destination vertex IDs.
+ */
+template <typename vertex_t>
+std::vector<std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>>>
+generate_rmat_edgelists(
+  raft::handle_t const& handle,
+  size_t n_edgelists,
+  size_t min_scale,
+  size_t max_scale,
+  size_t edge_factor                          = 16,
+  generator_distribution_t size_distribution = generator_distribution_t::POWER_LAW,
+  generator_distribution_t edge_distribution = generator_distribution_t::POWER_LAW,
+  uint64_t seed                               = 0,
+  bool clip_and_flip                          = false,
+  bool scramble_vertex_ids                    = false);
+
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index 5d3d09bb087..e9593b70ddb 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -40,32 +40,11 @@ namespace experimental {
  *
  * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency
  * matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to
- * P linear partitions; each partition has the size close to V / P. We consider two different
- * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based
- * strategy (the latter is for future extension).
- * FIXME: in the future we may use the latter for both as this leads to simpler communication
- * patterns and better control over parallelism vs memory footprint trade-off.
+ * P linear partitions; each partition has the size close to V / P.
  *
- * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be
- * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned
- * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V /
- * P_col.
- *
- * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one
- * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size *
- * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c =
- * vertex_partition_offsets[col_comm_size * row_comm_rank], and d =
- * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)].
- *
- * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing
- * edge cuts across groups while balancing the number of vertices in each group. We will also
- * renumber vertices so the vertices in each group are mapped to consecutive integers. Then, there
- * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the
- * transposed 2D graph adjacency matrix) than the off-diagonal partitions. 
The default strategy does - * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem, - * the matrix is first horizontally partitioned to P slabs, then each slab will be further - * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will - * be responsible col_comm_size rectangular partitions in this case. + * The 2D graph adjacency matrix is first horizontally partitioned to P slabs, then each slab will + * be further vertically partitioned to P_row (instead of P_col in the default case) rectangles. One + * GPU will be responsible col_comm_size rectangular partitions. * * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i = @@ -85,13 +64,11 @@ class partition_t { partition_t() = default; partition_t(std::vector const& vertex_partition_offsets, - bool hypergraph_partitioned, int row_comm_size, int col_comm_size, int row_comm_rank, int col_comm_rank) : vertex_partition_offsets_(vertex_partition_offsets), - hypergraph_partitioned_(hypergraph_partitioned), comm_rank_(col_comm_rank * row_comm_size + row_comm_rank), row_comm_size_(row_comm_size), col_comm_size_(col_comm_size), @@ -159,10 +136,7 @@ class partition_t { get_vertex_partition_first(vertex_partition_idx); } - size_t get_number_of_matrix_partitions() const - { - return hypergraph_partitioned_ ? col_comm_size_ : 1; - } + size_t get_number_of_matrix_partitions() const { return col_comm_size_; } // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored). @@ -175,16 +149,18 @@ class partition_t { vertex_t get_matrix_partition_major_first(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_] - : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]; } vertex_t get_matrix_partition_major_last(size_t partition_idx) const { - return hypergraph_partitioned_ - ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1] - : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; + return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]; + } + + vertex_t get_matrix_partition_major_size(size_t partition_idx) const + { + return get_matrix_partition_major_last(partition_idx) - + get_matrix_partition_major_first(partition_idx); } vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const @@ -204,24 +180,21 @@ class partition_t { vertex_t get_matrix_partition_minor_first() const { - return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_] - : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_]; + return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; } vertex_t get_matrix_partition_minor_last() const { - return hypergraph_partitioned_ - ? 
vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_] - : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_]; + return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; } - // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph - // partitioning is applied or not - bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; } + vertex_t get_matrix_partition_minor_size() const + { + return get_matrix_partition_minor_last() - get_matrix_partition_minor_first(); + } private: std::vector vertex_partition_offsets_{}; // size = P + 1 - bool hypergraph_partitioned_{false}; int comm_rank_{0}; int row_comm_size_{0}; @@ -236,6 +209,7 @@ class partition_t { struct graph_properties_t { bool is_symmetric{false}; bool is_multigraph{false}; + bool is_weighted{false}; }; namespace detail { @@ -277,6 +251,7 @@ class graph_base_t { bool is_symmetric() const { return properties_.is_symmetric; } bool is_multigraph() const { return properties_.is_multigraph; } + bool is_weighted() const { return properties_.is_weighted; } protected: raft::handle_t const* get_handle_ptr() const { return handle_ptr_; }; @@ -326,7 +301,7 @@ class graph_view_t const& adj_matrix_partition_offsets, std::vector const& adj_matrix_partition_indices, std::vector const& adj_matrix_partition_weights, - std::vector const& vertex_partition_segment_offsets, + std::vector const& adj_matrix_partition_segment_offsets, partition_t const& partition, vertex_t number_of_vertices, edge_t number_of_edges, @@ -334,11 +309,6 @@ class graph_view_t 0; } - - // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives - partition_t get_partition() const { return partition_; } - vertex_t get_number_of_local_vertices() const { return partition_.get_local_vertex_last() - partition_.get_local_vertex_first(); @@ -421,6 +391,12 @@ class graph_view_t get_local_adj_matrix_partition_segment_offsets(size_t partition_idx) const + { + return adj_matrix_partition_segment_offsets_.size() > 0 + ? std::vector( + adj_matrix_partition_segment_offsets_.begin() + + partition_idx * (detail::num_segments_per_vertex_partition + 1), + adj_matrix_partition_segment_offsets_.begin() + + (partition_idx + 1) * (detail::num_segments_per_vertex_partition + 1)) + : std::vector{}; + } // FIXME: this function is not part of the public stable API. This function is mainly for pattern // accelerator implementation. 
This function is currently public to support the legacy @@ -504,6 +495,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const; rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; + + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -513,9 +510,10 @@ class graph_view_t partition_{}; std::vector - vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on - // vertex degree, relevant only if - // sorted_by_global_degree_within_vertex_partition is true + adj_matrix_partition_segment_offsets_{}; // segment offsets within the vertex partition based + // on vertex degree, relevant only if + // sorted_by_global_degree_within_vertex_partition is + // true }; // single-GPU version @@ -549,8 +547,6 @@ class graph_view_tget_number_of_vertices(); } constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; } @@ -628,7 +624,12 @@ class graph_view_t get_local_adj_matrix_partition_segment_offsets( + size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return segment_offsets_.size() > 0 ? segment_offsets_ : std::vector{}; + } // FIXME: this function is not part of the public stable API.This function is mainly for pattern // accelerator implementation. This function is currently public to support the legacy @@ -654,6 +655,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const; rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const; + edge_t compute_max_in_degree(raft::handle_t const& handle) const; + edge_t compute_max_out_degree(raft::handle_t const& handle) const; + + weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const; + weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const; + private: edge_t const* offsets_{nullptr}; vertex_t const* indices_{nullptr}; diff --git a/cpp/src/experimental/include_cuco_static_map.cuh b/cpp/include/experimental/include_cuco_static_map.cuh similarity index 100% rename from cpp/src/experimental/include_cuco_static_map.cuh rename to cpp/include/experimental/include_cuco_static_map.cuh diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index b30159566b5..8ea58546ce1 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,6 +69,10 @@ class GraphViewBase { edge_t *local_edges; vertex_t *local_offsets; + vertex_t get_number_of_vertices() const { return number_of_vertices; } + + vertex_t get_local_vertex_first() const { return vertex_t{0}; } + /** * @brief Fill the identifiers array with the vertex identifiers. 
* diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh index b41119e7be6..30d6540bcfe 100644 --- a/cpp/include/matrix_partition_device.cuh +++ b/cpp/include/matrix_partition_device.cuh @@ -192,7 +192,7 @@ class matrix_partition_device_t rx_counts(row_comm_size, size_t{0}); - std::vector displacements(row_comm_size, size_t{0}); - for (int i = 0; i < row_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; - } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_major_value_output_first, - rx_counts, - displacements, - handle.get_stream()); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() @@ -101,80 +97,78 @@ void copy_to_matrix_major(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - auto rx_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(col_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + for (int i = 0; i < col_comm_size; ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + + rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { - vertex_partition_device_t vertex_partition(graph_view); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a - // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - rx_value_first); - } + if (col_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); + auto map_first = + thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_first, + map_first + thrust::distance(vertex_first, vertex_last), + vertex_value_input_first, + rx_value_first); + } - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast( - row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); - device_bcast( - row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the 
internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output_first); - } + if (col_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); } } } else { @@ -199,59 +193,27 @@ void copy_to_matrix_minor(raft::handle_t const& handle, MatrixMinorValueOutputIterator matrix_minor_value_output_first) { if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_value_input_first, - vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv( - comm, - vertex_value_input_first, - static_cast(graph_view.get_number_of_local_vertices()), - comm_dst_rank, - 
matrix_minor_value_output_first + - (graph_view.get_vertex_partition_first(comm_src_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - matrix_minor_value_output_first + offset, - matrix_minor_value_output_first + offset, - count, - i, - handle.get_stream()); - } + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output_first, + rx_counts, + displacements, + handle.get_stream()); } else { assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() @@ -277,143 +239,75 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; if (GraphViewType::is_multi_gpu) { - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - auto const col_comm_size = col_comm.get_size(); - - // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of - // hypergraph partitioning is applied or not - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - size_t tx_count = thrust::distance(vertex_first, vertex_last); - size_t rx_count{}; - // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or - // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT - // may use ncclSend/ncclRecv instead of UCX for device data). 
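The replacement code paths above drop the hypergraph-partitioning branches in favor of a single `device_allgatherv`, with receive counts taken from the vertex partition sizes and displacements computed as exclusive prefix sums of those counts. A host-side sketch of that bookkeeping, assuming a `get_partition_size` callback (a stand-in for `graph_view.get_vertex_partition_size(...)`, not a real cuGraph function):

```cpp
#include <cstddef>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Build the (counts, displacements) layout expected by an allgatherv-style
// collective: displacements[i] is the exclusive prefix sum of counts[0..i).
std::pair<std::vector<std::size_t>, std::vector<std::size_t>> make_allgatherv_layout(
  int num_ranks, std::function<std::size_t(int)> get_partition_size)
{
  std::vector<std::size_t> counts(num_ranks, 0);
  for (int i = 0; i < num_ranks; ++i) {
    counts[i] = get_partition_size(i);  // elements each rank contributes
  }
  std::vector<std::size_t> displs(num_ranks, 0);
  // Same idiom this diff uses elsewhere via std::partial_sum: shift-by-one
  // partial sums yield exclusive offsets into the receive buffer.
  std::partial_sum(counts.begin(), counts.end() - 1, displs.begin() + 1);
  return {counts, displs};
}
```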
- if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - rx_count = tx_count; - } else { - std::vector count_requests(2); - comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); - comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); - comm.waitall(count_requests.size(), count_requests.data()); - } - - vertex_partition_device_t vertex_partition(graph_view); - rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); - auto dst_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_count, + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i], + handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(rx_counts[i], handle.get_stream()); - auto dst_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(dst_tmp_buffer); - if (comm_src_rank == comm_rank) { - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - dst_vertices.begin()); - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - }); - thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_value_input_first, - dst_value_first); - } else { - auto src_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(tx_count, - handle.get_stream()); - auto src_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(src_tmp_buffer); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + if (row_comm_rank == i) { + vertex_partition_device_t vertex_partition(graph_view); auto map_first = thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); }); + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), map_first, map_first + thrust::distance(vertex_first, vertex_last), vertex_value_input_first, - src_value_first); - - device_sendrecv( - comm, - vertex_first, - tx_count, - comm_dst_rank, - dst_vertices.begin(), - rx_count, - comm_src_rank, - handle.get_stream()); - - device_sendrecv(comm, - src_value_first, - tx_count, - comm_dst_rank, - dst_value_first, - rx_count, - comm_src_rank, - handle.get_stream()); + rx_value_first); } - // FIXME: now we can clear 
tx_tmp_buffer - - auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); - - matrix_partition_device_t matrix_partition(graph_view, 0); - for (int i = 0; i < col_comm_size; ++i) { - rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - device_bcast(col_comm, - dst_vertices.begin(), - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); - device_bcast( - col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - - if (col_comm_rank == i) { - auto map_first = thrust::make_transform_iterator( - dst_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); - - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - dst_value_first, - dst_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); - }); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output_first); - } + if (row_comm_rank == i) { + auto map_first = + thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); } } } else { diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh index 3059cf95852..6d828dab513 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -42,23 +42,7 @@ namespace experimental { namespace detail { -// FIXME: block size requires tuning -int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 128; - -#if 0 -// FIXME: delete this once we verify 
that the thrust replace in for_all_major_for_all_nbr_low_degree is no slower than the original for loop based imoplementation -template -__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) -{ - lhs = plus_edge_op_result(lhs, rhs); -} - -template -__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) -{ - atomic_add(&lhs, rhs); -} -#endif +int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 512; template (tid); while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); -#if 1 auto transform_op = [&matrix_partition, &adj_matrix_row_value_input_first, &adj_matrix_col_value_input_first, @@ -148,44 +131,6 @@ __global__ void for_all_major_for_all_nbr_low_degree( atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); }); } -#else - // FIXME: delete this once we verify that the code above is not slower than this. - e_op_result_t e_op_result_sum{init}; // relevent only if update_major == true - for (edge_t i = 0; i < local_degree; ++i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(major_offset); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(major_offset) - : minor; - auto row_offset = GraphViewType::is_adj_matrix_transposed - ? minor_offset - : static_cast(major_offset); - auto col_offset = GraphViewType::is_adj_matrix_transposed - ? static_cast(major_offset) - : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - if (update_major) { - accumulate_edge_op_result(e_op_result_sum, e_op_result); - } else { - accumulate_edge_op_result(*(result_value_output_first + minor_offset), - e_op_result); - } - } - if (update_major) { *(result_value_output_first + idx) = e_op_result_sum; } -#endif idx += gridDim.x * blockDim.x; } } @@ -219,14 +164,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree( auto idx = static_cast(tid / raft::warp_size()); while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); auto e_op_result_sum = lane_id == 0 ? init : e_op_result_t{}; // relevent only if update_major == true - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { auto minor = indices[i]; auto weight = weights != nullptr ? 
weights[i] : weight_t{1.0}; auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); @@ -293,10 +238,10 @@ __global__ void for_all_major_for_all_nbr_high_degree( auto idx = static_cast(blockIdx.x); while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); auto e_op_result_sum = threadIdx.x == 0 ? init : e_op_result_t{}; // relevent only if update_major == true @@ -358,20 +303,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, T init, VertexValueOutputIterator vertex_value_output_first) { - using vertex_t = typename GraphViewType::vertex_type; + constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed); + using vertex_t = typename GraphViewType::vertex_type; static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0}; - auto minor_tmp_buffer_size = (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) ? GraphViewType::is_adj_matrix_transposed @@ -386,10 +322,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} - : (col_comm_rank == 0) ? init : T{}; + minor_init = (row_comm_rank == 0) ? init : T{}; } if (GraphViewType::is_multi_gpu) { @@ -407,97 +340,162 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, assert(minor_tmp_buffer_size == 0); } - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); - - auto major_tmp_buffer_size = vertex_t{0}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - major_tmp_buffer_size = - (in == GraphViewType::is_adj_matrix_transposed) - ? graph_view.is_hypergraph_partitioned() - ? matrix_partition.get_major_size() - : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) - : vertex_t{0}; - } + auto major_tmp_buffer_size = + GraphViewType::is_multi_gpu && update_major ? 
matrix_partition.get_major_size() : vertex_t{0}; auto major_tmp_buffer = allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream()); auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer); auto major_init = T{}; - if (in == GraphViewType::is_adj_matrix_transposed) { + if (update_major) { if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} - : (row_comm_rank == 0) ? init : T{}; + major_init = (col_comm_rank == 0) ? init : T{}; } else { major_init = init; } } - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; - } - - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { - raft::grid_1d_thread_t update_grid(graph_view.get_vertex_partition_size(comm_root_rank), + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets.size() > 0) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + static_assert(detail::num_segments_per_vertex_partition == 3); + if (segment_offsets[1] > 0) { + raft::grid_1d_block_t update_grid(segment_offsets[1], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if + // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer. + if (GraphViewType::is_multi_gpu) { + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + segment_offsets[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + update_major ? 
major_buffer_first : minor_buffer_first, + e_op, + major_init); + } else { + detail::for_all_major_for_all_nbr_high_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + segment_offsets[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + vertex_value_output_first, + e_op, + major_init); + } + } + if (segment_offsets[2] - segment_offsets[1] > 0) { + raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1], detail::copy_v_transform_reduce_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); - - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - - detail::for_all_major_for_all_nbr_low_degree - <<>>( - matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - (in == GraphViewType::is_adj_matrix_transposed) ? major_buffer_first - : minor_buffer_first, - e_op, - major_init); - } else { - detail::for_all_major_for_all_nbr_low_degree - <<>>( - matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - vertex_value_output_first, - e_op, - major_init); + // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if + // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer. + if (GraphViewType::is_multi_gpu) { + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[1], + matrix_partition.get_major_first() + segment_offsets[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + update_major ? major_buffer_first + segment_offsets[1] : minor_buffer_first, + e_op, + major_init); + } else { + detail::for_all_major_for_all_nbr_mid_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[1], + matrix_partition.get_major_first() + segment_offsets[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + vertex_value_output_first + (update_major ? segment_offsets[1] : vertex_t{0}), + e_op, + major_init); + } + } + if (segment_offsets[3] - segment_offsets[2] > 0) { + raft::grid_1d_thread_t update_grid(segment_offsets[3] - segment_offsets[2], + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if + // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer. 
+ if (GraphViewType::is_multi_gpu) { + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[2], + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + update_major ? major_buffer_first + segment_offsets[2] : minor_buffer_first, + e_op, + major_init); + } else { + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[2], + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + vertex_value_output_first + (update_major ? segment_offsets[2] : vertex_t{0}), + e_op, + major_init); + } + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if + // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer. + if (GraphViewType::is_multi_gpu) { + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + update_major ? major_buffer_first : minor_buffer_first, + e_op, + major_init); + } else { + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + vertex_value_output_first, + e_op, + major_init); + } } } - if (GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed)) { + if (GraphViewType::is_multi_gpu && update_major) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); @@ -505,29 +503,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - device_reduce( - col_comm, - major_buffer_first, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } else { - device_reduce(row_comm, - major_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } + device_reduce(col_comm, + major_buffer_first, + vertex_value_output_first, + matrix_partition.get_major_size(), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } - if (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) { + if (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -537,53 +523,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t 
const& handle, auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - for (int i = 0; i < col_comm_size; ++i) { - auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i)); - device_reduce(col_comm, - minor_buffer_first + offset, - minor_buffer_first + offset, - size, - raft::comms::op_t::SUM, - i, - handle.get_stream()); - } - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank; - // FIXME: this branch may no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - auto offset = - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto size = static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_buffer_first + offset, - minor_buffer_first + offset + size, - vertex_value_output_first); - } else { - device_sendrecv( - comm, - minor_buffer_first + - (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - - graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast( - graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)), - comm_dst_rank, - vertex_value_output_first, - static_cast(graph_view.get_vertex_partition_size(comm_rank)), - comm_src_rank, - handle.get_stream()); - } + for (int i = 0; i < row_comm_size; ++i) { + auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)); + device_reduce(row_comm, + minor_buffer_first + offset, + vertex_value_output_first, + static_cast( + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)), + raft::comms::op_t::SUM, + i, + handle.get_stream()); } } } diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 11cf2cb1137..f904c35ef9e 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,7 @@ #include -#include +#include #include @@ -59,10 +60,10 @@ __global__ void for_all_major_for_all_nbr_low_degree( auto idx = static_cast(tid); while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); if (local_degree > 0) { @@ -170,8 +171,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( */ template ::value_type, 
+ static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -206,64 +209,113 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // 1. build a cuco::static_map object for the k, v pairs. auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value); - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); - - // 2. aggregate each vertex out-going edges based on keys and transform-reduce. - - auto loop_count = size_t{1}; + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); if (GraphViewType::is_multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); + + auto map_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(map_key_first, map_key_last)), + handle.get_stream()); + std::vector map_displacements(row_comm_size, size_t{0}); + std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1); + rmm::device_uvector map_keys(map_displacements.back() + map_counts.back(), + handle.get_stream()); + auto map_value_buffer = + allocate_dataframe_buffer(map_keys.size(), handle.get_stream()); + for (int i = 0; i < row_comm_size; ++i) { + device_bcast(row_comm, + map_key_first, + map_keys.begin() + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + device_bcast(row_comm, + map_value_first, + get_dataframe_buffer_begin(map_value_buffer) + map_displacements[i], + map_counts[i], + i, + handle.get_stream()); + } + // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & + // output pointers + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_key_first, + map_key_last, + map_keys.begin() + map_displacements[row_comm_rank]); + thrust::copy( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + map_value_first, + map_value_first + thrust::distance(map_key_first, map_key_last), + get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(map_keys.size()) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + map_keys.begin(), get_dataframe_buffer_begin(map_value_buffer))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (map_keys.size()) { kv_map_ptr->insert(pair_first, pair_first + map_keys.size()); } + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } + // 2. aggregate each vertex's out-going edges based on keys and transform-reduce. + rmm::device_uvector major_vertices(0, handle.get_stream()); auto e_op_result_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); - - int comm_root_rank = 0; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ?
i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; - } + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - auto num_edges = thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)), - thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()); - - rmm::device_uvector tmp_major_vertices(num_edges, handle.get_stream()); + rmm::device_uvector tmp_major_vertices(matrix_partition.get_number_of_edges(), + handle.get_stream()); rmm::device_uvector tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream()); rmm::device_uvector tmp_key_aggregated_edge_weights(tmp_major_vertices.size(), handle.get_stream()); - if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { + if (matrix_partition.get_major_size() > 0) { raft::grid_1d_thread_t update_grid( - graph_view.get_vertex_partition_size(comm_root_rank), + matrix_partition.get_major_size(), detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size, handle.get_device_properties().maxGridSize[0]); @@ -277,8 +329,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( 0, handle.get_stream()>>>( matrix_partition, - graph_view.get_vertex_partition_first(comm_root_rank), - graph_view.get_vertex_partition_last(comm_root_rank), + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), adj_matrix_col_key_first, tmp_major_vertices.data(), tmp_minor_keys.data(), @@ -300,10 +352,14 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_size = sub_comm.get_size(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); triplet_first = thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(), @@ -315,11 +371,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( std::forward_as_tuple( std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) = groupby_gpuid_and_shuffle_values( - sub_comm, + col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{sub_comm_size}] __device__( - auto val) { return key_func(thrust::get<1>(val)); }, + [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, + row_comm_size] __device__(auto val) { + return key_func(thrust::get<1>(val)) / row_comm_size; + }, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -355,56 +413,52 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [adj_matrix_row_value_input_first = + adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + *(adj_matrix_row_value_input_first + + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& sub_comm = handle.get_subcomm(graph_view.is_hypergraph_partitioned() - ? 
cugraph::partition_2d::key_naming_t().col_name() - : cugraph::partition_2d::key_naming_t().row_name()); - auto const sub_comm_rank = sub_comm.get_rank(); - auto const sub_comm_size = sub_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op // can be mapped to ncclRedOp_t). auto rx_sizes = - host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream()); - std::vector rx_displs( - static_cast(sub_comm_rank) == i ? sub_comm_size : int{0}, size_t{0}); - if (static_cast(sub_comm_rank) == i) { + host_scalar_gather(col_comm, tmp_major_vertices.size(), i, handle.get_stream()); + std::vector rx_displs{}; + rmm::device_uvector rx_major_vertices(0, handle.get_stream()); + if (static_cast(col_comm_rank) == i) { + rx_displs.assign(col_comm_size, size_t{0}); std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_vertices.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); } - rmm::device_uvector rx_major_vertices( - static_cast(sub_comm_rank) == i - ? std::accumulate(rx_sizes.begin(), rx_sizes.end(), size_t{0}) - : size_t{0}, - handle.get_stream()); auto rx_tmp_e_op_result_buffer = allocate_dataframe_buffer(rx_major_vertices.size(), handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_major_vertices.data(), rx_major_vertices.data(), tmp_major_vertices.size(), @@ -412,7 +466,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( rx_displs, i, handle.get_stream()); - device_gatherv(sub_comm, + device_gatherv(col_comm, tmp_e_op_result_buffer_first, get_dataframe_buffer_begin(rx_tmp_e_op_result_buffer), tmp_major_vertices.size(), @@ -421,7 +475,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); - if (static_cast(sub_comm_rank) == i) { + if (static_cast(col_comm_rank) == i) { major_vertices = std::move(rx_major_vertices); e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer); } diff --git a/cpp/include/patterns/count_if_e.cuh b/cpp/include/patterns/count_if_e.cuh index 99bfc80f643..4eb3fea24c4 100644 --- a/cpp/include/patterns/count_if_e.cuh +++ b/cpp/include/patterns/count_if_e.cuh @@ -16,132 +16,16 @@ #pragma once #include -#include #include -#include -#include +#include -#include -#include #include -#include -#include - #include -#include namespace cugraph { namespace experimental { -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr count_if_e_for_all_block_size = 128; - -// FIXME: function names conflict if included with transform_reduce_e.cuh -template -__global__ void for_all_major_for_all_nbr_low_degree( - matrix_partition_device_t matrix_partition, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - typename GraphViewType::edge_type* block_counts, - EdgeOp e_op) -{ - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using weight_t = typename GraphViewType::weight_type; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = static_cast(tid); - - edge_t count{0}; - while (idx < static_cast(matrix_partition.get_major_size())) { - vertex_t const* indices{nullptr}; - weight_t const* weights{nullptr}; - edge_t local_degree{}; - thrust::tie(indices, 
weights, local_degree) = matrix_partition.get_local_edges(idx); -#if 1 - count += thrust::count_if( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, - &e_op, - idx, - indices, - weights] __device__(auto i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : 1.0; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - - return e_op_result; - }); -#else - // FIXME: delete this once we verify that the code above is not slower than this. - for (vertex_t i = 0; i < local_degree; ++i) { - auto minor = indices[i]; - auto weight = weights != nullptr ? weights[i] : 1.0; - auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); - auto row = GraphViewType::is_adj_matrix_transposed - ? minor - : matrix_partition.get_major_from_major_offset_nocheck(idx); - auto col = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_from_major_offset_nocheck(idx) - : minor; - auto row_offset = - GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); - auto col_offset = - GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; - auto e_op_result = evaluate_edge_op() - .compute(row, - col, - weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), - e_op); - if (e_op_result) { count++; } - } -#endif - idx += gridDim.x * blockDim.x; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - count = BlockReduce(temp_storage).Sum(count); - if (threadIdx.x == 0) { *(block_counts + blockIdx.x) = count; } -} - -} // namespace detail - /** * @brief Count the number of edges that satisfies the given predicate. * @@ -182,55 +66,18 @@ typename GraphViewType::edge_type count_if_e( AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, EdgeOp e_op) { - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - - edge_t count{0}; - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_device_t matrix_partition(graph_view, i); - - if (matrix_partition.get_major_size() > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - - raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), - detail::count_if_e_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - - rmm::device_uvector block_counts(update_grid.num_blocks, handle.get_stream()); - - detail::for_all_major_for_all_nbr_low_degree<<>>( - matrix_partition, - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - block_counts.data(), - e_op); - - // FIXME: we have several options to implement this. With cooperative group support - // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within - // the previous kernel. Using atomics at the end of the previous kernel is another option - // (sequentialization due to atomics may not be bad as different blocks may reach the - // synchronization point in varying timings and the number of SMs is not very big) - count += thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - block_counts.begin(), - block_counts.end(), - edge_t{0}, - thrust::plus()); - } - } - - if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); - } - - return count; + using edge_t = typename GraphViewType::edge_type; + + return transform_reduce_e(handle, + graph_view, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + cast_edge_op_bool_to_integer{e_op}, + edge_t{0}); } } // namespace experimental diff --git a/cpp/include/patterns/edge_op_utils.cuh b/cpp/include/patterns/edge_op_utils.cuh index 58fb31c7605..198c1880ff4 100644 --- a/cpp/include/patterns/edge_op_utils.cuh +++ b/cpp/include/patterns/edge_op_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,6 +77,42 @@ struct evaluate_edge_op { } }; +template +struct cast_edge_op_bool_to_integer { + static_assert(std::is_integral::value); + using vertex_type = typename GraphViewType::vertex_type; + using weight_type = typename GraphViewType::weight_type; + using row_value_type = typename std::iterator_traits::value_type; + using col_value_type = typename std::iterator_traits::value_type; + + EdgeOp e_op{}; + + template + __device__ std::enable_if_t>::valid, T> + operator()(V r, V c, W w, R rv, C cv) + { + return e_op(r, c, w, rv, cv) ? T{1} : T{0}; + } + + template + __device__ std::enable_if_t>::valid, T> + operator()(V r, V c, R rv, C cv) + { + return e_op(r, c, rv, cv) ? 
T{1} : T{0}; + } +}; + template __host__ __device__ std::enable_if_t::value, T> plus_edge_op_result( T const& lhs, T const& rhs) diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 0b3588bc8c5..9848aa21f88 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -25,8 +25,6 @@ #include -#include - #include namespace cugraph { @@ -64,10 +62,10 @@ __global__ void for_all_major_for_all_nbr_low_degree( auto idx = static_cast(tid); while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - auto major_offset = major_start_offset + idx; thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); if (local_degree > 0) { @@ -181,20 +179,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - auto loop_count = size_t{1}; - if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - loop_count = graph_view.is_hypergraph_partitioned() - ? graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } - rmm::device_uvector keys(0, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(0, handle.get_stream()); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); int comm_root_rank = 0; if (GraphViewType::is_multi_gpu) { @@ -203,8 +191,7 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank - : col_comm_rank * row_comm_size + i; + comm_root_rank = i * row_comm_size + row_comm_rank; } auto num_edges = thrust::transform_reduce( @@ -226,6 +213,13 @@ transform_reduce_by_adj_matrix_row_col_key_e( detail::transform_reduce_by_key_e_for_all_block_size, handle.get_device_properties().maxGridSize[0]); + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber // vertices to insure that rows within a partition are sorted by their out-degree in // decreasing order, we will apply this kernel only to low out-degree vertices. 
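For context, the `count_if_e` change earlier in this patch folds edge counting into `transform_reduce_e` by wrapping the boolean edge operator with the new `cast_edge_op_bool_to_integer` adapter from `edge_op_utils.cuh`. Below is a minimal host-side C++17 sketch of the same count-as-sum trick; the `cast_bool_to_integer` functor is illustrative only, not the cuGraph adapter itself.

```cpp
// Count-as-sum: wrap a boolean predicate so it yields 0/1 integers, then a
// plain transform-reduce counts the matches.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

template <typename T, typename Pred>
struct cast_bool_to_integer {
  Pred pred{};
  T operator()(int e) const { return pred(e) ? T{1} : T{0}; }
};

int main()
{
  std::vector<int> edge_weights{1, 2, 3, 4, 5, 6};
  auto is_even = [](int w) { return w % 2 == 0; };

  // count_if(v, pred) expressed as transform_reduce(v, +, cast(pred))
  auto count = std::transform_reduce(
    edge_weights.begin(),
    edge_weights.end(),
    std::int64_t{0},
    std::plus<std::int64_t>{},
    cast_bool_to_integer<std::int64_t, decltype(is_even)>{is_even});

  std::cout << count << '\n';  // prints 3
  return 0;
}
```

Counting then reuses the general reduction path, so `count_if_e` no longer needs a dedicated kernel.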
@@ -234,9 +228,10 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, graph_view.get_vertex_partition_first(comm_root_rank), graph_view.get_vertex_partition_last(comm_root_rank), - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + adj_matrix_row_col_key_first + + (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); diff --git a/cpp/include/patterns/transform_reduce_e.cuh b/cpp/include/patterns/transform_reduce_e.cuh index 1f59777bc35..b95e036d460 100644 --- a/cpp/include/patterns/transform_reduce_e.cuh +++ b/cpp/include/patterns/transform_reduce_e.cuh @@ -41,31 +41,34 @@ int32_t constexpr transform_reduce_e_for_all_block_size = 128; template __global__ void for_all_major_for_all_nbr_low_degree( matrix_partition_device_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - BlockResultIterator block_result_first, + ResultIterator result_iter /* size 1 */, EdgeOp e_op) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - using e_op_result_t = typename std::iterator_traits::value_type; + using e_op_result_t = typename std::iterator_traits::value_type; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = static_cast(tid); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid); e_op_result_t e_op_result_sum{}; - while (idx < static_cast(matrix_partition.get_major_size())) { + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; vertex_t const* indices{nullptr}; weight_t const* weights{nullptr}; edge_t local_degree{}; - thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); -#if 1 - auto sum = thrust::transform_reduce( + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + auto sum = thrust::transform_reduce( thrust::seq, thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), @@ -104,9 +107,112 @@ __global__ void for_all_major_for_all_nbr_low_degree( [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); e_op_result_sum = plus_edge_op_result(e_op_result_sum, sum); -#else - // FIXME: delete this once we verify that the code above is not slower than this. 
- for (vertex_t i = 0; i < local_degree; ++i) { + idx += gridDim.x * blockDim.x; + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +template +__global__ void for_all_major_for_all_nbr_mid_degree( + matrix_partition_device_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(transform_reduce_e_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(tid / raft::warp_size()); + + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? 
static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + } + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } +} + +template +__global__ void for_all_major_for_all_nbr_high_degree( + matrix_partition_device_t matrix_partition, + typename GraphViewType::vertex_type major_first, + typename GraphViewType::vertex_type major_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultIterator result_iter /* size 1 */, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto major_start_offset = static_cast(major_first - matrix_partition.get_major_first()); + size_t idx = static_cast(blockIdx.x); + + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(major_last - major_first)) { + auto major_offset = major_start_offset + idx; + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset); + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { auto minor = indices[i]; auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); @@ -132,14 +238,13 @@ __global__ void for_all_major_for_all_nbr_low_degree( e_op); e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); } -#endif - idx += gridDim.x * blockDim.x; + idx += gridDim.x; } e_op_result_sum = block_reduce_edge_op_result().compute( e_op_result_sum); - if (threadIdx.x == 0) { *(block_result_first + blockIdx.x) = e_op_result_sum; } + if (threadIdx.x == 0) { atomic_accumulate_edge_op_result(result_iter, e_op_result_sum); } } } // namespace detail @@ -190,51 +295,106 @@ T transform_reduce_e(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; - T result{}; + auto result_buffer = allocate_dataframe_buffer(1, handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(result_buffer), + get_dataframe_buffer_begin(result_buffer) + 1, + T{}); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { matrix_partition_device_t matrix_partition(graph_view, i); - if (matrix_partition.get_major_size() > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? vertex_t{0} + : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? 
matrix_partition.get_major_value_start_offset() + : vertex_t{0}; + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + if (segment_offsets.size() > 0) { + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + static_assert(detail::num_segments_per_vertex_partition == 3); + if (segment_offsets[1] > 0) { + raft::grid_1d_block_t update_grid(segment_offsets[1], + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); - raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::for_all_major_for_all_nbr_high_degree<<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_first() + segment_offsets[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + if (segment_offsets[2] - segment_offsets[1] > 0) { + raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1], detail::transform_reduce_e_for_all_block_size, handle.get_device_properties().maxGridSize[0]); - auto block_result_buffer = - allocate_dataframe_buffer(update_grid.num_blocks, handle.get_stream()); - - detail::for_all_major_for_all_nbr_low_degree<<>>( - matrix_partition, - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - get_dataframe_buffer_begin(block_result_buffer), - e_op); - - // FIXME: we have several options to implement this. With cooperative group support - // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within - // the previous kernel. 
Using atomics at the end of the previous kernel is another option - // (sequentialization due to atomics may not be bad as different blocks may reach the - // synchronization point in varying timings and the number of SMs is not very big) - auto partial_result = - thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - get_dataframe_buffer_begin(block_result_buffer), - get_dataframe_buffer_begin(block_result_buffer) + update_grid.num_blocks, - T(), - [] __device__(T lhs, T rhs) { return plus_edge_op_result(lhs, rhs); }); - - result = plus_edge_op_result(result, partial_result); + detail::for_all_major_for_all_nbr_mid_degree<<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[1], + matrix_partition.get_major_first() + segment_offsets[2], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + if (segment_offsets[3] - segment_offsets[2] > 0) { + raft::grid_1d_thread_t update_grid(segment_offsets[3] - segment_offsets[2], + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_major_for_all_nbr_low_degree<<>>( + matrix_partition, + matrix_partition.get_major_first() + segment_offsets[2], + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } + } else { + if (matrix_partition.get_major_size() > 0) { + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_major_for_all_nbr_low_degree<<>>( + matrix_partition, + matrix_partition.get_major_first(), + matrix_partition.get_major_last(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + get_dataframe_buffer_begin(result_buffer), + e_op); + } } } + auto result = + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + get_dataframe_buffer_begin(result_buffer), + get_dataframe_buffer_begin(result_buffer) + 1, + T{}, + [] __device__(T lhs, T rhs) { return plus_edge_op_result(lhs, rhs); }); + if (GraphViewType::is_multi_gpu) { result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); } diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh index 4efd32bcac7..3d87f19969e 100644 --- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -25,23 +24,27 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -53,9 +56,7 @@ namespace experimental { namespace detail { -// FIXME: block size requires tuning -int32_t constexpr update_frontier_v_push_if_out_nbr_for_all_block_size = 128; -int32_t constexpr update_frontier_v_push_if_out_nbr_update_block_size = 128; +int32_t constexpr update_frontier_v_push_if_out_nbr_for_all_block_size = 512; template (thrust::distance(row_first, row_last)); auto const tid = threadIdx.x + 
blockIdx.x * blockDim.x;
- size_t idx = tid;
+ auto idx = static_cast(tid);
- while (idx < num_rows) {
+ while (idx < static_cast(thrust::distance(row_first, row_last))) {
vertex_t row = *(row_first + idx);
auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row);
vertex_t const* indices{nullptr};
weight_t const* weights{nullptr};
edge_t local_out_degree{};
thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset);
- for (vertex_t i = 0; i < local_out_degree; ++i) {
+ for (edge_t i = 0; i < local_out_degree; ++i) {
auto col = indices[i];
auto weight = weights != nullptr ? weights[i] : 1.0;
auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col);
@@ -115,22 +115,153 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree(
static_assert(sizeof(unsigned long long int) == sizeof(size_t));
auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), static_cast(1));
- *(buffer_key_output_first + buffer_idx) = col;
- *(buffer_payload_output_first + buffer_idx) =
- remove_first_thrust_tuple_element()(e_op_result);
+ *(buffer_key_output_first + buffer_idx) = col;
+ *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result);
}
}
- idx += gridDim.x * blockDim.x;
}
}
+template
+__global__ void for_all_frontier_row_for_all_nbr_mid_degree(
+ matrix_partition_device_t matrix_partition,
+ RowIterator row_first,
+ RowIterator row_last,
+ AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+ AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+ BufferKeyOutputIterator buffer_key_output_first,
+ BufferPayloadOutputIterator buffer_payload_output_first,
+ size_t* buffer_idx_ptr,
+ EdgeOp e_op)
+{
+ using vertex_t = typename GraphViewType::vertex_type;
+ using edge_t = typename GraphViewType::edge_type;
+ using weight_t = typename GraphViewType::weight_type;
+
+ static_assert(!GraphViewType::is_adj_matrix_transposed,
+ "GraphViewType should support the push model.");
+
+ auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
+ static_assert(update_frontier_v_push_if_out_nbr_for_all_block_size % raft::warp_size() == 0);
+ auto const lane_id = tid % raft::warp_size();
+ auto idx = static_cast(tid / raft::warp_size());
+
+ while (idx < static_cast(thrust::distance(row_first, row_last))) {
+ vertex_t row = *(row_first + idx);
+ auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row);
+ vertex_t const* indices{nullptr};
+ weight_t const* weights{nullptr};
+ edge_t local_out_degree{};
+ thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset);
+ for (edge_t i = lane_id; i < local_out_degree; i += raft::warp_size()) {
+ auto col = indices[i];
+ auto weight = weights != nullptr ? weights[i] : 1.0;
+ auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col);
+ auto e_op_result = evaluate_edge_op()
+ .compute(row,
+ col,
+ weight,
+ *(adj_matrix_row_value_input_first + row_offset),
+ *(adj_matrix_col_value_input_first + col_offset),
+ e_op);
+ if (thrust::get<0>(e_op_result) == true) {
+ // FIXME: This atomicAdd serializes execution. If we renumber vertices to ensure that rows
+ // within a partition are sorted by their out-degree in decreasing order, we can compute
+ // a tight upper bound for the maximum number of pushes per warp/block and use shared
+ // memory buffer to reduce the number of atomicAdd operations.
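The FIXME above refers to the push-buffer idiom shared by all three frontier kernels: a thread that finds a qualifying edge reserves an output slot by atomically bumping a global counter. A hedged CUDA sketch of that pattern follows (toy predicate and types; `filtered_append` is not a cuGraph function).

```cpp
// Each thread that finds a qualifying element reserves a slot with atomicAdd
// and writes into it; output order is unspecified, which is why the caller
// sorts and reduces the buffer afterwards.
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

__global__ void filtered_append(int const* input,
                                int n,
                                int* out_keys,
                                unsigned long long* out_count)
{
  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
       i += gridDim.x * blockDim.x) {
    if (input[i] % 2 == 0) {                        // stand-in for the e_op predicate
      auto slot      = atomicAdd(out_count, 1ULL);  // reserve one output slot
      out_keys[slot] = input[i];
    }
  }
}

int main()
{
  thrust::device_vector<int> input(100);
  thrust::sequence(input.begin(), input.end());  // 0..99
  thrust::device_vector<int> out_keys(input.size());
  thrust::device_vector<unsigned long long> count(1, 0ULL);

  filtered_append<<<4, 64>>>(thrust::raw_pointer_cast(input.data()),
                             static_cast<int>(input.size()),
                             thrust::raw_pointer_cast(out_keys.data()),
                             thrust::raw_pointer_cast(count.data()));
  cudaDeviceSynchronize();

  std::printf("pushed %llu keys\n",
              static_cast<unsigned long long>(count[0]));  // pushed 50 keys
  return 0;
}
```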
+ static_assert(sizeof(unsigned long long int) == sizeof(size_t));
+ auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr),
+ static_cast(1));
+ *(buffer_key_output_first + buffer_idx) = col;
+ *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result);
+ }
+ }
+
+ idx += gridDim.x * (blockDim.x / raft::warp_size());
+ }
+}
+
+template
+__global__ void for_all_frontier_row_for_all_nbr_high_degree(
+ matrix_partition_device_t matrix_partition,
+ RowIterator row_first,
+ RowIterator row_last,
+ AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+ AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+ BufferKeyOutputIterator buffer_key_output_first,
+ BufferPayloadOutputIterator buffer_payload_output_first,
+ size_t* buffer_idx_ptr,
+ EdgeOp e_op)
+{
+ using vertex_t = typename GraphViewType::vertex_type;
+ using edge_t = typename GraphViewType::edge_type;
+ using weight_t = typename GraphViewType::weight_type;
+
+ static_assert(!GraphViewType::is_adj_matrix_transposed,
+ "GraphViewType should support the push model.");
+
+ auto idx = static_cast(blockIdx.x);
+
+ while (idx < static_cast(thrust::distance(row_first, row_last))) {
+ vertex_t row = *(row_first + idx);
+ auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row);
+ vertex_t const* indices{nullptr};
+ weight_t const* weights{nullptr};
+ edge_t local_out_degree{};
+ thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset);
+ for (edge_t i = threadIdx.x; i < local_out_degree; i += blockDim.x) {
+ auto col = indices[i];
+ auto weight = weights != nullptr ? weights[i] : 1.0;
+ auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col);
+ auto e_op_result = evaluate_edge_op()
+ .compute(row,
+ col,
+ weight,
+ *(adj_matrix_row_value_input_first + row_offset),
+ *(adj_matrix_col_value_input_first + col_offset),
+ e_op);
+ if (thrust::get<0>(e_op_result) == true) {
+ // FIXME: This atomicAdd serializes execution. If we renumber vertices to ensure that rows
+ // within a partition are sorted by their out-degree in decreasing order, we can compute
+ // a tight upper bound for the maximum number of pushes per warp/block and use shared
+ // memory buffer to reduce the number of atomicAdd operations.
+ static_assert(sizeof(unsigned long long int) == sizeof(size_t));
+ auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr),
+ static_cast(1));
+ *(buffer_key_output_first + buffer_idx) = col;
+ *(buffer_payload_output_first + buffer_idx) = thrust::get<1>(e_op_result);
+ }
+ }
+
+ idx += gridDim.x;
+ }
+}
+
template
-size_t reduce_buffer_elements(raft::handle_t const& handle,
- BufferKeyOutputIterator buffer_key_output_first,
- BufferPayloadOutputIterator buffer_payload_output_first,
- size_t num_buffer_elements,
- ReduceOp reduce_op)
+size_t sort_and_reduce_buffer_elements(raft::handle_t const& handle,
+ BufferKeyOutputIterator buffer_key_output_first,
+ BufferPayloadOutputIterator buffer_payload_output_first,
+ size_t num_buffer_elements,
+ ReduceOp reduce_op)
{
thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
buffer_key_output_first,
@@ -155,8 +286,8 @@ size_t reduce_buffer_elements(raft::handle_t const& handle,
// temporary buffer size exceeds the maximum buffer size (may be defined as percentage of the
// system HBM size or a function of the maximum number of threads in the system))
// FIXME: actually, we can find how many unique keys are here by now.
- // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the - // vertex unless reduce_op is a pure function. + // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding + // the vertex unless reduce_op is a pure function. rmm::device_uvector keys(num_buffer_elements, handle.get_stream()); auto value_buffer = allocate_dataframe_buffer(num_buffer_elements, handle.get_stream()); @@ -182,93 +313,6 @@ size_t reduce_buffer_elements(raft::handle_t const& handle, } } -template -__global__ void update_frontier_and_vertex_output_values( - vertex_partition_device_t vertex_partition, - BufferKeyInputIterator buffer_key_input_first, - BufferPayloadInputIterator buffer_payload_input_first, - size_t num_buffer_elements, - VertexValueInputIterator vertex_value_input_first, - VertexValueOutputIterator vertex_value_output_first, - vertex_t** bucket_ptrs, - size_t* bucket_sizes_ptr, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - VertexOp v_op) -{ - static_assert(std::is_same::value_type, - vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = - cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if (idx < num_buffer_elements) { - key = *(buffer_key_input_first + idx); - auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); - auto v_val = *(vertex_value_input_first + key_offset); - auto payload = *(buffer_payload_input_first + idx); - auto v_op_result = v_op(v_val, payload); - selected_bucket_idx = thrust::get<0>(v_op_result); - if (selected_bucket_idx != invalid_bucket_idx) { - *(vertex_value_output_first + key_offset) = - remove_first_thrust_tuple_element()(v_op_result); - bucket_block_local_offsets[selected_bucket_idx] = 1; - } - } - - for (size_t i = 0; i < num_buckets; ++i) { - BlockScan(temp_storage) - .ExclusiveSum(bucket_block_local_offsets[i], - bucket_block_local_offsets[i], - bucket_block_aggregate_sizes[i]); - } - - if (threadIdx.x == 0) { - for (size_t i = 0; i < num_buckets; ++i) { - static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - bucket_block_start_offsets[i] = - atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), - static_cast(bucket_block_aggregate_sizes[i])); - } - } - - __syncthreads(); - - // FIXME: better use shared memory buffer to aggreaget global memory writes - if (selected_bucket_idx != invalid_bucket_idx) { - bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + - bucket_block_local_offsets[selected_bucket_idx]] = key; - } - - idx += gridDim.x * blockDim.x; - block_idx += gridDim.x; - } -} - } // namespace detail /** @@ -290,10 +334,12 @@ __global__ void update_frontier_and_vertex_output_values( * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. 
* @param graph_view Non-owning graph object.
- * @param vertex_first Iterator pointing to the first (inclusive) vertex in the current frontier. v
- * in [vertex_first, vertex_last) should be distinct (and should belong to this process in
- * multi-GPU), otherwise undefined behavior
- * @param vertex_last Iterator pointing to the last (exclusive) vertex in the current frontier.
+ * @param vertex_frontier VertexFrontier class object for vertex frontier management. This object
+ * includes multiple bucket objects.
+ * @param cur_frontier_bucket_idx Index of the VertexFrontier bucket holding vertices for the current
+ * iteration.
+ * @param next_frontier_bucket_indices Indices of the VertexFrontier buckets to store new frontier
+ * vertices for the next iteration.
* @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
* properties for the first (inclusive) row (assigned to this process in multi-GPU).
* `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
@@ -315,115 +361,103 @@
* (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last`
* (exclusive) is deduced as @p vertex_value_output_first + @p
* graph_view.get_number_of_local_vertices().
- * @param vertex_frontier vertex frontier class object for vertex frontier managements. This object
- * includes multiple bucket objects.
* @param v_op Binary operator takes *(@p vertex_value_input_first + i) (where i is [0, @p
* graph_view.get_number_of_local_vertices())) and reduced value of the @p e_op outputs for
* this vertex and returns the target bucket index (for frontier update) and new vertex property
- * values (to update *(@p vertex_value_output_first + i)).
+ * values (to update *(@p vertex_value_output_first + i)). The target bucket index should either be
+ * VertexFrontier::kInvalidBucketIdx or an index in @p next_frontier_bucket_indices.
*/
template
void update_frontier_v_push_if_out_nbr(
raft::handle_t const& handle,
GraphViewType const& graph_view,
- VertexIterator vertex_first,
- VertexIterator vertex_last,
+ VertexFrontierType& vertex_frontier,
+ size_t cur_frontier_bucket_idx,
+ std::vector const& next_frontier_bucket_indices,
AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
EdgeOp e_op,
ReduceOp reduce_op,
VertexValueInputIterator vertex_value_input_first,
VertexValueOutputIterator vertex_value_output_first,
- VertexFrontierType& vertex_frontier,
VertexOp v_op)
{
static_assert(!GraphViewType::is_adj_matrix_transposed,
"GraphViewType should support the push model.");
- using vertex_t = typename GraphViewType::vertex_type;
- using edge_t = typename GraphViewType::edge_type;
-
- // 1. fill the buffer
-
- vertex_frontier.set_buffer_idx_value(0);
+ using vertex_t = typename GraphViewType::vertex_type;
+ using edge_t = typename GraphViewType::edge_type;
+ using weight_t = typename GraphViewType::weight_type;
+ using payload_t = typename ReduceOp::type;
- auto loop_count = size_t{1};
- if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_size = row_comm.get_size();
- loop_count = graph_view.is_hypergraph_partitioned()
- ?
graph_view.get_number_of_local_adj_matrix_partitions() - : static_cast(row_comm_size); - } + auto cur_frontier_vertex_first = vertex_frontier.get_bucket(cur_frontier_bucket_idx).begin(); + auto cur_frontier_vertex_last = vertex_frontier.get_bucket(cur_frontier_bucket_idx).end(); - for (size_t i = 0; i < loop_count; ++i) { - matrix_partition_device_t matrix_partition( - graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + // 1. fill the buffer - rmm::device_uvector frontier_rows( - 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true + rmm::device_uvector keys(size_t{0}, handle.get_stream()); + auto payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); - size_t frontier_size{}; + rmm::device_uvector frontier_rows(0, handle.get_stream()); if (GraphViewType::is_multi_gpu) { - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_rank = row_comm.get_rank(); - auto const row_comm_size = row_comm.get_size(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_rank = col_comm.get_rank(); - auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank; - frontier_size = host_scalar_bcast( - graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - (static_cast(sub_comm_rank) == i) ? thrust::distance(vertex_first, vertex_last) - : size_t{0}, - i, - handle.get_stream()); - if (static_cast(sub_comm_rank) != i) { - frontier_rows.resize(frontier_size, handle.get_stream()); + auto frontier_size = + host_scalar_bcast(col_comm, + (static_cast(col_comm_rank) == i) + ? thrust::distance(cur_frontier_vertex_first, cur_frontier_vertex_last) + : size_t{0} /* dummy */, + i, + handle.get_stream()); + frontier_rows.resize(frontier_size, handle.get_stream()); + + if (static_cast(col_comm_rank) == i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + cur_frontier_vertex_first, + cur_frontier_vertex_last, + frontier_rows.begin()); } - device_bcast(graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, - vertex_first, + + device_bcast(col_comm, + cur_frontier_vertex_first, frontier_rows.begin(), frontier_size, i, handle.get_stream()); } else { - frontier_size = thrust::distance(vertex_first, vertex_last); + frontier_rows.resize(thrust::distance(cur_frontier_vertex_first, cur_frontier_vertex_last), + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + cur_frontier_vertex_first, + cur_frontier_vertex_last, + frontier_rows.begin()); } - auto max_pushes = - frontier_size > 0 - ? frontier_rows.size() > 0 - ? 
thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - frontier_rows.begin(), - frontier_rows.end(), - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()) - : thrust::transform_reduce( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_first, - vertex_last, - [matrix_partition] __device__(auto row) { - auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); - return matrix_partition.get_local_degree(row_offset); - }, - edge_t{0}, - thrust::plus()) - : edge_t{0}; + auto max_pushes = frontier_rows.size() > 0 + ? thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_rows.begin(), + frontier_rows.end(), + [matrix_partition] __device__(auto row) { + auto row_offset = + matrix_partition.get_major_offset_from_major_nocheck(row); + return matrix_partition.get_local_degree(row_offset); + }, + edge_t{0}, + thrust::plus()) + : edge_t{0}; // FIXME: This is highly pessimistic for single GPU (and multi-GPU as well if we maintain // additional per column data for filtering in e_op). If we can pause & resume execution if @@ -439,51 +473,113 @@ void update_frontier_v_push_if_out_nbr( // locking. // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature // to reserve address space. - vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first); - auto buffer_payload_first = std::get<1>(buffer_first); + keys.resize(buffer_idx.value(handle.get_stream()) + max_pushes, handle.get_stream()); + resize_dataframe_buffer(payload_buffer, keys.size(), handle.get_stream()); auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed ? vertex_t{0} : matrix_partition.get_major_value_start_offset(); - - // FIXME: This is highly inefficeint for graphs with high-degree vertices. If we renumber - // vertices to insure that rows within a partition are sorted by their out-degree in decreasing - // order, we will apply this kernel only to low out-degree vertices. 
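The `max_pushes` computation above bounds the output buffer allocation by the sum of the frontier rows' local out-degrees, since each frontier edge can push at most once. A small sketch of that bound on a toy CSR graph (assumed layout, not the cuGraph partition API); note that Thrust's `transform_reduce` takes `(unary_op, init, binary_op)`, unlike `std::transform_reduce`.

```cpp
// Upper bound on pushes = sum of out-degrees of frontier vertices.
// Compile with: nvcc --extended-lambda
#include <cstdio>
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>

int main()
{
  // toy CSR offsets for 5 vertices; degrees are 2, 0, 3, 1, 4
  std::vector<int> h_offsets{0, 2, 2, 5, 6, 10};
  std::vector<int> h_frontier{0, 2, 4};  // current frontier rows
  thrust::device_vector<int> offsets(h_offsets.begin(), h_offsets.end());
  thrust::device_vector<int> frontier(h_frontier.begin(), h_frontier.end());

  int const* d_offsets = thrust::raw_pointer_cast(offsets.data());
  auto max_pushes      = thrust::transform_reduce(
    frontier.begin(),
    frontier.end(),
    [d_offsets] __device__(int v) { return d_offsets[v + 1] - d_offsets[v]; },
    0,
    thrust::plus<int>{});

  std::printf("max pushes = %d\n", max_pushes);  // 2 + 3 + 4 = 9
  return 0;
}
```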
- if (frontier_size > 0) { - raft::grid_1d_thread_t for_all_low_degree_grid( - frontier_size, - detail::update_frontier_v_push_if_out_nbr_for_all_block_size, - handle.get_device_properties().maxGridSize[0]); - - if (frontier_rows.size() > 0) { - detail::for_all_frontier_row_for_all_nbr_low_degree<< 0) { + static_assert(detail::num_segments_per_vertex_partition == 3); + std::vector h_thresholds(detail::num_segments_per_vertex_partition - 1); + h_thresholds[0] = matrix_partition.get_major_first() + segment_offsets[1]; + h_thresholds[1] = matrix_partition.get_major_first() + segment_offsets[2]; + rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); + raft::update_device( + d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); + rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_rows.begin(), + frontier_rows.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + // FIXME: we may further improve performance by 1) concurrently running kernels on different + // segments; 2) individually tuning block sizes for different segments; and 3) adding one more + // segment for very high degree vertices and running segmented reduction + if (h_offsets[0] > 0) { + raft::grid_1d_block_t update_grid( + h_offsets[0], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_frontier_row_for_all_nbr_high_degree<<>>( + matrix_partition, + frontier_rows.begin(), + frontier_rows.begin() + h_offsets[0], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + if (h_offsets[1] - h_offsets[0] > 0) { + raft::grid_1d_warp_t update_grid( + h_offsets[1] - h_offsets[0], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_frontier_row_for_all_nbr_mid_degree<<>>( matrix_partition, - frontier_rows.begin(), + frontier_rows.begin() + h_offsets[0], + frontier_rows.begin() + h_offsets[1], + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), + e_op); + } + if (frontier_rows.size() - h_offsets[1] > 0) { + raft::grid_1d_thread_t update_grid( + frontier_rows.size() - h_offsets[1], + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( + matrix_partition, + frontier_rows.begin() + h_offsets[1], frontier_rows.end(), adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), e_op); - } else { - detail::for_all_frontier_row_for_all_nbr_low_degree<< 0) { + raft::grid_1d_thread_t update_grid( + frontier_rows.size(), + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); 
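The segmentation logic just above splits the frontier by degree class before launching per-class kernels. A host-side sketch of that step: frontier rows are sorted by vertex id, and (because vertices are assumed renumbered in decreasing-degree order) two id thresholds separate the high-, mid-, and low-degree segments; a vectorized `thrust::lower_bound` finds both split points at once. Toy values below, not the cuGraph partition API.

```cpp
#include <cstdio>
#include <vector>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>

int main()
{
  std::vector<int> h_frontier{1, 3, 7, 20, 150, 400, 900};  // sorted vertex ids
  std::vector<int> h_thresholds{10, 200};  // first mid-degree id, first low-degree id
  thrust::device_vector<int> frontier(h_frontier.begin(), h_frontier.end());
  thrust::device_vector<int> thresholds(h_thresholds.begin(), h_thresholds.end());
  thrust::device_vector<int> offsets(thresholds.size());

  thrust::lower_bound(frontier.begin(), frontier.end(),      // haystack
                      thresholds.begin(), thresholds.end(),  // needles
                      offsets.begin());                      // split points

  int high_end = offsets[0];  // 3: rows {1, 3, 7} get the block-per-row kernel
  int mid_end  = offsets[1];  // 5: rows {20, 150} get the warp-per-row kernel
  std::printf("high [0,%d) mid [%d,%d) low [%d,%zu)\n",
              high_end, high_end, mid_end, mid_end, h_frontier.size());
  return 0;
}
```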
+ + detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( matrix_partition, - vertex_first, - vertex_last, + frontier_rows.begin(), + frontier_rows.end(), adj_matrix_row_value_input_first + row_value_input_offset, adj_matrix_col_value_input_first, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_ptr(), + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.data(), e_op); } } @@ -491,19 +587,14 @@ void update_frontier_v_push_if_out_nbr( // 2. reduce the buffer - auto num_buffer_offset = edge_t{0}; - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - auto num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first, - buffer_payload_first, - vertex_frontier.get_buffer_idx_value(), - reduce_op); - + auto num_buffer_elements = + detail::sort_and_reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + buffer_idx.value(handle.get_stream()), + reduce_op); if (GraphViewType::is_multi_gpu) { + // FIXME: this step is unnecessary if row_comm_size== 1 auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -513,12 +604,9 @@ void update_frontier_v_push_if_out_nbr( auto const col_comm_rank = col_comm.get_rank(); auto const col_comm_size = col_comm.get_size(); - std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); + std::vector h_vertex_lasts(row_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - h_vertex_lasts[i] = graph_view.get_vertex_partition_last( - graph_view.is_hypergraph_partitioned() ? col_comm_rank * row_comm_size + i - : row_comm_rank * col_comm_size + i); + h_vertex_lasts[i] = graph_view.get_vertex_partition_last(col_comm_rank * row_comm_size + i); } rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); @@ -527,8 +615,8 @@ void update_frontier_v_push_if_out_nbr( rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), handle.get_stream()); thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - buffer_key_first, - buffer_key_first + num_buffer_elements, + keys.begin(), + keys.begin() + num_buffer_elements, d_vertex_lasts.begin(), d_vertex_lasts.end(), d_tx_buffer_last_boundaries.begin()); @@ -537,174 +625,135 @@ void update_frontier_v_push_if_out_nbr( d_tx_buffer_last_boundaries.data(), d_tx_buffer_last_boundaries.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + handle.get_stream_view().synchronize(); std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); - std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size - : col_comm_size); - std::vector count_requests(tx_counts.size() + rx_counts.size()); - size_t tx_self_i = std::numeric_limits::max(); - for (size_t i = 0; i < tx_counts.size(); ++i) { - auto comm_dst_rank = graph_view.is_hypergraph_partitioned() - ? 
col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - if (comm_dst_rank == comm_rank) { - tx_self_i = i; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[i] = std::numeric_limits::max(); - } else { - comm.isend(&tx_counts[i], 1, comm_dst_rank, 0 /* tag */, count_requests.data() + i); - } - } - for (size_t i = 0; i < rx_counts.size(); ++i) { - auto comm_src_rank = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - if (comm_src_rank == comm_rank) { - assert(tx_self_i != std::numeric_limits::max()); - rx_counts[i] = tx_counts[tx_self_i]; - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms - count_requests[tx_counts.size() + i] = std::numeric_limits::max(); - } else { - comm.irecv(&rx_counts[i], - 1, - comm_src_rank, - 0 /* tag */, - count_requests.data() + tx_counts.size() + i); - } - } - // FIXME: better define request_null (similar to MPI_REQUEST_NULL) under raft::comms, if - // raft::comms::wait immediately returns on seeing request_null, this remove is unnecessary - count_requests.erase(std::remove(count_requests.begin(), - count_requests.end(), - std::numeric_limits::max()), - count_requests.end()); - comm.waitall(count_requests.size(), count_requests.data()); - - std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); - std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); - std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); - std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); - - // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve - // address space. - // FIXME: std::max(actual size, 1) as ncclRecv currently hangs if recvuff is nullptr even if - // count is 0 - vertex_frontier.resize_buffer(std::max(num_buffer_elements + rx_offsets.back(), size_t(1))); - - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - std::vector tx_dst_ranks(tx_counts.size()); - std::vector rx_src_ranks(rx_counts.size()); - for (size_t i = 0; i < tx_dst_ranks.size(); ++i) { - tx_dst_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : row_comm_rank * col_comm_size + static_cast(i); - } - for (size_t i = 0; i < rx_src_ranks.size(); ++i) { - rx_src_ranks[i] = graph_view.is_hypergraph_partitioned() - ? col_comm_rank * row_comm_size + static_cast(i) - : static_cast(i) * row_comm_size + comm_rank / col_comm_size; - } - - device_multicast_sendrecv( - comm, - buffer_key_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_key_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - device_multicast_sendrecv( - comm, - buffer_payload_first, - tx_counts, - tx_offsets, - tx_dst_ranks, - buffer_payload_first + num_buffer_elements, - rx_counts, - rx_offsets, - rx_src_ranks, - handle.get_stream()); - - // FIXME: this does not exploit the fact that each segment is sorted. Lost performance - // optimization opportunities. - // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + - // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). 
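For reference, `sort_and_reduce_buffer_elements` (called again right after the shuffle below) combines duplicate keys, which arise when several source vertices push to the same destination. A hedged sketch on toy int keys/payloads; the real code operates on dataframe buffers with a user-supplied `reduce_op`.

```cpp
// Sort (key, payload) pairs by key, then combine equal-key payloads.
#include <cstdio>
#include <vector>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>

int main()
{
  std::vector<int> h_keys{5, 2, 5, 2, 9};
  std::vector<int> h_payloads{10, 20, 30, 40, 50};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  thrust::device_vector<int> payloads(h_payloads.begin(), h_payloads.end());

  thrust::sort_by_key(keys.begin(), keys.end(), payloads.begin());

  thrust::device_vector<int> out_keys(keys.size());
  thrust::device_vector<int> out_payloads(keys.size());
  auto ends = thrust::reduce_by_key(keys.begin(), keys.end(), payloads.begin(),
                                    out_keys.begin(), out_payloads.begin(),
                                    thrust::equal_to<int>{},
                                    thrust::plus<int>{});  // stand-in reduce_op

  auto num_unique = static_cast<int>(ends.first - out_keys.begin());
  for (int i = 0; i < num_unique; ++i) {
    std::printf("key %d -> %d\n", static_cast<int>(out_keys[i]),
                static_cast<int>(out_payloads[i]));  // (2,60) (5,40) (9,50)
  }
  return 0;
}
```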
- num_buffer_offset = num_buffer_elements; - num_buffer_elements = detail::reduce_buffer_elements(handle, - buffer_key_first + num_buffer_elements, - buffer_payload_first + num_buffer_elements, - rx_offsets.back(), - reduce_op); + rmm::device_uvector rx_keys(size_t{0}, handle.get_stream()); + std::tie(rx_keys, std::ignore) = + shuffle_values(row_comm, keys.begin(), tx_counts, handle.get_stream()); + keys = std::move(rx_keys); + + auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = + shuffle_values(row_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + + num_buffer_elements = + detail::sort_and_reduce_buffer_elements(handle, + keys.begin(), + get_dataframe_buffer_begin(payload_buffer), + keys.size(), + reduce_op); } // 3. update vertex properties if (num_buffer_elements > 0) { - auto buffer_first = vertex_frontier.buffer_begin(); - auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; - auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; - - raft::grid_1d_thread_t update_grid(num_buffer_elements, - detail::update_frontier_v_push_if_out_nbr_update_block_size, - handle.get_device_properties().maxGridSize[0]); - - auto constexpr invalid_vertex = invalid_vertex_id::value; + static_assert(VertexFrontierType::kNumBuckets <= std::numeric_limits::max()); + rmm::device_uvector bucket_indices(num_buffer_elements, handle.get_stream()); vertex_partition_device_t vertex_partition(graph_view); - auto bucket_and_bucket_size_device_ptrs = - vertex_frontier.get_bucket_and_bucket_size_device_pointers(); - detail::update_frontier_and_vertex_output_values - <<>>( - vertex_partition, - buffer_key_first, - buffer_payload_first, - num_buffer_elements, - vertex_value_input_first, - vertex_value_output_first, - std::get<0>(bucket_and_bucket_size_device_ptrs), - std::get<1>(bucket_and_bucket_size_device_ptrs), - VertexFrontierType::kInvalidBucketIdx, - invalid_vertex, - v_op); - - auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); - std::vector bucket_sizes(VertexFrontierType::kNumBuckets); - raft::update_host(bucket_sizes.data(), - bucket_sizes_device_ptr, - VertexFrontierType::kNumBuckets, - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - for (size_t i = 0; i < VertexFrontierType::kNumBuckets; ++i) { - vertex_frontier.get_bucket(i).set_size(bucket_sizes[i]); + auto key_payload_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(keys.begin(), get_dataframe_buffer_begin(payload_buffer))); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_payload_pair_first, + key_payload_pair_first + num_buffer_elements, + bucket_indices.begin(), + [vertex_value_input_first, + vertex_value_output_first, + v_op, + vertex_partition, + invalid_bucket_idx = VertexFrontierType::kInvalidBucketIdx] __device__(auto pair) { + auto key = thrust::get<0>(pair); + auto payload = thrust::get<1>(pair); + auto key_offset = vertex_partition.get_local_vertex_offset_from_vertex_nocheck(key); + auto v_val = *(vertex_value_input_first + key_offset); + auto v_op_result = v_op(v_val, payload); + auto bucket_idx = thrust::get<0>(v_op_result); + if (bucket_idx != invalid_bucket_idx) { + *(vertex_value_output_first + key_offset) = thrust::get<1>(v_op_result); + return static_cast(bucket_idx); + } else { + return std::numeric_limits::max(); + } + 
});
+
+ resize_dataframe_buffer(payload_buffer, size_t{0}, handle.get_stream());
+ shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream());
+
+ auto bucket_key_pair_first =
+ thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), keys.begin()));
+ keys.resize(thrust::distance(
+ bucket_key_pair_first,
+ thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ bucket_key_pair_first,
+ bucket_key_pair_first + num_buffer_elements,
+ [] __device__(auto pair) {
+ return thrust::get<0>(pair) ==
+ std::numeric_limits::max();
+ })),
+ handle.get_stream());
+ bucket_indices.resize(keys.size(), handle.get_stream());
+ keys.shrink_to_fit(handle.get_stream());
+ bucket_indices.shrink_to_fit(handle.get_stream());
+
+ bucket_key_pair_first =
+ thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), keys.begin()));
+ if (next_frontier_bucket_indices.size() == 1) {
+ vertex_frontier.get_bucket(next_frontier_bucket_indices[0]).insert(keys.begin(), keys.size());
+ } else if (next_frontier_bucket_indices.size() == 2) {
+ auto first_bucket_size = thrust::distance(
+ bucket_key_pair_first,
+ thrust::stable_partition( // stable_partition to maintain sorted order within each bucket
+ rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ bucket_key_pair_first,
+ bucket_key_pair_first + bucket_indices.size(),
+ [first_bucket_idx = static_cast(next_frontier_bucket_indices[0])] __device__(
+ auto pair) { return thrust::get<0>(pair) == first_bucket_idx; }));
+ vertex_frontier.get_bucket(next_frontier_bucket_indices[0])
+ .insert(keys.begin(), first_bucket_size);
+ vertex_frontier.get_bucket(next_frontier_bucket_indices[1])
+ .insert(keys.begin() + first_bucket_size,
+ thrust::distance(keys.begin() + first_bucket_size, keys.end()));
+ } else {
+ thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ bucket_key_pair_first,
+ bucket_key_pair_first + bucket_indices.size());
+ rmm::device_uvector d_indices(next_frontier_bucket_indices.size(),
+ handle.get_stream());
+ rmm::device_uvector d_counts(d_indices.size(), handle.get_stream());
+ auto it =
+ thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ bucket_indices.begin(),
+ bucket_indices.end(),
+ thrust::make_constant_iterator(size_t{1}),
+ d_indices.begin(),
+ d_counts.begin());
+ d_indices.resize(thrust::distance(d_indices.begin(), thrust::get<0>(it)),
+ handle.get_stream());
+ d_counts.resize(d_indices.size(), handle.get_stream());
+ std::vector h_indices(d_indices.size());
+ std::vector h_counts(h_indices.size());
+ raft::update_host(h_indices.data(), d_indices.data(), d_indices.size(), handle.get_stream());
+ raft::update_host(h_counts.data(), d_counts.data(), d_counts.size(), handle.get_stream());
+ handle.get_stream_view().synchronize();
+ std::vector h_offsets(h_indices.size(), 0);
+ std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_offsets.begin() + 1);
+ for (size_t i = 0; i < h_indices.size(); ++i) {
+ if (h_counts[i] > 0) {
+ vertex_frontier.get_bucket(h_indices[i]).insert(keys.begin() + h_offsets[i], h_counts[i]);
+ }
+ }
+ }
+ }
+}
-/*
-
-FIXME:
-
-iterating over lower triangular (or upper triangular) : triangle counting
-LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and
-degree(j) : triangle counting
-push-pull switching support (e.g.
DOBFS), in this case, we need both -CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) -if graph is symmetric, there will be additional optimization opportunities (e.g. in-degree == -out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may -work better we can use thrust::set_intersection for triangle counting think about adding thrust -wrappers for reduction functions. Can I pass nullptr for dummy -instead of thrust::make_counting_iterator(0)? -*/ - } // namespace experimental } // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh index c11142d3cf7..4758334e9fc 100644 --- a/cpp/include/patterns/vertex_frontier.cuh +++ b/cpp/include/patterns/vertex_frontier.cuh @@ -24,8 +24,7 @@ #include #include -#include -#include +#include #include #include @@ -37,149 +36,80 @@ namespace cugraph { namespace experimental { -namespace detail { - -// FIXME: block size requires tuning -int32_t constexpr move_and_invalidate_if_block_size = 128; - -// FIXME: better move to another file for reusability -inline size_t round_up(size_t number_to_round, size_t modulus) -{ - return ((number_to_round + (modulus - 1)) / modulus) * modulus; -} - -template -auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, - size_t offset, - std::index_sequence) -{ - auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; - auto payload_it = thrust::make_zip_iterator( - thrust::make_tuple(reinterpret_cast::type*>( - buffer_ptrs[1 + Is])...)); - return std::make_tuple(key_ptr, payload_it); -} - -template -auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) -{ - size_t constexpr tuple_size = thrust::tuple_size::value; - return make_buffer_zip_iterator_impl( - buffer_ptrs, offset, std::make_index_sequence()); -} - -template -__global__ void move_and_invalidate_if(RowIterator row_first, - RowIterator row_last, - vertex_t** bucket_ptrs, - size_t* bucket_sizes_ptr, - size_t this_bucket_idx, - size_t invalid_bucket_idx, - vertex_t invalid_vertex, - SplitOp split_op) -{ - static_assert( - std::is_same::value_type, vertex_t>::value); - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_t idx = tid; - size_t block_idx = blockIdx.x; - auto num_elements = thrust::distance(row_first, row_last); - // FIXME: it might be more performant to process more than one element per thread - auto num_blocks = (num_elements + blockDim.x - 1) / blockDim.x; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - - __shared__ size_t bucket_block_start_offsets[num_buckets]; - - size_t bucket_block_local_offsets[num_buckets]; - size_t bucket_block_aggregate_sizes[num_buckets]; - - while (block_idx < num_blocks) { - for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } - - size_t selected_bucket_idx{invalid_bucket_idx}; - vertex_t key{invalid_vertex}; - - if (idx < num_elements) { - key = *(row_first + idx); - selected_bucket_idx = split_op(key); - if (selected_bucket_idx != this_bucket_idx) { - *(row_first + idx) = invalid_vertex; - if (selected_bucket_idx != invalid_bucket_idx) { - bucket_block_local_offsets[selected_bucket_idx] = 1; - } - } - } - - for (size_t i = 0; i < num_buckets; ++i) { - BlockScan(temp_storage) - .ExclusiveSum(bucket_block_local_offsets[i], - bucket_block_local_offsets[i], - bucket_block_aggregate_sizes[i]); - } - - if (threadIdx.x == 0) { - for (size_t i = 0; i < num_buckets; ++i) { - 
static_assert(sizeof(unsigned long long int) == sizeof(size_t)); - bucket_block_start_offsets[i] = - atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), - static_cast(bucket_block_aggregate_sizes[i])); - } - } - - __syncthreads(); - - // FIXME: better use shared memory buffer to aggreaget global memory writes - if ((selected_bucket_idx != this_bucket_idx) && (selected_bucket_idx != invalid_bucket_idx)) { - bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + - bucket_block_local_offsets[selected_bucket_idx]] = key; - } - - idx += gridDim.x * blockDim.x; - block_idx += gridDim.x; - } -} - -} // namespace detail - template -class Bucket { +class SortedUniqueElementBucket { public: - Bucket(raft::handle_t const& handle, size_t capacity) - : handle_ptr_(&handle), elements_(capacity, handle.get_stream()) + SortedUniqueElementBucket(raft::handle_t const& handle) + : handle_ptr_(&handle), elements_(0, handle.get_stream()) { - thrust::fill(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), - elements_.begin(), - elements_.end(), - invalid_vertex_id::value); } void insert(vertex_t v) { - raft::update_device(elements_.data() + size_, &v, 1, handle_ptr_->get_stream()); - ++size_; + if (elements_.size() > 0) { + rmm::device_scalar vertex(v, handle_ptr_->get_stream()); + insert(vertex.data(), vertex_t{1}); + } else { + elements_.resize(1, handle_ptr_->get_stream()); + raft::update_device(elements_.data(), &v, size_t{1}, handle_ptr_->get_stream()); + } } - size_t size() const { return size_; } + /** + * @ brief insert a list of vertices to the bucket + * + * @param sorted_unique_vertices Device pointer to the array storing the vertex list. + * @param num_sorted_unique_vertices Size of the vertex list to insert. 
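+   *
+   * The input is assumed to be sorted in ascending order and free of duplicates; insert()
+   * preserves the bucket's sorted-unique invariant by merging (thrust::merge) and
+   * de-duplicating (thrust::unique) against the bucket's current contents.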
+ */ + void insert(vertex_t const* sorted_unique_vertices, vertex_t num_sorted_unique_vertices) + { + if (elements_.size() > 0) { + rmm::device_uvector merged_vertices(elements_.size() + num_sorted_unique_vertices, + handle_ptr_->get_stream()); + thrust::merge(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + elements_.begin(), + elements_.end(), + sorted_unique_vertices, + sorted_unique_vertices + num_sorted_unique_vertices, + merged_vertices.begin()); + merged_vertices.resize( + thrust::distance( + merged_vertices.begin(), + thrust::unique(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + merged_vertices.begin(), + merged_vertices.end())), + handle_ptr_->get_stream()); + merged_vertices.shrink_to_fit(handle_ptr_->get_stream()); + elements_ = std::move(merged_vertices); + } else { + elements_.resize(num_sorted_unique_vertices, handle_ptr_->get_stream()); + thrust::copy(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + sorted_unique_vertices, + sorted_unique_vertices + num_sorted_unique_vertices, + elements_.begin()); + } + } - void set_size(size_t size) { size_ = size; } + size_t size() const { return elements_.size(); } template std::enable_if_t aggregate_size() const { - return host_scalar_allreduce(handle_ptr_->get_comms(), size_, handle_ptr_->get_stream()); + return host_scalar_allreduce( + handle_ptr_->get_comms(), elements_.size(), handle_ptr_->get_stream()); } template std::enable_if_t aggregate_size() const { - return size_; + return elements_.size(); } - void clear() { size_ = 0; } + void resize(size_t size) { elements_.resize(size, handle_ptr_->get_stream()); } + + void clear() { elements_.resize(0, handle_ptr_->get_stream()); } - size_t capacity() const { return elements_.size(); } + void shrink_to_fit() { elements_.shrink_to_fit(handle_ptr_->get_stream()); } auto const data() const { return elements_.data(); } @@ -189,51 +119,32 @@ class Bucket { auto begin() { return elements_.begin(); } - auto const end() const { return elements_.begin() + size_; } + auto const end() const { return elements_.end(); } - auto end() { return elements_.begin() + size_; } + auto end() { return elements_.end(); } private: raft::handle_t const* handle_ptr_{nullptr}; rmm::device_uvector elements_; - size_t size_{0}; }; -template +template class VertexFrontier { public: static size_t constexpr kNumBuckets = num_buckets; static size_t constexpr kInvalidBucketIdx{std::numeric_limits::max()}; - VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) - : handle_ptr_(&handle), - tmp_bucket_ptrs_(num_buckets, handle.get_stream()), - tmp_bucket_sizes_(num_buckets, handle.get_stream()), - buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), - buffer_idx_(0, handle_ptr_->get_stream()) + VertexFrontier(raft::handle_t const& handle) : handle_ptr_(&handle) { - CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, - "invalid input argument bucket_capacities (size mismatch)"); - thrust::fill(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), - tmp_bucket_ptrs_.begin(), - tmp_bucket_ptrs_.end(), - static_cast(nullptr)); - thrust::fill(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), - tmp_bucket_sizes_.begin(), - tmp_bucket_sizes_.end(), - size_t{0}); - for (size_t i = 0; i < num_buckets; ++i) { - buckets_.emplace_back(handle, bucket_capacities[i]); - } - buffer_.set_stream(handle_ptr_->get_stream()); + for (size_t i = 0; 
i < num_buckets; ++i) { buckets_.emplace_back(handle); } } - Bucket<vertex_t, is_multi_gpu>& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } + SortedUniqueElementBucket<vertex_t, is_multi_gpu>& get_bucket(size_t bucket_idx) + { + return buckets_[bucket_idx]; + } - Bucket<vertex_t, is_multi_gpu> const& get_bucket(size_t bucket_idx) const + SortedUniqueElementBucket<vertex_t, is_multi_gpu> const& get_bucket(size_t bucket_idx) const { return buckets_[bucket_idx]; } @@ -244,157 +155,111 @@ class VertexFrontier { } template <typename SplitOp> - void split_bucket(size_t bucket_idx, SplitOp split_op) + void split_bucket(size_t this_bucket_idx, + std::vector<size_t> const& move_to_bucket_indices, + SplitOp split_op) { - auto constexpr invalid_vertex = invalid_vertex_id<vertex_t>::value; - - auto bucket_and_bucket_size_device_ptrs = get_bucket_and_bucket_size_device_pointers(); - - auto& this_bucket = get_bucket(bucket_idx); + auto& this_bucket = get_bucket(this_bucket_idx); if (this_bucket.size() > 0) { - raft::grid_1d_thread_t move_and_invalidate_if_grid( - this_bucket.size(), - detail::move_and_invalidate_if_block_size, - handle_ptr_->get_device_properties().maxGridSize[0]); - - detail::move_and_invalidate_if - <<<move_and_invalidate_if_grid.num_blocks, move_and_invalidate_if_grid.block_size, 0, handle_ptr_->get_stream()>>>(this_bucket.begin(), - this_bucket.end(), - std::get<0>(bucket_and_bucket_size_device_ptrs), - std::get<1>(bucket_and_bucket_size_device_ptrs), - bucket_idx, - kInvalidBucketIdx, - invalid_vertex, - split_op); - } + static_assert(kNumBuckets <= std::numeric_limits<uint8_t>::max()); + rmm::device_uvector<uint8_t> bucket_indices(this_bucket.size(), handle_ptr_->get_stream()); + thrust::transform( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + this_bucket.begin(), + this_bucket.end(), + bucket_indices.begin(), + [split_op] __device__(auto v) { return static_cast<uint8_t>(split_op(v)); }); + + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), this_bucket.begin())); + this_bucket.resize(thrust::distance( + pair_first, + thrust::remove_if( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_first + bucket_indices.size(), + [invalid_bucket_idx = static_cast<uint8_t>(kInvalidBucketIdx)] __device__(auto pair) { + return thrust::get<0>(pair) == invalid_bucket_idx; + }))); + bucket_indices.resize(this_bucket.size(), handle_ptr_->get_stream()); + this_bucket.shrink_to_fit(); + bucket_indices.shrink_to_fit(handle_ptr_->get_stream()); + + pair_first = + thrust::make_zip_iterator(thrust::make_tuple(bucket_indices.begin(), this_bucket.begin())); + auto new_this_bucket_size = thrust::distance( + pair_first, + thrust::stable_partition( // stable_partition to maintain sorted order within each bucket + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first, + pair_first + bucket_indices.size(), + [this_bucket_idx = static_cast<uint8_t>(this_bucket_idx)] __device__(auto pair) { + return thrust::get<0>(pair) == this_bucket_idx; + })); + + if (move_to_bucket_indices.size() == 1) { + get_bucket(move_to_bucket_indices[0]) + .insert(this_bucket.begin() + new_this_bucket_size, + thrust::distance(this_bucket.begin() + new_this_bucket_size, this_bucket.end())); + } else if (move_to_bucket_indices.size() == 2) { + auto next_bucket_size = thrust::distance( + pair_first + new_this_bucket_size, + thrust::stable_partition( // stable_partition to maintain sorted order within each bucket + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first + new_this_bucket_size, + pair_first + bucket_indices.size(), + [next_bucket_idx = static_cast<uint8_t>(move_to_bucket_indices[0])] __device__( + auto
pair) { return thrust::get<0>(pair) == next_bucket_idx; })); + get_bucket(move_to_bucket_indices[0]) + .insert(this_bucket.begin() + new_this_bucket_size, next_bucket_size); + get_bucket(move_to_bucket_indices[1]) + .insert(this_bucket.begin() + new_this_bucket_size + next_bucket_size, + thrust::distance(this_bucket.begin() + new_this_bucket_size + next_bucket_size, + this_bucket.end())); + } else { + thrust::sort(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + pair_first + new_this_bucket_size, + pair_first + bucket_indices.size()); + rmm::device_uvector d_indices(move_to_bucket_indices.size(), + handle_ptr_->get_stream()); + rmm::device_uvector d_counts(d_indices.size(), handle_ptr_->get_stream()); + auto it = thrust::reduce_by_key( + rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + bucket_indices.begin() + new_this_bucket_size, + bucket_indices.end(), + thrust::make_constant_iterator(size_t{1}), + d_indices.begin(), + d_counts.begin()); + d_indices.resize(thrust::distance(d_indices.begin(), thrust::get<0>(it)), + handle_ptr_->get_stream()); + d_counts.resize(d_indices.size(), handle_ptr_->get_stream()); + std::vector h_indices(d_indices.size()); + std::vector h_counts(h_indices.size()); + raft::update_host( + h_indices.data(), d_indices.data(), d_indices.size(), handle_ptr_->get_stream()); + raft::update_host( + h_counts.data(), d_counts.data(), d_counts.size(), handle_ptr_->get_stream()); + handle_ptr_->get_stream_view().synchronize(); + std::vector h_offsets(h_indices.size(), 0); + std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_offsets.begin() + 1); + for (size_t i = 0; i < h_indices.size(); ++i) { + if (h_counts[i] > 0) { + get_bucket(h_indices[i]) + .insert(this_bucket.begin() + new_this_bucket_size + h_offsets[i], h_counts[i]); + } + } + } - // FIXME: if we adopt CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups - // and global sync(), we can merge this step with the above kernel (and rename the above kernel - // to move_if) - auto it = - thrust::remove_if(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), - get_bucket(bucket_idx).begin(), - get_bucket(bucket_idx).end(), - [] __device__(auto value) { return value == invalid_vertex; }); - - auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); - std::vector bucket_sizes(kNumBuckets); - raft::update_host( - bucket_sizes.data(), bucket_sizes_device_ptr, kNumBuckets, handle_ptr_->get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle_ptr_->get_stream())); - for (size_t i = 0; i < kNumBuckets; ++i) { - if (i != bucket_idx) { get_bucket(i).set_size(bucket_sizes[i]); } + this_bucket.resize(new_this_bucket_size); + this_bucket.shrink_to_fit(); } - auto size = thrust::distance(get_bucket(bucket_idx).begin(), it); - get_bucket(bucket_idx).set_size(size); - return; } - auto get_bucket_and_bucket_size_device_pointers() - { - std::vector tmp_ptrs(buckets_.size(), nullptr); - std::vector tmp_sizes(buckets_.size(), 0); - for (size_t i = 0; i < buckets_.size(); ++i) { - tmp_ptrs[i] = get_bucket(i).data(); - tmp_sizes[i] = get_bucket(i).size(); - } - raft::update_device( - tmp_bucket_ptrs_.data(), tmp_ptrs.data(), tmp_ptrs.size(), handle_ptr_->get_stream()); - raft::update_device( - tmp_bucket_sizes_.data(), tmp_sizes.data(), tmp_sizes.size(), handle_ptr_->get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle_ptr_->get_stream())); - return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); - } 
- - void resize_buffer(size_t size) - { - // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary - // in this case. - buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); - if (size > buffer_capacity_) { - buffer_capacity_ = size; - update_buffer_ptrs(); - } - buffer_size_ = size; - } - - void clear_buffer() { resize_buffer(0); } - - void shrink_to_fit_buffer() - { - if (buffer_size_ != buffer_capacity_) { - // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is - // unnecessary in this case. - buffer_.shrink_to_fit(handle_ptr_->get_stream()); - update_buffer_ptrs(); - buffer_capacity_ = buffer_size_; - } - } - - auto buffer_begin() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); - } - - auto buffer_end() - { - return detail::make_buffer_zip_iterator(buffer_ptrs_, - buffer_size_); - } - - auto get_buffer_idx_ptr() { return buffer_idx_.data(); } - - size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } - - void set_buffer_idx_value(size_t value) - { - buffer_idx_.set_value(value, handle_ptr_->get_stream()); - } - private: - static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; - static size_t constexpr kBufferAlignment = 128; - raft::handle_t const* handle_ptr_{nullptr}; - std::vector> buckets_{}; - rmm::device_uvector tmp_bucket_ptrs_; - rmm::device_uvector tmp_bucket_sizes_; - - std::array tuple_element_sizes_ = - compute_thrust_tuple_element_sizes()(); - std::vector buffer_ptrs_{}; - rmm::device_buffer buffer_{}; - size_t buffer_size_{0}; - size_t buffer_capacity_{0}; - rmm::device_scalar buffer_idx_{}; - - // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer - size_t compute_aggregate_buffer_size_in_bytes(size_t size) - { - size_t aggregate_buffer_size_in_bytes = - detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - aggregate_buffer_size_in_bytes += - detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); - } - return aggregate_buffer_size_in_bytes; - } - - void update_buffer_ptrs() - { - uintptr_t ptr = reinterpret_cast(buffer_.data()); - buffer_ptrs_[0] = reinterpret_cast(ptr); - ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); - for (size_t i = 0; i < kReduceInputTupleSize; ++i) { - buffer_ptrs_[1 + i] = reinterpret_cast(ptr); - ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); - } - } + std::vector> buckets_{}; }; } // namespace experimental diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index 5ca58ebeb17..481717d7c38 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -58,13 +58,18 @@ collect_values_for_keys(raft::comms::comms_t const &comm, double constexpr load_factor = 0.7; // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary - // search based approach + // search based approach (especially when thrust::distance(collect_key_first, collect_key_last) << + // thrust::distance(map_key_first, map_key_last) // 1. build a cuco::static_map object for the map k, v pairs. 
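  // (the map is keyed by vertex id, with invalid_vertex_id<vertex_t>::value used as both the
  //  empty-key and empty-value sentinel passed to the constructor below; capacity is sized for
  //  a 0.7 load factor so that open-addressing probe sequences stay short)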
auto kv_map_ptr = std::make_unique>( - static_cast(static_cast(thrust::distance(map_key_first, map_key_last)) / - load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast(thrust::distance(map_key_first, map_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -73,7 +78,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } } // 2. collect values for the unique keys in [collect_key_first, collect_key_last) @@ -82,9 +91,6 @@ collect_values_for_keys(raft::comms::comms_t const &comm, stream); thrust::copy( rmm::exec_policy(stream)->on(stream), collect_key_first, collect_key_last, unique_keys.begin()); - // FIXME: sort and unique are unnecessary if the keys in [collect_key_first, collect_key_last) are - // already unique, if this cost becomes a performance bottlenec, we may add - // collect_values_for_unique_keys in the future thrust::sort(rmm::exec_policy(stream)->on(stream), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( @@ -107,8 +113,12 @@ collect_values_for_keys(raft::comms::comms_t const &comm, CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream - kv_map_ptr->find( - rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } rmm::device_uvector rx_values_for_unique_keys(0, stream); std::tie(rx_values_for_unique_keys, std::ignore) = @@ -125,7 +135,11 @@ collect_values_for_keys(raft::comms::comms_t const &comm, kv_map_ptr.reset(); kv_map_ptr = std::make_unique>( - static_cast(static_cast(unique_keys.size()) / load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); { @@ -136,15 +150,154 @@ collect_values_for_keys(raft::comms::comms_t const &comm, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } } // 4. 
find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer<value_t>( thrust::distance(collect_key_first, collect_key_last), stream); - kv_map_ptr->find( - collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(collect_key_first, collect_key_last) > 0) { + kv_map_ptr->find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer)); + } + + return value_buffer; +} + +// for key = [map_key_first, map_key_last), key_to_gpu_id_op(key) should coincide with +// comm.get_rank() +template <typename VertexIterator0, + typename VertexIterator1, + typename ValueIterator, + typename KeyToGPUIdOp> +decltype(allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>( + 0, cudaStream_t{nullptr})) +collect_values_for_unique_keys(raft::comms::comms_t const &comm, + VertexIterator0 map_key_first, + VertexIterator0 map_key_last, + ValueIterator map_value_first, + VertexIterator1 collect_unique_key_first, + VertexIterator1 collect_unique_key_last, + KeyToGPUIdOp key_to_gpu_id_op, + cudaStream_t stream) +{ + using vertex_t = typename std::iterator_traits<VertexIterator0>::value_type; + static_assert( + std::is_same<typename std::iterator_traits<VertexIterator1>::value_type, vertex_t>::value); + using value_t = typename std::iterator_traits<ValueIterator>::value_type; + + double constexpr load_factor = 0.7; + + // FIXME: we may compare the performance & memory footprint of this hash based approach vs binary + // search based approach (especially when thrust::distance(collect_unique_key_first, + // collect_unique_key_last) << thrust::distance(map_key_first, map_key_last)) + + // 1. build a cuco::static_map object for the map k, v pairs. + + auto kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast<size_t>( + static_cast<double>(thrust::distance(map_key_first, map_key_last)) / load_factor), + static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1), + invalid_vertex_id<vertex_t>::value, + invalid_vertex_id<vertex_t>::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvalidConfiguration. + if (thrust::distance(map_key_first, map_key_last) > 0) { + kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + } + } + + // 2.
collect values for the unique keys in [collect_unique_key_first, collect_unique_key_last) + + rmm::device_uvector unique_keys( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + thrust::copy(rmm::exec_policy(stream)->on(stream), + collect_unique_key_first, + collect_unique_key_last, + unique_keys.begin()); + + rmm::device_uvector values_for_unique_keys(0, stream); + { + rmm::device_uvector rx_unique_keys(0, stream); + std::vector rx_value_counts{}; + std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values( + comm, + unique_keys.begin(), + unique_keys.end(), + [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, + stream); + + rmm::device_uvector values_for_rx_unique_keys(rx_unique_keys.size(), stream); + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (rx_unique_keys.size() > 0) { + kv_map_ptr->find( + rx_unique_keys.begin(), rx_unique_keys.end(), values_for_rx_unique_keys.begin()); + } + + rmm::device_uvector rx_values_for_unique_keys(0, stream); + std::tie(rx_values_for_unique_keys, std::ignore) = + shuffle_values(comm, values_for_rx_unique_keys.begin(), rx_value_counts, stream); + + values_for_unique_keys = std::move(rx_values_for_unique_keys); + } + + // 3. re-build a cuco::static_map object for the k, v pairs in unique_keys, + // values_for_unique_keys. + + CUDA_TRY(cudaStreamSynchronize(stream)); // cuco::static_map currently does not take stream + + kv_map_ptr.reset(); + + kv_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_keys.size()) / load_factor), + unique_keys.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + { + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(unique_keys.begin(), values_for_unique_keys.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (unique_keys.size() > 0) { kv_map_ptr->insert(pair_first, pair_first + unique_keys.size()); } + } + + // 4. find values for [collect_unique_key_first, collect_unique_key_last) + + auto value_buffer = allocate_dataframe_buffer( + thrust::distance(collect_unique_key_first, collect_unique_key_last), stream); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. 
+ if (thrust::distance(collect_unique_key_first, collect_unique_key_last)) { + kv_map_ptr->find(collect_unique_key_first, + collect_unique_key_last, + get_dataframe_buffer_begin(value_buffer)); + } return value_buffer; } diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp index 98e850abbf0..d8c476760f0 100644 --- a/cpp/include/utilities/cython.hpp +++ b/cpp/include/utilities/cython.hpp @@ -19,6 +19,7 @@ #include #include #include +#include namespace cugraph { namespace cython { @@ -92,7 +93,7 @@ struct graph_container_t { void* weights; void* vertex_partition_offsets; - size_t num_partition_edges; + size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; numberTypeEnum vertexType; @@ -102,7 +103,6 @@ struct graph_container_t { bool is_multi_gpu; bool sorted_by_degree; bool do_expensive_check; - bool hypergraph_partitioned; int row_comm_size; int col_comm_size; int row_comm_rank; @@ -146,7 +146,7 @@ struct cy_multi_edgelists_t { // replacement for std::tuple<,,>, since std::tuple is not // supported in cython // -template +template struct major_minor_weights_t { explicit major_minor_weights_t(raft::handle_t const& handle) : shuffled_major_vertices_(0, handle.get_stream()), @@ -154,12 +154,15 @@ struct major_minor_weights_t { shuffled_weights_(0, handle.get_stream()) { } + rmm::device_uvector& get_major(void) { return shuffled_major_vertices_; } rmm::device_uvector& get_minor(void) { return shuffled_minor_vertices_; } rmm::device_uvector& get_weights(void) { return shuffled_weights_; } + std::vector& get_edge_counts(void) { return edge_counts_; } + std::pair, size_t> get_major_wrap( void) // const: triggers errors in Cython autogen-ed C++ { @@ -179,10 +182,29 @@ struct major_minor_weights_t { sizeof(weight_t)); } + std::unique_ptr> get_edge_counts_wrap(void) // const + { + return std::make_unique>(edge_counts_); + } + private: rmm::device_uvector shuffled_major_vertices_; rmm::device_uvector shuffled_minor_vertices_; rmm::device_uvector shuffled_weights_; + std::vector edge_counts_{}; +}; + +// aggregate for random_walks() return type +// to be exposed to cython: +// +struct random_walk_ret_t { + size_t coalesced_sz_v_; + size_t coalesced_sz_w_; + size_t num_paths_; + size_t max_depth_; + std::unique_ptr d_coalesced_v_; + std::unique_ptr d_coalesced_w_; + std::unique_ptr d_sizes_; }; // wrapper for renumber_edgelist() return @@ -339,6 +361,9 @@ struct renum_quad_t { // The number of vertices and edges respectively in the graph represented by // the above arrays. // +// bool is_weighted +// true if the resulting graph object should store edge weights +// // bool transposed // true if the resulting graph object should store a transposed adjacency // matrix @@ -355,10 +380,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu); @@ -442,18 +468,27 @@ std::unique_ptr call_egonet(raft::handle_t const& handle, vertex_t* source_vertex, vertex_t n_subgraphs, vertex_t radius); +// wrapper for random_walks. 
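+// (a sketch of the intended layout, inferred from random_walk_ret_t above: d_coalesced_v_
+// holds the vertices of all paths coalesced into one device buffer of num_paths_ x max_depth_
+// entries, d_coalesced_w_ the corresponding edge weights, and d_sizes_ the actual length of
+// each path)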
+// +template +std::enable_if_t::value, + std::unique_ptr> +call_random_walks(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t const* ptr_start_set, + edge_t num_paths, + edge_t max_depth); // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned); // = false + edge_t num_edgelist_edges); // Wrapper for calling renumber_edeglist() inplace: // @@ -462,8 +497,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/include/utilities/dataframe_buffer.cuh b/cpp/include/utilities/dataframe_buffer.cuh index 06352b8e217..b0e9c1ebfec 100644 --- a/cpp/include/utilities/dataframe_buffer.cuh +++ b/cpp/include/utilities/dataframe_buffer.cuh @@ -47,21 +47,34 @@ auto allocate_dataframe_buffer_tuple_impl(std::index_sequence, } template -void resize_dataframe_buffer_tuple_element_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ - std::get(buffer).resize(new_buffer_size, stream); - resize_dataframe_buffer_tuple_element_impl( - buffer, new_buffer_size, stream); -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) + { + std::get(buffer).resize(new_buffer_size, stream); + resize_dataframe_buffer_tuple_iterator_element_impl().run( + buffer, new_buffer_size, stream); + } +}; template -void resize_dataframe_buffer_tuple_impl(BufferType& buffer, - size_t new_buffer_size, - cudaStream_t stream) -{ -} +struct resize_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream) {} +}; + +template +struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, cudaStream_t stream) + { + std::get(buffer).shrink_to_fit(stream); + shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, stream); + } +}; + +template +struct shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl { + void run(BufferType& buffer, cudaStream_t stream) {} +}; template auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer) @@ -108,8 +121,30 @@ template ::value; - detail::resize_dataframe_buffer_tuple_impl( - buffer, new_buffer_size, stream); + detail:: + resize_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, new_buffer_size, stream); +} + +template ::value>* = nullptr> +void shrink_to_fit_dataframe_buffer(BufferType& buffer, cudaStream_t stream) +{ + buffer.shrink_to_fit(stream); +} + +template ::value>* = nullptr> +void shrink_to_fit_dataframe_buffer(BufferType& buffer, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + detail::shrink_to_fit_dataframe_buffer_tuple_iterator_element_impl() + .run(buffer, stream); } template struct device_sendrecv_tuple_iterator_element_impl { void run(raft::comms::comms_t const& comm, InputIterator input_first, - size_t count, + size_t tx_count, int dst, - int base_tag, - 
raft::comms::request_t* requests) const + OutputIterator output_first, + size_t rx_count, + int src, + cudaStream_t stream) const { } }; @@ -413,6 +415,66 @@ struct device_bcast_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allreduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream); +} + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + device_allreduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + stream); + device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); + } +}; + +template +struct device_allreduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) const + { + } +}; + template std::enable_if_t::value, void> device_reduce_impl(raft::comms::comms_t const& comm, @@ -460,7 +522,7 @@ struct device_reduce_tuple_iterator_element_impl { op, root, stream); - device_reduce_tuple_iterator_element_impl( + device_reduce_tuple_iterator_element_impl().run( comm, input_first, output_first, count, op, root, stream); } }; @@ -854,6 +916,46 @@ device_bcast(raft::comms::comms_t const& comm, comm, input_first, output_first, count, root, stream); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + detail::device_allreduce_impl(comm, input_first, output_first, count, op, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allreduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allreduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, stream); +} + template std::enable_if_t< std::is_arithmetic::value_type>::value, @@ -889,9 +991,11 @@ device_reduce(raft::comms::comms_t const& comm, size_t constexpr tuple_size = thrust::tuple_size::value_type>::value; - detail:: - device_reduce_tuple_iterator_element_impl( - comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, op, root, stream); } template diff --git 
a/cpp/include/utilities/graph_traits.hpp b/cpp/include/utilities/graph_traits.hpp new file mode 100644 index 00000000000..363a13190be --- /dev/null +++ b/cpp/include/utilities/graph_traits.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <type_traits> + +namespace cugraph { +namespace experimental { + +// primary template: +// +template <typename T, typename... Ts> +struct is_one_of; // purposely empty + +// partial specializations: +// +template <typename T, typename Head, typename... Tail> +struct is_one_of<T, Head, Tail...> { + static constexpr bool value = std::is_same<T, Head>::value || is_one_of<T, Tail...>::value; +}; + +template <typename T> +struct is_one_of<T> { + static constexpr bool value = false; +}; + +// meta-function that constrains +// vertex_t and edge_t template param candidates: +// +template <typename vertex_t, typename edge_t> +struct is_vertex_edge_combo { + static constexpr bool value = is_one_of<vertex_t, int32_t, int64_t>::value && + is_one_of<edge_t, int32_t, int64_t>::value && + (sizeof(vertex_t) <= sizeof(edge_t)); +}; + +// meta-function that constrains +// all 3 template param candidates: +// +template <typename vertex_t, typename edge_t, typename weight_t> +struct is_candidate { + static constexpr bool value = + is_vertex_edge_combo<vertex_t, edge_t>::value && is_one_of<weight_t, float, double>::value; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/utilities/path_retrieval.hpp b/cpp/include/utilities/path_retrieval.hpp new file mode 100644 index 00000000000..e626d6af1ab --- /dev/null +++ b/cpp/include/utilities/path_retrieval.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include <raft/handle.hpp> + +namespace cugraph { + +/** + * @brief Takes the results of a BFS or SSSP function call and sums the given + * weights along the path to the starting vertex. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. Must have at least one worker stream. + * @param vertices Pointer to vertex ids. + * @param preds Pointer to predecessors. + * @param info_weights Secondary weights along the edge from predecessor to vertex. + * @param out Contains for each index the sum of weights along the path unfolding. + * @param stop_vertex Vertex value that marks the end of a path (the predecessor stored for a + * path's starting vertex, e.g. the invalid-vertex sentinel). + * @param num_vertices Number of vertices.
+ **/ +template +void get_traversed_cost(raft::handle_t const &handle, + vertex_t const *vertices, + vertex_t const *preds, + weight_t const *info_weights, + weight_t *out, + vertex_t stop_vertex, + vertex_t num_vertices); +} // namespace cugraph diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh index 8c363c9a346..b318009d9bf 100644 --- a/cpp/include/utilities/shuffle_comm.cuh +++ b/cpp/include/utilities/shuffle_comm.cuh @@ -22,6 +22,12 @@ #include #include +#include +#include +#include +#include +#include + #include #include #include @@ -31,89 +37,6 @@ namespace experimental { namespace detail { -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - ValueIterator tx_value_first /* [INOUT */, - ValueIterator tx_value_last /* [INOUT */, - ValueToGPUIdOp value_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort(rmm::exec_policy(stream)->on(stream), - tx_value_first, - tx_value_last, - [value_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return value_to_gpu_id_op(lhs) < value_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_value_first, tx_value_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - -template -rmm::device_uvector sort_and_count(raft::comms::comms_t const &comm, - VertexIterator tx_key_first /* [INOUT */, - VertexIterator tx_key_last /* [INOUT */, - ValueIterator tx_value_first /* [INOUT */, - KeyToGPUIdOp key_to_gpu_id_op, - cudaStream_t stream) -{ - auto const comm_size = comm.get_size(); - - thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), - tx_key_first, - tx_key_last, - tx_value_first, - [key_to_gpu_id_op] __device__(auto lhs, auto rhs) { - return key_to_gpu_id_op(lhs) < key_to_gpu_id_op(rhs); - }); - - auto gpu_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); }); - rmm::device_uvector d_tx_dst_ranks(comm_size, stream); - rmm::device_uvector d_tx_value_counts(comm_size, stream); - auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - gpu_id_first, - gpu_id_first + thrust::distance(tx_key_first, tx_key_last), - thrust::make_constant_iterator(size_t{1}), - d_tx_dst_ranks.begin(), - d_tx_value_counts.begin()); - if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) { - rmm::device_uvector d_counts(comm_size, stream); - thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); - thrust::scatter(rmm::exec_policy(stream)->on(stream), - d_tx_value_counts.begin(), - thrust::get<1>(last), - 
d_tx_dst_ranks.begin(), - d_counts.begin()); - d_tx_value_counts = std::move(d_counts); - } - - return d_tx_value_counts; -} - // inline to suppress a complaint about ODR violation inline std::tuple, std::vector, @@ -187,6 +110,86 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm, } // namespace detail +template +rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [INOUT */, + ValueIterator tx_value_last /* [INOUT */, + ValueToGPUIdOp value_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort(rmm::exec_policy(stream)->on(stream), + tx_value_first, + tx_value_last, + [value_to_group_id_op] __device__(auto lhs, auto rhs) { + return value_to_group_id_op(lhs) < value_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_value_first, + [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_value_first, tx_value_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + +template +rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [INOUT */, + VertexIterator tx_key_last /* [INOUT */, + ValueIterator tx_value_first /* [INOUT */, + KeyToGPUIdOp key_to_group_id_op, + int num_groups, + cudaStream_t stream) +{ + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + tx_key_first, + tx_key_last, + tx_value_first, + [key_to_group_id_op] __device__(auto lhs, auto rhs) { + return key_to_group_id_op(lhs) < key_to_group_id_op(rhs); + }); + + auto group_id_first = thrust::make_transform_iterator( + tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + rmm::device_uvector d_tx_dst_ranks(num_groups, stream); + rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream); + auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + group_id_first, + group_id_first + thrust::distance(tx_key_first, tx_key_last), + thrust::make_constant_iterator(size_t{1}), + d_tx_dst_ranks.begin(), + d_tx_value_counts.begin()); + if (thrust::distance(d_tx_dst_ranks.begin(), thrust::get<0>(last)) < num_groups) { + rmm::device_uvector d_counts(num_groups, stream); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0}); + thrust::scatter(rmm::exec_policy(stream)->on(stream), + d_tx_value_counts.begin(), + thrust::get<1>(last), + d_tx_dst_ranks.begin(), + d_counts.begin()); + d_tx_value_counts = std::move(d_counts); + } + + return d_tx_value_counts; +} + template auto shuffle_values(raft::comms::comms_t const &comm, TxValueIterator tx_value_first, @@ -250,7 +253,7 @@ auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm, auto const comm_size = comm.get_size(); auto d_tx_value_counts = - 
detail::sort_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream); + groupby_and_count(tx_value_first, tx_value_last, value_to_gpu_id_op, comm.get_size(), stream); std::vector<size_t> tx_counts{}; std::vector<size_t> tx_offsets{}; @@ -301,8 +304,8 @@ auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm, { auto const comm_size = comm.get_size(); - auto d_tx_value_counts = detail::sort_and_count( - comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream); + auto d_tx_value_counts = groupby_and_count( + tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, comm.get_size(), stream); std::vector<size_t> tx_counts{}; std::vector<size_t> tx_offsets{}; diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/utilities/thrust_tuple_utils.cuh index 01843a583eb..d5ce6ff1a29 100644 --- a/cpp/include/utilities/thrust_tuple_utils.cuh +++ b/cpp/include/utilities/thrust_tuple_utils.cuh @@ -61,13 +61,6 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array<size_t, thrust::tuple_size<TupleType>::value>& arr) const {} }; -template <typename TupleType, size_t... Is> -__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, - std::index_sequence<Is...>) -{ - return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); -} - template <typename TupleType, size_t I, size_t N> struct plus_thrust_tuple_impl { __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const @@ -200,16 +193,6 @@ struct compute_thrust_tuple_element_sizes { } }; -template <typename TupleType> -struct remove_first_thrust_tuple_element { - __device__ constexpr auto operator()(TupleType const& tuple) const - { - size_t constexpr tuple_size = thrust::tuple_size<TupleType>::value; - return detail::remove_first_thrust_tuple_element_impl( - tuple, std::make_index_sequence<tuple_size - 1>()); - } -}; - template <typename TupleType> struct plus_thrust_tuple { __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, diff --git a/cpp/src/centrality/README.md b/cpp/src/centrality/README.md new file mode 100644 index 00000000000..db7838fb0cc --- /dev/null +++ b/cpp/src/centrality/README.md @@ -0,0 +1,81 @@ +# Centrality algorithms +cuGraph Pagerank is implemented using our graph primitive library. + +## Pagerank + +The unit test code is the best place to search for examples on calling pagerank. + + * [SG Implementation](../../tests/experimental/pagerank_test.cpp) + * [MG Implementation](../../tests/pagerank/mg_pagerank_test.cpp) + +## Simple pagerank + +The example assumes that you create an SG or MG graph somehow. The caller must create the pageranks vector in device memory and pass the raw pointer to that vector into the pagerank function. + +```cpp +#include <algorithms.hpp> +... +using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +using result_t = weight_t; // could specify float or double also +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow + +result_t constexpr alpha{0.85}; +result_t constexpr epsilon{1e-6}; + +rmm::device_uvector<result_t> pageranks_v(graph_view.get_number_of_vertices(), handle.get_stream()); + +// pagerank optionally supports three additional parameters: +// max_iterations - maximum number of iterations, if pagerank doesn't converge by +// then we abort +// has_initial_guess - if true, values in the pagerank array when the call is initiated +// will be used as the initial pagerank values. These values will +// be normalized before use.
If false (the default), the values +// in the pagerank array will be set to 1/num_vertices before +// starting the computation. +// do_expensive_check - perform extensive validation of the input data before +// executing the algorithm. Off by default. Note: turning this on +// is expensive +cugraph::experimental::pagerank(handle, graph_view, nullptr, nullptr, nullptr, vertex_t{0}, + pageranks_v.data(), alpha, epsilon); +``` + +## Personalized Pagerank + +The example assumes that you create an SG or MG graph somehow. The caller must create the pageranks vector in device memory and pass the raw pointer to that vector into the pagerank function. Additionally, the caller must create personalization_vertices and personalization_values vectors in device memory, populate them, and pass in the raw pointers to those vectors. + +```cpp +#include <algorithms.hpp> +... +using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +using result_t = weight_t; // could specify float or double also +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow +vertex_t number_of_personalization_vertices; // Provided by caller + +result_t constexpr alpha{0.85}; +result_t constexpr epsilon{1e-6}; + +rmm::device_uvector<result_t> pageranks_v(graph_view.get_number_of_vertices(), handle.get_stream()); +rmm::device_uvector<vertex_t> personalization_vertices(number_of_personalization_vertices, handle.get_stream()); +rmm::device_uvector<result_t> personalization_values(number_of_personalization_vertices, handle.get_stream()); + +// Populate personalization_vertices, personalization_values with user provided data + +// pagerank optionally supports three additional parameters: +// max_iterations - maximum number of iterations, if pagerank doesn't converge by +// then we abort +// has_initial_guess - if true, values in the pagerank array when the call is initiated +// will be used as the initial pagerank values. These values will +// be normalized before use. If false (the default), the values +// in the pagerank array will be set to 1/num_vertices before +// starting the computation. +// do_expensive_check - perform extensive validation of the input data before +// executing the algorithm. Off by default. Note: turning this on +// is expensive +cugraph::experimental::pagerank(handle, graph_view, nullptr, personalization_vertices.data(), + personalization_values.data(), number_of_personalization_vertices, + pageranks_v.data(), alpha, epsilon); +``` diff --git a/cpp/src/community/README.md b/cpp/src/community/README.md new file mode 100644 index 00000000000..4bff0a6e77e --- /dev/null +++ b/cpp/src/community/README.md @@ -0,0 +1,79 @@ +# Louvain and Related Clustering Algorithms +cuGraph contains a GPU implementation of the Louvain algorithm and several related clustering algorithms (Leiden and ECG). + +## Louvain + +The Louvain implementation is designed to assign clusters attempting to optimize modularity.
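+Here modularity is the standard measure: for total edge weight m, edge weights A_ij, weighted vertex degrees k_i, and cluster assignments c_i,
+
+```
+Q = (1 / 2m) * sum over all pairs (i, j) of [ A_ij - (k_i * k_j) / 2m ] * delta(c_i, c_j)
+```
+
+where delta(c_i, c_j) is 1 when vertices i and j are assigned to the same cluster and 0 otherwise; the resolution parameter accepted by the API (see below) scales the (k_i * k_j) / 2m term.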
The algorithm is derived from the serial implementation described in the following paper: + + * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of community hierarchies in large networks, J Stat Mech P10008 (2008), http://arxiv.org/abs/0803.0476 + +It leverages some parallelism ideas from the following paper: + * Hao Lu, Mahantesh Halappanavar, Ananth Kalyanaraman: Parallel heuristics for scalable community detection, Elsevier Parallel Computing (2015), https://www.sciencedirect.com/science/article/pii/S0167819115000472 + + +The challenge in parallelizing Louvain lies in the primary loop, which visits the vertices serially. For each vertex v, the change in modularity is computed for moving the vertex from its currently assigned cluster to each of the clusters to which v's neighbors are assigned. The largest positive delta modularity is used to select a new cluster (if there are no positive delta modularities then the vertex is not moved). If the vertex v is moved to a new cluster then the statistics of the vertex v's old cluster and new cluster change. This change in cluster statistics may affect the delta modularity computations of all vertices that follow vertex v in the serial iteration, creating a dependency between the different iterations of the loop. + +In order to make efficient use of the GPU parallelism, the cuGraph implementation computes the delta modularity for *all* vertex/neighbor pairs using the *current* vertex assignment. Decisions on moving vertices are then made based upon these delta modularities. This will potentially make choices that the serial version would not make. In order to minimize some of the negative effects of this (as described in the Lu paper), the cuGraph implementation uses an Up/Down technique. In even-numbered iterations a vertex can only move from cluster i to cluster j if i > j; in odd-numbered iterations a vertex can only move from cluster i to cluster j if i < j. This prevents two vertices from swapping clusters in the same iteration of the loop. We have had great success in converging on high modularity clustering using this technique. + +## Calling Louvain + +The unit test code is the best place to search for examples on calling louvain. + + * [SG Implementation](../../tests/community/louvain_test.cpp) + * [MG Implementation](../../tests/community/mg_louvain_test.cpp) + +The API itself is very simple. There are two variations: + * Return a flat clustering + * Return a Dendrogram + +### Return a flat clustering + +The example assumes that you create an SG or MG graph somehow. The caller must create the clustering vector in device memory and pass the raw pointer to that vector into the louvain function. + +```cpp +#include <algorithms.hpp> +... +using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow + +size_t level; +weight_t modularity; + +rmm::device_uvector<vertex_t> clustering_v(graph_view.get_number_of_vertices(), handle.get_stream()); + +// louvain optionally supports two additional parameters: +// max_level - maximum level of the Dendrogram +// resolution - constant in the modularity computation +std::tie(level, modularity) = cugraph::louvain(handle, graph_view, clustering_v.data()); +``` + +### Return a Dendrogram + +The Dendrogram represents the levels of hierarchical clustering that the Louvain algorithm computes.
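+Conceptually, level 0 of the Dendrogram assigns each vertex to a cluster, level 1 assigns each level-0 cluster to a coarser cluster, and so on; flattening composes these assignments so that each vertex receives its final (coarsest) cluster id.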
There is a separate function that will flatten the clustering into the same result as above. Returning the Dendrogram, however, provides a finer level of detail on the intermediate results, which can be helpful in more fully understanding the data. + +```cpp +#include <algorithms.hpp> +... +using vertex_t = int32_t; // or int64_t, whichever is appropriate +using weight_t = float; // or double, whichever is appropriate +raft::handle_t handle; // Must be configured if MG +auto graph_view = graph.view(); // assumes you have created a graph somehow + +cugraph::Dendrogram<vertex_t> dendrogram; +weight_t modularity; + +// louvain optionally supports two additional parameters: +// max_level - maximum level of the Dendrogram +// resolution - constant in the modularity computation +std::tie(dendrogram, modularity) = cugraph::louvain(handle, graph_view); + +// This will get the equivalent result to the earlier example +rmm::device_uvector<vertex_t> clustering_v(graph_view.get_number_of_vertices(), handle.get_stream()); +cugraph::flatten_dendrogram(handle, graph_view, dendrogram, clustering_v.data()); +``` + +## Leiden + +## ECG diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 994204ecd32..a176dfbd1c8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(num_vertices); + this->dendrogram_->add_level(0, num_vertices, this->stream_); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/egonet.cu b/cpp/src/community/egonet.cu index 067d27f9a92..85ee327edb2 100644 --- a/cpp/src/community/egonet.cu +++ b/cpp/src/community/egonet.cu @@ -79,7 +79,12 @@ extract( // Streams will allocate concurrently later std::vector<rmm::device_uvector<vertex_t>> reached{}; - reached.reserve(handle.get_num_internal_streams()); + reached.reserve(n_subgraphs); + for (vertex_t i = 0; i < n_subgraphs; i++) { + // Allocations and operations are attached to the worker stream + rmm::device_uvector<vertex_t> local_reach(v, handle.get_internal_stream_view(i)); + reached.push_back(std::move(local_reach)); + } // h_source_vertex[i] is used by other streams in the for loop user_stream_view.synchronize(); @@ -87,15 +92,12 @@ extract( HighResTimer hr_timer; hr_timer.start("ego_neighbors"); #endif + for (vertex_t i = 0; i < n_subgraphs; i++) { // get light handle from worker pool raft::handle_t light_handle(handle, i); auto worker_stream_view = light_handle.get_stream_view(); - // Allocations and operations are attached to the worker stream - rmm::device_uvector<vertex_t> local_reach(v, worker_stream_view); - reached.push_back(std::move(local_reach)); - // BFS with cutoff // consider adding a device API to BFS (i.e.
+
+## Leiden
+
+## ECG
diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu
index 994204ecd32..a176dfbd1c8 100644
--- a/cpp/src/community/ecg.cu
+++ b/cpp/src/community/ecg.cu
@@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain {
 
   void initialize_dendrogram_level(vertex_t num_vertices) override
   {
-    this->dendrogram_->add_level(num_vertices);
+    this->dendrogram_->add_level(0, num_vertices, this->stream_);
 
     get_permutation_vector(
       num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_);
diff --git a/cpp/src/community/egonet.cu b/cpp/src/community/egonet.cu
index 067d27f9a92..85ee327edb2 100644
--- a/cpp/src/community/egonet.cu
+++ b/cpp/src/community/egonet.cu
@@ -79,7 +79,12 @@ extract(
 
   // Streams will allocate concurrently later
   std::vector<rmm::device_uvector<vertex_t>> reached{};
-  reached.reserve(handle.get_num_internal_streams());
+  reached.reserve(n_subgraphs);
+  for (vertex_t i = 0; i < n_subgraphs; i++) {
+    // Allocations and operations are attached to the worker stream
+    rmm::device_uvector<vertex_t> local_reach(v, handle.get_internal_stream_view(i));
+    reached.push_back(std::move(local_reach));
+  }
 
   // h_source_vertex[i] is used by other streams in the for loop
   user_stream_view.synchronize();
@@ -87,15 +92,12 @@ extract(
   HighResTimer hr_timer;
   hr_timer.start("ego_neighbors");
 #endif
+
   for (vertex_t i = 0; i < n_subgraphs; i++) {
     // get light handle from worker pool
     raft::handle_t light_handle(handle, i);
     auto worker_stream_view = light_handle.get_stream_view();
 
-    // Allocations and operations are attached to the worker stream
-    rmm::device_uvector<vertex_t> local_reach(v, worker_stream_view);
-    reached.push_back(std::move(local_reach));
-
     // BFS with cutoff
     // consider adding a device API to BFS (ie. accept source on the device)
     rmm::device_uvector<vertex_t> predecessors(v, worker_stream_view);  // not used
@@ -151,8 +153,7 @@ extract(
   // Construct the neighboors list concurrently
   for (vertex_t i = 0; i < n_subgraphs; i++) {
-    raft::handle_t light_handle(handle, i);
-    auto worker_stream_view = light_handle.get_stream_view();
+    auto worker_stream_view = handle.get_internal_stream_view(i);
 
     thrust::copy(rmm::exec_policy(worker_stream_view),
                  reached[i].begin(),
                  reached[i].end(),
@@ -265,4 +266,4 @@ extract_ego(raft::handle_t const &, int64_t, int64_t);
 
 } // namespace experimental
-} // namespace cugraph
\ No newline at end of file
+} // namespace cugraph
diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh
index 892fe2d1c51..6d455a68192 100644
--- a/cpp/src/community/flatten_dendrogram.cuh
+++ b/cpp/src/community/flatten_dendrogram.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include 
+#include 
 
 #include 
 #include 
@@ -31,23 +31,28 @@ void partition_at_level(raft::handle_t const &handle,
                         size_t level)
 {
   vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0);
+  rmm::device_uvector<vertex_t> local_vertex_ids_v(local_num_verts, handle.get_stream());
 
-  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-               d_vertex_ids,
-               d_vertex_ids + local_num_verts,
-               d_partition);
-
-  std::for_each(thrust::make_counting_iterator<size_t>(0),
-                thrust::make_counting_iterator<size_t>(level),
-                [&handle, &dendrogram, d_vertex_ids, &d_partition, local_num_verts](size_t l) {
-                  cugraph::experimental::relabel<vertex_t, multi_gpu>(
-                    handle,
-                    std::tuple<vertex_t const *, vertex_t const *>(
-                      d_vertex_ids, dendrogram.get_level_ptr_nocheck(l)),
-                    dendrogram.get_level_size_nocheck(l),
-                    d_partition,
-                    local_num_verts);
-                });
+  raft::copy(d_partition, d_vertex_ids, local_num_verts, handle.get_stream());
+
+  std::for_each(
+    thrust::make_counting_iterator<size_t>(0),
+    thrust::make_counting_iterator<size_t>(level),
+    [&handle, &dendrogram, &local_vertex_ids_v, d_vertex_ids, &d_partition, local_num_verts](
+      size_t l) {
+      thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                       local_vertex_ids_v.begin(),
+                       local_vertex_ids_v.begin() + dendrogram.get_level_size_nocheck(l),
+                       dendrogram.get_level_first_index_nocheck(l));
+
+      cugraph::experimental::relabel<vertex_t, multi_gpu>(
+        handle,
+        std::tuple<vertex_t const *, vertex_t const *>(local_vertex_ids_v.data(),
+                                                       dendrogram.get_level_ptr_nocheck(l)),
+        dendrogram.get_level_size_nocheck(l),
+        d_partition,
+        local_num_verts);
+    });
 }
 
 } // namespace cugraph
diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh
index 141f8beac40..4ffb7c20eb2 100644
--- a/cpp/src/community/leiden.cuh
+++ b/cpp/src/community/leiden.cuh
@@ -132,7 +132,7 @@ class Leiden : public Louvain {
     //
     //  Initialize every cluster to reference each vertex to itself
     //
-    this->dendrogram_->add_level(current_graph.number_of_vertices);
+    this->dendrogram_->add_level(0, current_graph.number_of_vertices, this->stream_);
 
     thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_),
                      this->dendrogram_->current_level_begin(),
diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu
index a851777ad93..2affcf29805 100644
--- a/cpp/src/community/louvain.cu
+++ b/cpp/src/community/louvain.cu
@@ -26,50 +26,28 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-std::pair<size_t, weight_t> louvain(raft::handle_t const &handle,
-                                    GraphCSRView<vertex_t, edge_t, weight_t> const &graph_view,
-                                    vertex_t *clustering,
-                                    size_t max_level,
-                                    weight_t resolution)
+std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> louvain(
+  raft::handle_t const &handle,
+  GraphCSRView<vertex_t, edge_t, weight_t> const &graph_view,
+  size_t max_level,
+  weight_t resolution)
 {
CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); Louvain> runner(handle, graph_view); weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - vertex_t{0}); - - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); - - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); + return std::make_pair(runner.move_dendrogram(), wt); } template -std::pair louvain( +std::pair>, weight_t> louvain( raft::handle_t const &handle, experimental::graph_view_t const &graph_view, - vertex_t *clustering, size_t max_level, weight_t resolution) { - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); - // "FIXME": remove this check and the guards below // // Disable louvain(experimental::graph_view_t,...) @@ -87,40 +65,153 @@ std::pair louvain( weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), - handle.get_stream()); + return std::make_pair(runner.move_dendrogram(), wt); + } +} - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - graph_view.get_local_vertex_first()); +template +void flatten_dendrogram(raft::handle_t const &handle, + GraphCSRView const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + vertex_t{0}); - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); - } + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); +} + +template +void flatten_dendrogram( + raft::handle_t const &handle, + experimental::graph_view_t const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), + handle.get_stream()); + + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + graph_view.get_local_vertex_first()); + + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } } // namespace detail -template -std::pair louvain(raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, - size_t max_level, - typename graph_t::weight_type resolution) +template +std::pair>, + typename graph_view_t::weight_type> +louvain(raft::handle_t const &handle, + graph_view_t const &graph_view, + size_t max_level, + typename graph_view_t::weight_type resolution) +{ + return detail::louvain(handle, 
graph_view, max_level, resolution); +} + +template +void flatten_dendrogram(raft::handle_t const &handle, + graph_view_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_view_t::vertex_type *clustering) { + detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); +} + +template +std::pair louvain( + raft::handle_t const &handle, + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, + size_t max_level, + typename graph_view_t::weight_type resolution) +{ + using vertex_t = typename graph_view_t::vertex_type; + using weight_t = typename graph_view_t::weight_type; + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); - return detail::louvain(handle, graph, clustering, max_level, resolution); + std::unique_ptr> dendrogram; + weight_t modularity; + + std::tie(dendrogram, modularity) = louvain(handle, graph_view, max_level, resolution); + + flatten_dendrogram(handle, graph_view, *dendrogram, clustering); + + return std::make_pair(dendrogram->num_levels(), modularity); } // Explicit template instantations +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); + template std::pair louvain( raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); template std::pair louvain(raft::handle_t const &, diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index e28f0f1746d..e3569d4c850 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -138,9 +138,11 @@ class Louvain { return Q; } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + Dendrogram &get_dendrogram() { return *dendrogram_; } + + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t max_level, weight_t resolution) { @@ -208,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - 
dendrogram_->add_level(num_vertices); + dendrogram_->add_level(0, num_vertices, stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/components/connectivity.cu b/cpp/src/components/connectivity.cu index f4c7bf1d35c..09412160b37 100644 --- a/cpp/src/components/connectivity.cu +++ b/cpp/src/components/connectivity.cu @@ -78,7 +78,7 @@ std::enable_if_t::value> connected_components_impl( stream); } else { SCC_Data sccd(nrows, graph.offsets, graph.indices); - sccd.run_scc(labels); + auto num_iters = sccd.run_scc(labels); } } } // namespace detail diff --git a/cpp/src/components/scc_matrix.cuh b/cpp/src/components/scc_matrix.cuh index 801f1fe0fad..c7f4506b74e 100644 --- a/cpp/src/components/scc_matrix.cuh +++ b/cpp/src/components/scc_matrix.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,12 +71,13 @@ struct SCC_Data { p_d_r_o_(p_d_r_o), p_d_c_i_(p_d_c_i), d_C(nrows * nrows, 0), - d_Cprev(nrows * nrows, 0) + d_Cprev(nrows * nrows, 0), + p_d_C_(d_C.data().get()) { init(); } - const thrust::device_vector& get_C(void) const { return d_C; } + ByteT const* get_Cptr(void) const { return p_d_C_; } size_t nrows(void) const { return nrows_; } @@ -100,13 +101,12 @@ struct SCC_Data { void get_labels(IndexT* d_labels) const { - auto* p_d_C = d_C.data().get(); - size_t n = nrows_; // for lambda capture, since I cannot capture `this` (host), or `nrows_` + size_t n = nrows_; // for lambda capture, since I cannot capture `this` (host), or `nrows_` thrust::transform(thrust::device, thrust::make_counting_iterator(0), thrust::make_counting_iterator(nrows_), d_labels, - [n, p_d_C] __device__(IndexT k) { + [n, p_d_C = p_d_C_] __device__(IndexT k) { auto begin = p_d_C + k * n; auto end = begin + n; ByteT one{1}; @@ -124,7 +124,6 @@ struct SCC_Data { size_t nrows = nrows_; size_t count = 0; - ByteT* p_d_C = d_C.data().get(); ByteT* p_d_Cprev = get_Cprev().data().get(); size_t n2 = nrows * nrows; @@ -136,57 +135,60 @@ struct SCC_Data { do { flag.set(0); - thrust::for_each(thrust::device, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(n2), - [nrows, p_d_C, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__(size_t indx) { - ByteT one{1}; - - auto i = indx / nrows; - auto j = indx % nrows; - - if ((i == j) || (p_d_Cprev[indx] == one)) - p_d_C[indx] = one; - else { - // this is where a hash-map could help: - // only need hashmap[(i,j)]={0,1} (`1` for "hit"); - // and only for new entries! - // already existent entries are covered by - // the `if`-branch above! - // Hence, hashmap[] can use limited space: - // M = max_l{number(new `1` entries)}, where - // l = #iterations in the do-loop! - // M ~ new `1` entries between A^k and A^{k+1}, - // k=1,2,... - // Might M actually be M ~ nnz(A) = |E| ?! - // Probably, because the primitive hash - //(via find_if) uses a search space of nnz(A) - // - // But, what if more than 1 entry pops-up in a row? - // Not an issue! Because the hash key is (i,j), and no - // more than one entry can exist in position (i,j)! - // - // And remember, we only need to store the new (i,j) keys - // that an iteration produces wrt to the previous iteration! 
- // - auto begin = p_d_ci + p_d_ro[i]; - auto end = p_d_ci + p_d_ro[i + 1]; - auto pos = thrust::find_if( - thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) { - return (p_d_Cprev[k * nrows + j] == one); - }); - - if (pos != end) p_d_C[indx] = one; - } - - if (p_d_C[indx] != p_d_Cprev[indx]) - *p_d_flag = 1; // race-condition: harmless, worst case many threads - // write the same value - }); + thrust::for_each( + thrust::device, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n2), + [nrows, p_d_C = p_d_C_, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__(size_t indx) { + ByteT one{1}; + + auto i = indx / nrows; + auto j = indx % nrows; + + if ((i == j) || (p_d_Cprev[indx] == one)) { + p_d_C[indx] = one; + } else { + // this ammounts to A (^,v) B + // (where A = adjacency matrix defined by (p_ro, p_ci), + // B := p_d_Cprev; (^,v) := (*,+) semiring); + // Here's why: + // (A (^,v) B)[i][j] := A[i][.] (^,v) B[j][.] + // (where X[i][.] := i-th row of X; + // X[.][j] := j-th column of X); + // which is: + // 1, iff A[i][.] and B[j][.] have a 1 in the same location, + // 0, otherwise; + // + // i.e., corresponfing entry in p_d_C is 1 + // if B[k][j] == 1 for any column k in A's i-th row; + // hence, for each column k of row A[i][.], + // which is the set: + // k \in {p_ci + p_ro[i], ..., p_ci + p_ro[i+1] - 1}, + // check if (B[k][j] == 1), + // i.e., p_d_Cprev[k*nrows + j]) == 1: + // + auto begin = p_d_ci + p_d_ro[i]; + auto end = p_d_ci + p_d_ro[i + 1]; + auto pos = thrust::find_if( + thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) { + return (p_d_Cprev[k * nrows + j] == one); + }); + + if (pos != end) p_d_C[indx] = one; + } + + if (p_d_C[indx] != p_d_Cprev[indx]) + *p_d_flag = 1; // race-condition: harmless, + // worst case many threads + // write the _same_ value + }); ++count; cudaDeviceSynchronize(); - std::swap(p_d_C, p_d_Cprev); + std::swap(p_d_C_, p_d_Cprev); // Note 1: this swap makes `p_d_Cprev` the + // most recently updated matrix pointer + // at the end of this loop + // (see `Note 2` why this matters); } while (flag.is_set()); // C & Ct: @@ -196,11 +198,13 @@ struct SCC_Data { thrust::for_each(thrust::device, thrust::make_counting_iterator(0), thrust::make_counting_iterator(n2), - [nrows, p_d_C, p_d_Cprev] __device__(size_t indx) { + [nrows, p_d_C = p_d_C_, p_d_Cprev] __device__(size_t indx) { auto i = indx / nrows; auto j = indx % nrows; auto tindx = j * nrows + i; + // Note 2: per Note 1, p_d_Cprev is latest: + // p_d_C[indx] = (p_d_Cprev[indx]) & (p_d_Cprev[tindx]); }); @@ -215,6 +219,9 @@ struct SCC_Data { const IndexT* p_d_c_i_; // column indices thrust::device_vector d_C; thrust::device_vector d_Cprev; + ByteT* p_d_C_{nullptr}; // holds the most recent update, + // which can have storage in any of d_C or d_Cprev, + // because the pointers get swapped! thrust::device_vector& get_Cprev(void) { return d_Cprev; } }; diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu index 7adfbd7fbd7..2a703c1c85e 100644 --- a/cpp/src/experimental/bfs.cu +++ b/cpp/src/experimental/bfs.cu @@ -90,14 +90,9 @@ void bfs(raft::handle_t const &handle, // 3. 
initialize BFS frontier - enum class Bucket { cur, num_buckets }; - std::vector bucket_sizes(static_cast(Bucket::num_buckets), - push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> - vertex_frontier(handle, bucket_sizes); + enum class Bucket { cur, next, num_buckets }; + VertexFrontier(Bucket::num_buckets)> + vertex_frontier(handle); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur)).insert(source_vertex); @@ -106,23 +101,18 @@ void bfs(raft::handle_t const &handle, // 4. BFS iteration vertex_t depth{0}; - auto cur_local_vertex_frontier_first = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).begin(); - auto cur_vertex_frontier_aggregate_size = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size(); while (true) { if (direction_optimizing) { CUGRAPH_FAIL("unimplemented."); } else { vertex_partition_device_t vertex_partition(push_graph_view); - auto cur_local_vertex_frontier_last = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).end(); update_frontier_v_push_if_out_nbr( handle, push_graph_view, - cur_local_vertex_frontier_first, - cur_local_vertex_frontier_last, + vertex_frontier, + static_cast(Bucket::cur), + std::vector{static_cast(Bucket::next)}, thrust::make_constant_iterator(0) /* dummy */, thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances] __device__( @@ -133,28 +123,24 @@ void bfs(raft::handle_t const &handle, *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst)); if (distance != invalid_distance) { push = false; } } - // FIXME: need to test this works properly if payload size is 0 (returns a tuple of size - // 1) return thrust::make_tuple(push, src); }, - reduce_op::any>(), + reduce_op::any(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), - vertex_frontier, [depth] __device__(auto v_val, auto pushed_val) { - auto idx = (v_val == invalid_distance) - ? static_cast(Bucket::cur) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + auto idx = (v_val == invalid_distance) ? 
static_cast(Bucket::next) + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, thrust::make_tuple(depth + 1, pushed_val)); }); - auto new_vertex_frontier_aggregate_size = - vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() - - cur_vertex_frontier_aggregate_size; - if (new_vertex_frontier_aggregate_size == 0) { break; } - - cur_local_vertex_frontier_first = cur_local_vertex_frontier_last; - cur_vertex_frontier_aggregate_size += new_vertex_frontier_aggregate_size; + vertex_frontier.get_bucket(static_cast(Bucket::cur)).clear(); + vertex_frontier.get_bucket(static_cast(Bucket::cur)).shrink_to_fit(); + vertex_frontier.swap_buckets(static_cast(Bucket::cur), + static_cast(Bucket::next)); + if (vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() == 0) { + break; + } } depth++; diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu index 0cd551b0d73..1eccbd23584 100644 --- a/cpp/src/experimental/coarsen_graph.cu +++ b/cpp/src/experimental/coarsen_graph.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -49,6 +50,7 @@ std:: weight_t const *compressed_sparse_weights, vertex_t major_first, vertex_t major_last, + bool is_weighted, cudaStream_t stream) { edge_t number_of_edges{0}; @@ -57,8 +59,7 @@ std:: CUDA_TRY(cudaStreamSynchronize(stream)); rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); - rmm::device_uvector edgelist_weights( - compressed_sparse_weights != nullptr ? number_of_edges : 0, stream); + rmm::device_uvector edgelist_weights(is_weighted ? number_of_edges : 0, stream); // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA @@ -77,7 +78,7 @@ std:: compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); - if (compressed_sparse_weights != nullptr) { + if (is_weighted) { thrust::copy(rmm::exec_policy(stream)->on(stream), compressed_sparse_weights, compressed_sparse_weights + number_of_edges, @@ -89,62 +90,62 @@ std:: std::move(edgelist_weights)); } -template -void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, - rmm::device_uvector &edgelist_weights /* [INOUT] */, - cudaStream_t stream) +template +edge_t groupby_e_and_coarsen_edgelist(vertex_t *edgelist_major_vertices /* [INOUT] */, + vertex_t *edgelist_minor_vertices /* [INOUT] */, + weight_t *edgelist_weights /* [INOUT] */, + edge_t number_of_edges, + bool is_weighted, + cudaStream_t stream) { - auto pair_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - size_t number_of_edges{0}; - if (edgelist_weights.size() > 0) { + if (is_weighted) { thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin()); + pair_first + number_of_edges, + edgelist_weights); - rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), - stream); + rmm::device_uvector tmp_edgelist_major_vertices(number_of_edges, stream); rmm::device_uvector 
tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( rmm::exec_policy(stream)->on(stream), pair_first, - pair_first + edgelist_major_vertices.size(), - edgelist_weights.begin(), + pair_first + number_of_edges, + edgelist_weights, thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), tmp_edgelist_minor_vertices.begin())), tmp_edgelist_weights.begin()); - number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + auto ret = + static_cast(thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it))); - edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); - edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); - edgelist_weights = std::move(tmp_edgelist_weights); + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin(), + tmp_edgelist_weights.begin())); + thrust::copy(rmm::exec_policy(stream)->on(stream), + edge_first, + edge_first + ret, + thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights))); + + return ret; } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), - pair_first, - pair_first + edgelist_major_vertices.size()); - number_of_edges = thrust::distance(pair_first, it); + thrust::sort(rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges); + return static_cast(thrust::distance( + pair_first, + thrust::unique( + rmm::exec_policy(stream)->on(stream), pair_first, pair_first + number_of_edges))); } - - edgelist_major_vertices.resize(number_of_edges, stream); - edgelist_minor_vertices.resize(number_of_edges, stream); - edgelist_weights.resize(number_of_edges, stream); - edgelist_major_vertices.shrink_to_fit(stream); - edgelist_minor_vertices.shrink_to_fit(stream); - edgelist_weights.shrink_to_fit(stream); } template std:: tuple, rmm::device_uvector, rmm::device_uvector> - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( edge_t const *compressed_sparse_offsets, vertex_t const *compressed_sparse_indices, weight_t const *compressed_sparse_weights, @@ -154,6 +155,7 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { // FIXME: it might be possible to directly create relabled & coarsened edgelist from the @@ -168,6 +170,7 @@ std:: compressed_sparse_weights, major_first, major_last, + is_weighted, stream); auto pair_first = thrust::make_zip_iterator( @@ -182,8 +185,21 @@ std:: p_minor_labels[thrust::get<1>(val) - minor_first]); }); - sort_and_coarsen_edgelist( - edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + auto number_of_edges = + groupby_e_and_coarsen_edgelist(edgelist_major_vertices.data(), + edgelist_minor_vertices.data(), + edgelist_weights.data(), + static_cast(edgelist_major_vertices.size()), + is_weighted, + stream); + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.shrink_to_fit(stream); + if (is_weighted) { + 
edgelist_weights.resize(number_of_edges, stream); + edgelist_weights.shrink_to_fit(stream); + } return std::make_tuple(std::move(edgelist_major_vertices), std::move(edgelist_minor_vertices), @@ -220,48 +236,66 @@ coarsen_graph( // currently, nothing to do } - // 1. locally construct coarsened edge list + // 1. construct coarsened edge list - // FIXME: we don't need adj_matrix_major_labels if we apply the same partitioning scheme - // regardless of hypergraph partitioning is applied or not - rmm::device_uvector adj_matrix_major_labels( - store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() - : graph_view.get_number_of_local_adj_matrix_partition_rows(), - handle.get_stream()); rmm::device_uvector adj_matrix_minor_labels( store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() : graph_view.get_number_of_local_adj_matrix_partition_cols(), handle.get_stream()); if (store_transposed) { - copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); } else { - copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data()); copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); } - rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream()); - rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream()); - rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream()); + std::vector> coarsened_edgelist_major_vertices{}; + std::vector> coarsened_edgelist_minor_vertices{}; + std::vector> coarsened_edgelist_weights{}; + coarsened_edgelist_major_vertices.reserve(graph_view.get_number_of_local_adj_matrix_partitions()); + coarsened_edgelist_minor_vertices.reserve(coarsened_edgelist_major_vertices.size()); + coarsened_edgelist_weights.reserve( + graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : size_t{0}); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + coarsened_edgelist_major_vertices.emplace_back(0, handle.get_stream()); + coarsened_edgelist_minor_vertices.emplace_back(0, handle.get_stream()); + if (graph_view.is_weighted()) { + coarsened_edgelist_weights.emplace_back(0, handle.get_stream()); + } + } // FIXME: we may compare performance/memory footprint with the hash_based approach especially when // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst // case). We may be able to limit the memory requirement close to the final coarsened edgelist // with the hash based approach. for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - // get edge list + // 1-1. locally construct coarsened edge list + + rmm::device_uvector major_labels( + store_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols(i) + : graph_view.get_number_of_local_adj_matrix_partition_rows(i), + handle.get_stream()); + // FIXME: this copy is unnecessary, beter fix RAFT comm's bcast to take const iterators for + // input + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + major_labels.size(), + major_labels.begin()); + device_bcast(col_comm, + major_labels.data(), + major_labels.data(), + major_labels.size(), + static_cast(i), + handle.get_stream()); rmm::device_uvector edgelist_major_vertices(0, handle.get_stream()); rmm::device_uvector edgelist_minor_vertices(0, handle.get_stream()); rmm::device_uvector edgelist_weights(0, handle.get_stream()); std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(i), graph_view.indices(i), graph_view.weights(i), - adj_matrix_major_labels.begin() + - (store_transposed ? graph_view.get_local_adj_matrix_partition_col_value_start_offset(i) - : graph_view.get_local_adj_matrix_partition_row_value_start_offset(i)), - adj_matrix_minor_labels.begin(), + major_labels.data(), + adj_matrix_minor_labels.data(), store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i) : graph_view.get_local_adj_matrix_partition_row_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_col_last(i) @@ -270,86 +304,159 @@ coarsen_graph( : graph_view.get_local_adj_matrix_partition_col_first(i), store_transposed ? graph_view.get_local_adj_matrix_partition_row_last(i) : graph_view.get_local_adj_matrix_partition_col_last(i), + graph_view.is_weighted(), handle.get_stream()); - auto cur_size = coarsened_edgelist_major_vertices.size(); - // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can - // reserve address space to avoid expensive reallocation. - // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management - coarsened_edgelist_major_vertices.resize(cur_size + edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_minor_vertices.resize(coarsened_edgelist_major_vertices.size(), - handle.get_stream()); - coarsened_edgelist_weights.resize( - graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, handle.get_stream()); - - if (graph_view.is_weighted()) { - auto src_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), - edgelist_minor_vertices.begin(), - edgelist_weights.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin(), - coarsened_edgelist_weights.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); - } else { - auto src_edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); - auto dst_edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin())) + - cur_size; - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - src_edge_first, - src_edge_first + edgelist_major_vertices.size(), - dst_edge_first); + // 1-2. 
globaly shuffle + + { + rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream()); + rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream()); + rmm::device_uvector rx_edgelist_weights(0, handle.get_stream()); + if (graph_view.is_weighted()) { + auto edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), + edgelist_minor_vertices.begin(), + edgelist_weights.begin())); + std::forward_as_tuple( + std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights), + std::ignore) = + groupby_gpuid_and_shuffle_values( + handle.get_comms(), + edge_first, + edge_first + edgelist_major_vertices.size(), + [key_func = + detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + std::forward_as_tuple(std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices), + std::ignore) = + groupby_gpuid_and_shuffle_values( + handle.get_comms(), + edge_first, + edge_first + edgelist_major_vertices.size(), + [key_func = + detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + edgelist_major_vertices = std::move(rx_edgelist_major_vertices); + edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices); + edgelist_weights = std::move(rx_edgelist_weights); } - } - sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices, - coarsened_edgelist_minor_vertices, - coarsened_edgelist_weights, - handle.get_stream()); - - // 2. globally shuffle edge list and re-coarsen - - { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(), - coarsened_edgelist_minor_vertices.begin(), - coarsened_edgelist_weights.begin())); - rmm::device_uvector rx_edgelist_major_vertices(0, handle.get_stream()); - rmm::device_uvector rx_edgelist_minor_vertices(0, handle.get_stream()); - rmm::device_uvector rx_edgelist_weights(0, handle.get_stream()); - std::forward_as_tuple( - std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights), - std::ignore) = - groupby_gpuid_and_shuffle_values( - handle.get_comms(), - edge_first, - edge_first + coarsened_edgelist_major_vertices.size(), - [key_func = - detail::compute_gpu_id_from_edge_t{graph_view.is_hypergraph_partitioned(), - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { - return key_func(thrust::get<0>(val), thrust::get<1>(val)); - }, + // 1-3. append data to local adjacency matrix partitions + + // FIXME: we can skip this if groupby_gpuid_and_shuffle_values is updated to return sorted edge + // list based on the final matrix partition (maybe add + // groupby_adj_matrix_partition_and_shuffle_values). + + auto local_partition_id_op = + [comm_size, + key_func = detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + auto counts = graph_view.is_weighted() + ? 
groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()) + : groupby_and_count(pair_first, + pair_first + edgelist_major_vertices.size(), + local_partition_id_op, + graph_view.get_number_of_local_adj_matrix_partitions(), + handle.get_stream()); + + std::vector h_counts(counts.size()); + raft::update_host(h_counts.data(), counts.data(), counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_counts.size(), size_t{0}); + std::partial_sum(h_counts.begin(), h_counts.end() - 1, h_displacements.begin() + 1); + + for (int j = 0; j < col_comm_size; ++j) { + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + edgelist_major_vertices.begin() + h_displacements[j], + edgelist_minor_vertices.begin() + h_displacements[j], + graph_view.is_weighted() ? edgelist_weights.begin() + h_displacements[j] + : static_cast(nullptr), + h_counts[j], + graph_view.is_weighted(), handle.get_stream()); - sort_and_coarsen_edgelist(rx_edgelist_major_vertices, - rx_edgelist_minor_vertices, - rx_edgelist_weights, - handle.get_stream()); + auto cur_size = coarsened_edgelist_major_vertices[j].size(); + // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we + // can reserve address space to avoid expensive reallocation. + // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management + coarsened_edgelist_major_vertices[j].resize(cur_size + number_of_partition_edges, + handle.get_stream()); + coarsened_edgelist_minor_vertices[j].resize(coarsened_edgelist_major_vertices[j].size(), + handle.get_stream()); + if (graph_view.is_weighted()) { + coarsened_edgelist_weights[j].resize(coarsened_edgelist_major_vertices[j].size(), + handle.get_stream()); + + auto src_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(), + edgelist_minor_vertices.begin(), + edgelist_weights.begin())) + + h_displacements[j]; + auto dst_edge_first = + thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin(), + coarsened_edgelist_weights[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + number_of_partition_edges, + dst_edge_first); + } else { + auto src_edge_first = thrust::make_zip_iterator(thrust::make_tuple( + edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())) + + h_displacements[j]; + auto dst_edge_first = thrust::make_zip_iterator( + thrust::make_tuple(coarsened_edgelist_major_vertices[j].begin(), + coarsened_edgelist_minor_vertices[j].begin())) + + cur_size; + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + src_edge_first, + src_edge_first + edgelist_major_vertices.size(), + dst_edge_first); + } + } + } - coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices); - coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices); - coarsened_edgelist_weights = std::move(rx_edgelist_weights); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + auto number_of_partition_edges = groupby_e_and_coarsen_edgelist( + coarsened_edgelist_major_vertices[i].data(), + coarsened_edgelist_minor_vertices[i].data(), + graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() + : static_cast(nullptr), + static_cast(coarsened_edgelist_major_vertices[i].size()), + graph_view.is_weighted(), + handle.get_stream()); + coarsened_edgelist_major_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_major_vertices[i].shrink_to_fit(handle.get_stream()); + coarsened_edgelist_minor_vertices[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_minor_vertices[i].shrink_to_fit(handle.get_stream()); + if (coarsened_edgelist_weights.size() > 0) { + coarsened_edgelist_weights[i].resize(number_of_partition_edges, handle.get_stream()); + coarsened_edgelist_weights[i].shrink_to_fit(handle.get_stream()); + } } // 3. find unique labels for this GPU @@ -395,37 +502,43 @@ coarsen_graph( rmm::device_uvector renumber_map_labels(0, handle.get_stream()); partition_t partition(std::vector(comm_size + 1, 0), - graph_view.is_hypergraph_partitioned(), row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = - renumber_edgelist( - handle, - unique_labels.data(), - static_cast(unique_labels.size()), - coarsened_edgelist_major_vertices.data(), - coarsened_edgelist_minor_vertices.data(), - static_cast(coarsened_edgelist_major_vertices.size()), - graph_view.is_hypergraph_partitioned(), - do_expensive_check); + { + std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < coarsened_edgelist_major_vertices.size(); ++i) { + major_ptrs[i] = coarsened_edgelist_major_vertices[i].data(); + minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); + counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + renumber_edgelist(handle, + unique_labels.data(), + static_cast(unique_labels.size()), + major_ptrs, + minor_ptrs, + counts, + do_expensive_check); + } // 5. build a graph std::vector> edgelists{}; - if (graph_view.is_hypergraph_partitioned()) { - CUGRAPH_FAIL("unimplemented."); - } else { - edgelists.resize(1); - edgelists[0].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() - : coarsened_edgelist_major_vertices.data(); - edgelists[0].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data() - : coarsened_edgelist_minor_vertices.data(); - edgelists[0].p_edge_weights = coarsened_edgelist_weights.data(); - edgelists[0].number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + edgelists.resize(graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices[i].data() + : coarsened_edgelist_major_vertices[i].data(); + edgelists[i].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices[i].data() + : coarsened_edgelist_minor_vertices[i].data(); + edgelists[i].p_edge_weights = graph_view.is_weighted() ? 
coarsened_edgelist_weights[i].data() + : static_cast(nullptr); + edgelists[i].number_of_edges = static_cast(coarsened_edgelist_major_vertices[i].size()); } return std::make_tuple( @@ -435,7 +548,7 @@ coarsen_graph( partition, number_of_vertices, number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } @@ -466,7 +579,7 @@ coarsen_graph( std::tie(coarsened_edgelist_major_vertices, coarsened_edgelist_minor_vertices, coarsened_edgelist_weights) = - compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + compressed_sparse_to_relabeled_and_grouped_and_coarsened_edgelist( graph_view.offsets(), graph_view.indices(), graph_view.weights(), @@ -476,6 +589,7 @@ coarsen_graph( graph_view.get_number_of_vertices(), vertex_t{0}, graph_view.get_number_of_vertices(), + graph_view.is_weighted(), handle.get_stream()); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), @@ -516,7 +630,7 @@ coarsen_graph( handle, edgelist, static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, + graph_properties_t{graph_view.is_symmetric(), false, graph_view.is_weighted()}, true), std::move(renumber_map_labels)); } diff --git a/cpp/src/experimental/generate_rmat_edgelist.cu b/cpp/src/experimental/generate_rmat_edgelist.cu index 0a6d666432f..d75a4654a15 100644 --- a/cpp/src/experimental/generate_rmat_edgelist.cu +++ b/cpp/src/experimental/generate_rmat_edgelist.cu @@ -27,7 +27,9 @@ #include #include +#include #include +#include "rmm/detail/error.hpp" namespace cugraph { namespace experimental { @@ -44,13 +46,13 @@ std::tuple, rmm::device_uvector> generat bool clip_and_flip, bool scramble_vertex_ids) { - CUGRAPH_EXPECTS(size_t{1} << scale <= std::numeric_limits::max(), + CUGRAPH_EXPECTS((size_t{1} << scale) <= static_cast(std::numeric_limits::max()), "Invalid input argument: scale too large for vertex_t."); CUGRAPH_EXPECTS((a >= 0.0) && (b >= 0.0) && (c >= 0.0) && (a + b + c <= 1.0), "Invalid input argument: a, b, c should be non-negative and a + b + c should not " "be larger than 1.0."); - raft::random::Rng rng(seed + 10); + raft::random::Rng rng(seed); // to limit memory footprint (1024 is a tuning parameter) auto max_edges_to_generate_per_iteration = static_cast(handle.get_device_properties().multiProcessorCount) * 1024; @@ -121,7 +123,57 @@ std::tuple, rmm::device_uvector> generat return std::make_tuple(std::move(srcs), std::move(dsts)); } -// explicit instantiation +template +std::vector, rmm::device_uvector>> +generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + generator_distribution_t component_distribution, + generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids) +{ + CUGRAPH_EXPECTS(min_scale > 0, "minimum graph scale is 1."); + CUGRAPH_EXPECTS(size_t{1} << max_scale <= std::numeric_limits::max(), + "Invalid input argument: scale too large for vertex_t."); + + std::vector, rmm::device_uvector>> output{}; + output.reserve(n_edgelists); + std::vector scale(n_edgelists); + + std::default_random_engine eng; + eng.seed(seed); + if (component_distribution == generator_distribution_t::UNIFORM) { + std::uniform_int_distribution dist(min_scale, max_scale); + std::generate(scale.begin(), scale.end(), [&dist, &eng]() { return dist(eng); }); + } else { + // May expose this as a 
parameter in the future + std::exponential_distribution dist(4); + // The modulo is here to protect the range because exponential distribution is defined on + // [0,infinity). With exponent 4 most values are between 0 and 1 + auto range = max_scale - min_scale; + std::generate(scale.begin(), scale.end(), [&dist, &eng, &min_scale, &range]() { + return min_scale + static_cast(static_cast(range) * dist(eng)) % range; + }); + } + + // intialized to standard powerlaw values + double a = 0.57, b = 0.19, c = 0.19; + if (edge_distribution == generator_distribution_t::UNIFORM) { + a = 0.25; + b = a; + c = a; + } + + for (size_t i = 0; i < n_edgelists; i++) { + output.push_back(generate_rmat_edgelist( + handle, scale[i], scale[i] * edge_factor, a, b, c, i, clip_and_flip, scramble_vertex_ids)); + } + return output; +} template std::tuple, rmm::device_uvector> generate_rmat_edgelist(raft::handle_t const& handle, @@ -145,5 +197,29 @@ generate_rmat_edgelist(raft::handle_t const& handle, bool clip_and_flip, bool scramble_vertex_ids); +template std::vector, rmm::device_uvector>> +generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + generator_distribution_t component_distribution, + generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + +template std::vector, rmm::device_uvector>> +generate_rmat_edgelists(raft::handle_t const& handle, + size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + generator_distribution_t component_distribution, + generator_distribution_t edge_distribution, + uint64_t seed, + bool clip_and_flip, + bool scramble_vertex_ids); + } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 5abe141dafd..18db57a737f 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -67,12 +67,12 @@ std:: vertex_t major_last, vertex_t minor_first, vertex_t minor_last, + bool is_weighted, cudaStream_t stream) { rmm::device_uvector offsets((major_last - major_first) + 1, stream); rmm::device_uvector indices(edgelist.number_of_edges, stream); - rmm::device_uvector weights( - edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, stream); + rmm::device_uvector weights(is_weighted ? edgelist.number_of_edges : 0, stream); thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0}); thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0}); @@ -89,8 +89,7 @@ std:: auto p_offsets = offsets.data(); auto p_indices = indices.data(); - auto p_weights = - edgelist.p_edge_weights != nullptr ? weights.data() : static_cast(nullptr); + auto p_weights = is_weighted ? weights.data() : static_cast(nullptr); thrust::for_each(rmm::exec_policy(stream)->on(stream), store_transposed ? 
edgelist.p_dst_vertices : edgelist.p_src_vertices, @@ -103,7 +102,7 @@ std:: thrust::exclusive_scan( rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin()); - if (edgelist.p_edge_weights != nullptr) { + if (is_weighted) { auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights)); thrust::for_each(rmm::exec_policy(stream)->on(stream), @@ -191,24 +190,22 @@ graph_t 0, "Invalid input argument: edgelists.size() should be non-zero."); - bool is_weighted = edgelists[0].p_edge_weights != nullptr; - CUGRAPH_EXPECTS( std::any_of(edgelists.begin() + 1, edgelists.end(), - [is_weighted](auto edgelist) { - return (edgelist.p_src_vertices == nullptr) || - (edgelist.p_dst_vertices == nullptr) || - (is_weighted && (edgelist.p_edge_weights == nullptr)) || + [is_weighted = properties.is_weighted](auto edgelist) { + return ((edgelist.number_of_edges > 0) && (edgelist.p_src_vertices == nullptr)) || + ((edgelist.number_of_edges > 0) && (edgelist.p_dst_vertices == nullptr)) || + (is_weighted && (edgelist.number_of_edges > 0) && + (edgelist.p_edge_weights == nullptr)) || (!is_weighted && (edgelist.p_edge_weights != nullptr)); }) == false, "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not " - "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights " - "is nullptr) or should not be nullptr (otherwise)."); + "be nullptr if edgelists[].number_of_edges > 0 and edgelists[].p_edge_weights should be " + "nullptr if unweighted or should not be nullptr if weighted and edgelists[].number_of_edges > " + "0."); - CUGRAPH_EXPECTS((partition.is_hypergraph_partitioned() && - (edgelists.size() == static_cast(col_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)), + CUGRAPH_EXPECTS(edgelists.size() == static_cast(col_comm_size), "Invalid input argument: errneous edgelists.size()."); // optional expensive checks (part 1/3) @@ -251,7 +248,7 @@ graph_tget_handle_ptr()->get_stream()); adj_matrix_partition_offsets_.push_back(std::move(offsets)); adj_matrix_partition_indices_.push_back(std::move(indices)); - if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } + if (properties.is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); } } // update degree-based segment offsets (to be used for graph analytics kernel optimization) @@ -297,8 +295,8 @@ graph_t::max())); rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, default_stream); - std::vector h_thresholds = {static_cast(detail::low_degree_threshold), - static_cast(detail::mid_degree_threshold)}; + std::vector h_thresholds = {static_cast(detail::mid_degree_threshold), + static_cast(detail::low_degree_threshold)}; raft::update_device( d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream); @@ -319,43 +317,26 @@ graph_t aggregate_segment_offsets(0, default_stream); - if (partition.is_hypergraph_partitioned()) { - rmm::device_uvector aggregate_segment_offsets( - col_comm_size * segment_offsets.size(), default_stream); - col_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); - } else { - rmm::device_uvector aggregate_segment_offsets( - row_comm_size * segment_offsets.size(), default_stream); - row_comm.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - 
segment_offsets.size(), - default_stream); - } - - vertex_partition_segment_offsets_.resize(aggregate_segment_offsets.size()); - raft::update_host(vertex_partition_segment_offsets_.data(), + segment_offsets.begin() + 1, + thrust::greater{}); + + rmm::device_uvector aggregate_segment_offsets(col_comm_size * segment_offsets.size(), + default_stream); + col_comm.allgather(segment_offsets.data(), + aggregate_segment_offsets.data(), + segment_offsets.size(), + default_stream); + + adj_matrix_partition_segment_offsets_.resize(aggregate_segment_offsets.size()); + raft::update_host(adj_matrix_partition_segment_offsets_.data(), aggregate_segment_offsets.data(), aggregate_segment_offsets.size(), default_stream); - raft::comms::status_t status{}; - if (partition.is_hypergraph_partitioned()) { - status = col_comm.sync_stream( - default_stream); // this is necessary as degrees, d_thresholds, and segment_offsets will - // become out-of-scope once control flow exits this block and - // vertex_partition_segment_offsets_ can be used right after return. - } else { - status = row_comm.sync_stream( - default_stream); // this is necessary as degrees, d_thresholds, and segment_offsets will - // become out-of-scope once control flow exits this block and - // vertex_partition_segment_offsets_ can be used right after return. - } + auto status = col_comm.sync_stream( + default_stream); // this is necessary as degrees, d_thresholds, and segment_offsets will + // become out-of-scope once control flow exits this block and + // adj_matrix_partition_segment_offsets_ can be used right after return. CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); } @@ -393,9 +374,14 @@ graph_tget_handle_ptr()->get_stream(); CUGRAPH_EXPECTS( - (edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr), + ((edgelist.number_of_edges == 0) || (edgelist.p_src_vertices != nullptr)) && + ((edgelist.number_of_edges == 0) || (edgelist.p_dst_vertices != nullptr)) && + ((properties.is_weighted && + ((edgelist.number_of_edges == 0) || (edgelist.p_edge_weights != nullptr))) || + (!properties.is_weighted && (edgelist.p_edge_weights == nullptr))), "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should " - "not be nullptr."); + "not be nullptr if edgelist.number_of_edges > 0 and edgelist.p_edge_weights should be nullptr " + "if unweighted or should not be nullptr if weighted and edgelist.number_of_edges > 0."); // optional expensive checks (part 1/2) @@ -427,6 +413,7 @@ graph_tget_number_of_vertices(), vertex_t{0}, this->get_number_of_vertices(), + properties.is_weighted, this->get_handle_ptr()->get_stream()); // update degree-based segment offsets (to be used for graph analytics kernel optimization) @@ -453,8 +440,8 @@ graph_t::max())); rmm::device_uvector d_thresholds(detail::num_segments_per_vertex_partition - 1, default_stream); - std::vector h_thresholds = {static_cast(detail::low_degree_threshold), - static_cast(detail::mid_degree_threshold)}; + std::vector h_thresholds = {static_cast(detail::mid_degree_threshold), + static_cast(detail::low_degree_threshold)}; raft::update_device( d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), default_stream); @@ -476,7 +463,8 @@ graph_tget_number_of_vertices(), d_thresholds.begin(), d_thresholds.end(), - segment_offsets.begin() + 1); + segment_offsets.begin() + 1, + thrust::greater{}); segment_offsets_.resize(segment_offsets.size()); raft::update_host( diff --git a/cpp/src/experimental/graph_view.cu 
b/cpp/src/experimental/graph_view.cu index f443608e424..67603ae260b 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -160,7 +161,7 @@ graph_view_t const& adj_matrix_partition_offsets, std::vector const& adj_matrix_partition_indices, std::vector const& adj_matrix_partition_weights, - std::vector const& vertex_partition_segment_offsets, + std::vector const& adj_matrix_partition_segment_offsets, partition_t const& partition, vertex_t number_of_vertices, edge_t number_of_edges, @@ -175,7 +176,7 @@ graph_view_t(row_comm_size))) || - (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), - "Internal Error: erroneous adj_matrix_partition_offsets.size()."); + CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == static_cast(col_comm_size), + "Internal Error: erroneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && - (vertex_partition_segment_offsets.size() == - (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size) * - (detail::num_segments_per_vertex_partition + 1))) || + (adj_matrix_partition_segment_offsets.size() == + col_comm_size * (detail::num_segments_per_vertex_partition + 1))) || (!sorted_by_global_degree_within_vertex_partition && - (vertex_partition_segment_offsets.size() == 0)), - "Internal Error: vertex_partition_segment_offsets.size() does not match " + (adj_matrix_partition_segment_offsets.size() == 0)), + "Internal Error: adj_matrix_partition_segment_offsets.size() does not match " "with sorted_by_global_degree_within_vertex_partition."); // optional expensive checks @@ -267,25 +264,22 @@ graph_view_t graph_view_t< } } +template +edge_t +graph_view_t>:: + compute_max_in_degree(raft::handle_t const& handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_in_degree(raft::handle_t const& + handle) const +{ + auto in_degrees = compute_in_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_degrees.begin(), + in_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +edge_t +graph_view_t>:: + compute_max_out_degree(raft::handle_t const& handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +edge_t graph_view_t>::compute_max_out_degree(raft::handle_t const& + handle) const +{ + auto out_degrees = compute_out_degrees(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_degrees.begin(), + out_degrees.end()); + edge_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + 
handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_in_weight_sum(raft::handle_t const& handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t>::compute_max_in_weight_sum(raft::handle_t const& + handle) const +{ + auto in_weight_sums = compute_in_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + in_weight_sums.begin(), + in_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + +template +weight_t +graph_view_t>:: + compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + rmm::device_scalar ret(handle.get_stream()); + device_allreduce( + handle.get_comms(), it, ret.data(), 1, raft::comms::op_t::MAX, handle.get_stream()); + return ret.value(handle.get_stream()); +} + +template +weight_t graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::compute_max_out_weight_sum(raft::handle_t const& handle) const +{ + auto out_weight_sums = compute_out_weight_sums(handle); + auto it = thrust::max_element(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + out_weight_sums.begin(), + out_weight_sums.end()); + weight_t ret{}; + raft::update_host(&ret, it, 1, handle.get_stream()); + handle.get_stream_view().synchronize(); + return ret; +} + // explicit instantiation template class graph_view_t; diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index fe8310a62ca..24914fb028b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -15,28 +15,21 @@ */ #pragma once -#include +#include #include - -#include -#include -#include -#include - -#include - #include + #include #include +#include +#include #include #include +#include -#include - -#include - -#include +#include +#include //#define TIMING @@ -47,343 +40,6 @@ namespace cugraph { namespace experimental { -namespace detail { - -#ifdef CUCO_STATIC_MAP_DEFINED -template -struct create_cuco_pair_t { - cuco::pair_type __device__ operator()(data_t data) - { - cuco::pair_type tmp; - tmp.first = data; - tmp.second = data_t{0}; - return tmp; - } -}; -#endif - -// -// These classes should allow cuco::static_map to generate hash tables of -// different configurations. 
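These (removed) hasher/comparator pairs existed so that a single cuco::static_map could key edges either by (src, dst) or by (src, cluster[dst]). The one non-obvious ingredient is the Boost-style hash_combine step both hashers use to fold two hashes into one value; as a standalone sketch (plain C++, illustrative only):

#include <cstdint>

// Fold h2 into h1: the same mixing step used by src_dst_hasher_t and
// src_cluster_hasher_t below (constants from Boost's hash_combine).
inline uint32_t hash_combine(uint32_t h1, uint32_t h2)
{
  h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2);
  return h1;
}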
-// - -// -// Compare edges based on src[e] and dst[e] matching -// -template -class src_dst_equality_comparator_t { - public: - src_dst_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, d_dst_{dst.data().get()}, sentinel_value_(sentinel_value) - { - } - - src_dst_equality_comparator_t(data_t const *d_src, data_t const *d_dst, sentinel_t sentinel_value) - : d_src_{d_src}, d_dst_{d_dst}, sentinel_value_(sentinel_value) - { - } - - template - __device__ bool operator()(idx_type lhs_index, idx_type rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && (d_dst_[lhs_index] == d_dst_[rhs_index]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and dst[e] -// -template -class src_dst_hasher_t { - public: - src_dst_hasher_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - src_dst_hasher_t(data_t const *d_src, data_t const *d_dst) : d_src_{d_src}, d_dst_{d_dst} {} - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_dst = hasher(d_dst_[index]); - - /* - * Combine the source hash and the dest hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_dst + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -// -// Compare edges based on src[e] and cluster[dst[e]] matching -// -template -class src_cluster_equality_comparator_t { - public: - src_cluster_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - src_cluster_equality_comparator_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{d_src}, - d_dst_{d_dst}, - d_dst_cluster_{d_dst_cluster_cache}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - __device__ bool operator()(sentinel_t lhs_index, sentinel_t rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && - (d_dst_cluster_[d_dst_[lhs_index] - base_dst_id_] == - d_dst_cluster_[d_dst_[rhs_index] - base_dst_id_]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and cluster[dst[e]] -// -template -class src_cluster_hasher_t { - public: - src_cluster_hasher_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id) - { - } - - src_cluster_hasher_t(data_t const *d_src, - data_t const 
*d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id) - : d_src_{d_src}, d_dst_{d_dst}, d_dst_cluster_{d_dst_cluster_cache}, base_dst_id_(base_dst_id) - { - } - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_cluster = hasher(d_dst_cluster_[d_dst_[index] - base_dst_id_]); - - /* - * Combine the source hash and the cluster hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_cluster + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; -}; - -// -// Skip edges where src[e] == dst[e] -// -template -class skip_edge_t { - public: - skip_edge_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - skip_edge_t(data_t const *src, data_t const *dst) : d_src_{src}, d_dst_{dst} {} - - template - __device__ auto operator()(idx_type index) const - { - return d_src_[index] == d_dst_[index]; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -template -struct lookup_by_vertex_id { - public: - lookup_by_vertex_id(data_t const *d_array, vertex_t const *d_vertices, vertex_t base_vertex_id) - : d_array_(d_array), d_vertices_(d_vertices), base_vertex_id_(base_vertex_id) - { - } - - template - data_t operator() __device__(edge_t edge_id) const - { - return d_array_[d_vertices_[edge_id] - base_vertex_id_]; - } - - private: - data_t const *d_array_; - vertex_t const *d_vertices_; - vertex_t base_vertex_id_; -}; - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - iterator_t iterator_begin, - iterator_t iterator_end, - function_t function, - cudaStream_t stream) -{ - vector_t temp_v(input_v.size()); - - auto last = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), iterator_begin, iterator_end, temp_v.begin(), function); - - temp_v.resize(thrust::distance(temp_v.begin(), last)); - - return temp_v; -} - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - function_t function, - cudaStream_t stream) -{ - return remove_elements_from_vector(input_v, input_v.begin(), input_v.end(), function, stream); -} - -// FIXME: This should be a generic utility. 
The one in cython.cu -// is very close to this -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - std::vector> edgelist( - {{src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}}); - - return std::make_unique>( - handle, - edgelist, - view.get_partition(), - num_local_verts, - src_v.size(), - graph_props, - false, - false); -} - -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - experimental::edgelist_t edgelist{ - src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}; - - return std::make_unique>( - handle, edgelist, num_local_verts, graph_props, false, false); -} - -} // namespace detail - -// -// FIXME: Ultimately, this would be cleaner and more efficient if we did the following: -// -// 1) Create an object that does a single level Louvain computation on an input graph -// (no graph contraction) -// 2) Create an object that does graph contraction -// 3) Create Louvain to use these objects in sequence to compute the aggregate result. -// -// In MNMG-world, the graph contraction step is going to create another graph that likely -// fits efficiently in a smaller number of GPUs (eventually one). Decomposing the algorithm -// as above would allow us to eventually run the single GPU version of single level Louvain -// on the contracted graphs - which should be more efficient. -// -// FIXME: We should return the dendrogram and let the python layer clean it up (or have a -// separate C++ function to flatten the dendrogram). 
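(For context: the dendrogram in question is just a stack of per-level cluster assignments; level 0 assigns every input vertex to a cluster, each later level re-assigns the previous level's clusters, and "flattening" composes those maps back down to the original vertices.)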
There are customers that might -// like the dendrogram and the implementation would be a bit cleaner if we did the -// collapsing as a separate step -// template class Louvain { public: @@ -405,67 +61,31 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - compute_partition_(handle, graph_view), - local_num_vertices_(graph_view.get_number_of_local_vertices()), - local_num_rows_(graph_view.get_number_of_local_adj_matrix_partition_rows()), - local_num_cols_(graph_view.get_number_of_local_adj_matrix_partition_cols()), - local_num_edges_(graph_view.get_number_of_edges()), - vertex_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_weights_v_(graph_view.get_number_of_local_vertices()), - number_of_vertices_(graph_view.get_number_of_local_vertices()), - stream_(handle.get_stream()) + cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + src_vertex_weights_cache_v_(0, handle.get_stream()), + src_cluster_cache_v_(0, handle.get_stream()), + dst_cluster_cache_v_(0, handle.get_stream()) { - if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); - base_src_vertex_id_ = graph_view.get_local_adj_matrix_partition_row_first(0); - base_dst_vertex_id_ = graph_view.get_local_adj_matrix_partition_col_first(0); - - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - graph_view.get_number_of_local_adj_matrix_partitions()), - [&graph_view](auto indx) { - return graph_view.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - } - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - - if (base_src_vertex_id_ > 0) { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - src_indices_v_.begin(), - src_indices_v_.end(), - thrust::make_constant_iterator(base_src_vertex_id_), - src_indices_v_.begin(), - thrust::plus()); - } } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } + + Dendrogram &get_dendrogram() { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t max_level, weight_t resolution) { weight_t best_modularity = weight_t{-1}; -#ifdef CUCO_STATIC_MAP_DEFINED - weight_t total_edge_weight; - total_edge_weight = experimental::transform_reduce_e( + weight_t total_edge_weight = experimental::transform_reduce_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), - [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, weight_t{0}); while (dendrogram_->num_levels() < max_level) { @@ -486,7 +106,6 @@ class Louvain { } timer_display(std::cout); -#endif return best_modularity; } @@ -495,14 +114,23 @@ class Louvain { void timer_start(std::string const ®ion) { #ifdef TIMING - if (rank_ == 0) hr_timer_.start(region); + if (graph_view_t::is_multi_gpu) { + if 
(handle_.get_comms().get_rank() == 0) hr_timer_.start(region); + } else { + hr_timer_.start(region); + } #endif } void timer_stop(cudaStream_t stream) { #ifdef TIMING - if (rank_ == 0) { + if (graph_view_t::is_multi_gpu) { + if (handle_.get_comms().get_rank() == 0) { + CUDA_TRY(cudaStreamSynchronize(stream)); + hr_timer_.stop(); + } + } else { CUDA_TRY(cudaStreamSynchronize(stream)); hr_timer_.stop(); } @@ -512,36 +140,47 @@ void timer_display(std::ostream &os) { #ifdef TIMING - if (rank_ == 0) hr_timer_.display(os); + if (graph_view_t::is_multi_gpu) { + if (handle_.get_comms().get_rank() == 0) hr_timer_.display(os); + } else { + hr_timer_.display(os); + } #endif } protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices); + dendrogram_->add_level( + current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream()); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); } public: weight_t modularity(weight_t total_edge_weight, weight_t resolution) { - weight_t sum_degree_squared = experimental::transform_reduce_v( - handle_, - current_graph_view_, + weight_t sum_degree_squared = thrust::transform_reduce( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_weights_v_.begin(), + cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, - weight_t{0}); + weight_t{0}, + thrust::plus<weight_t>()); + + if (graph_t::is_multi_gpu) { + sum_degree_squared = + host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); + } weight_t sum_internal = experimental::transform_reduce_e( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; @@ -561,58 +200,86 @@ class Louvain { { timer_start("compute_vertex_and_cluster_weights"); - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, - weight_t{0}, - vertex_weights_v_.begin()); + vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); + + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + current_graph_view_.get_local_vertex_first()); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - vertex_weights_v_.begin(), - vertex_weights_v_.end(), - cluster_weights_v_.begin()); + raft::copy(cluster_weights_v_.begin(), + vertex_weights_v_.begin(), + vertex_weights_v_.size(), + handle_.get_stream()); - cache_vertex_properties( - vertex_weights_v_.begin(), src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); + d_src_vertex_weights_cache_ = + cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + if (graph_view_t::is_multi_gpu) { + auto const comm_size = handle_.get_comms().get_size(); + rmm::device_uvector<vertex_t> rx_keys_v(0, handle_.get_stream()); + rmm::device_uvector<weight_t> rx_weights_v(0, 
handle_.get_stream()); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); + + std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = + groupby_gpuid_and_shuffle_values( + handle_.get_comms(), + pair_first, + pair_first + current_graph_view_.get_number_of_local_vertices(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); }, + handle_.get_stream()); + + cluster_keys_v_ = std::move(rx_keys_v); + cluster_weights_v_ = std::move(rx_weights_v); + } - timer_stop(stream_); + timer_stop(handle_.get_stream()); } - template - void cache_vertex_properties(iterator_t const &local_input_iterator, - rmm::device_vector &src_cache_v, - rmm::device_vector &dst_cache_v, - bool src = true, - bool dst = true) + template + T *cache_src_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &src_cache_v) { - if (src) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, local_input_iterator, src_cache_v.begin()); + if (graph_view_t::is_multi_gpu) { + src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), + handle_.get_stream()); + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); + return src_cache_v.begin(); + } else { + return input.begin(); } + } - if (dst) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols()); - copy_to_adj_matrix_col( - handle_, current_graph_view_, local_input_iterator, dst_cache_v.begin()); + template + T *cache_dst_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &dst_cache_v) + { + if (graph_view_t::is_multi_gpu) { + dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), + handle_.get_stream()); + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); + return dst_cache_v.begin(); + } else { + return input.begin(); } } -#ifdef CUCO_STATIC_MAP_DEFINED virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_vector next_cluster_v(dendrogram_->current_level_begin(), - dendrogram_->current_level_end()); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + handle_.get_stream()); + + raft::copy(next_cluster_v.begin(), + dendrogram_->current_level_begin(), + dendrogram_->current_level_size(), + handle_.get_stream()); - cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -629,691 +296,268 @@ class Louvain { up_down = !up_down; - cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - next_cluster_v.begin(), - next_cluster_v.end(), - dendrogram_->current_level_begin()); + raft::copy(dendrogram_->current_level_begin(), + next_cluster_v.begin(), + next_cluster_v.size(), + handle_.get_stream()); } } - // cache the final clustering locally 
on each cpu - cache_vertex_properties( - dendrogram_->current_level_begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - - timer_stop(stream_); + timer_stop(handle_.get_stream()); return cur_Q; } - void update_by_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - rmm::device_vector &next_cluster_v, - bool up_down) + void compute_cluster_sum_and_subtract(rmm::device_uvector &old_cluster_sum_v, + rmm::device_uvector &cluster_subtract_v) { - rmm::device_vector old_cluster_sum_v(local_num_vertices_); - rmm::device_vector src_old_cluster_sum_cache_v; + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - if ((src != dst) && (src_cluster == nbr_cluster)) { - return wt; - } else - return weight_t{0}; - }, - weight_t{0}, - old_cluster_sum_v.begin()); - - cache_vertex_properties( - old_cluster_sum_v.begin(), src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); - - detail::src_cluster_equality_comparator_t compare( - src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_, - std::numeric_limits::max()); - detail::src_cluster_hasher_t hasher(src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_); - detail::skip_edge_t skip_edge(src_indices_v_.data().get(), - current_graph_view_.indices()); - - // - // Group edges that lead from same source to same neighboring cluster together - // local_cluster_edge_ids_v will contain edge ids of unique pairs of (src,nbr_cluster). - // If multiple edges exist, one edge id will be chosen (by a parallel race). - // nbr_weights_v will contain the combined weight of all of the edges that connect - // that pair. - // - rmm::device_vector local_cluster_edge_ids_v; - rmm::device_vector nbr_weights_v; - - // - // Perform this combining on the local edges - // - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher, compare, skip_edge, current_graph_view_.weights(), local_num_edges_); - - // - // In order to compute delta_Q for a given src/nbr_cluster pair, I need the following - // information: - // src - // old_cluster - the cluster that src is currently assigned to - // nbr_cluster - // sum of edges going to new cluster - // vertex weight of the src vertex - // sum of edges going to old cluster - // cluster_weights of old cluster - // cluster_weights of nbr_cluster - // - // Each GPU has locally cached: - // The sum of edges going to the old cluster (computed from - // experimental::copy_v_transform_reduce_out_nbr call above. - // old_cluster - // nbr_cluster - // vertex weight of src vertex - // partial sum of edges going to the new cluster (in nbr_weights) - // - // So the plan is to take the tuple: - // (src, old_cluster, src_vertex_weight, old_cluster_sum, nbr_cluster, nbr_weights) - // and shuffle it around the cluster so that they arrive at the GPU where the pair - // (old_cluster, new_cluster) would be assigned. Then we can aggregate this information - // and compute the delta_Q values. 
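Written out, the per-candidate gain this plumbing feeds is the standard Louvain delta-modularity; a standalone rendering of exactly the arithmetic used in the transforms below (plain float in place of the templated weight_t):

// delta_Q for moving a vertex of weight k_k out of its old cluster into a
// neighboring one; m is total_edge_weight, a_old/a_new are the old/new
// cluster weight sums, *_cluster_sum the vertex's edge weight into each.
inline float delta_q(float m, float resolution, float new_cluster_sum,
                     float old_cluster_sum, float k_k, float a_old, float a_new)
{
  return 2 * ((new_cluster_sum - old_cluster_sum) / m -
              resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / (m * m));
}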
- // - - // - // Define the communication pattern, we're going to send detail - // for edge i to the GPU that is responsible for the vertex - // pair (cluster[src[i]], cluster[dst[i]]) - // - auto communication_schedule = thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - [d_edge_device_view = compute_partition_.edge_device_view(), - d_src_indices = src_indices_v_.data().get(), - d_src_cluster = src_cluster_cache_v_.data().get(), - d_dst_indices = current_graph_view_.indices(), - d_dst_cluster = dst_cluster_cache_v_.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(edge_t edge_id) { - return d_edge_device_view(d_src_cluster[d_src_indices[edge_id] - base_src_vertex_id], - d_dst_cluster[d_dst_indices[edge_id] - base_dst_vertex_id]); - }); + weight_t subtract{0}; + weight_t sum{0}; - // FIXME: This should really be a variable_shuffle of a tuple, for time - // reasons I'm just doing 6 independent shuffles. - // - rmm::device_vector ocs_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_old_cluster_sum_cache_v.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - src_cluster_cache_v_.data().get(), src_indices_v_.data().get(), base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_vertex_weight_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_vertex_weights_cache_v_.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(src_indices_v_.begin(), local_cluster_edge_ids_v.begin()), - communication_schedule); + if (src == dst) + subtract = wt; + else if (src_cluster == nbr_cluster) + sum = wt; - rmm::device_vector nbr_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - dst_cluster_cache_v_.data().get(), current_graph_view_.indices(), base_dst_vertex_id_)), - communication_schedule); - - nbr_weights_v = variable_shuffle( - handle_, nbr_weights_v.size(), nbr_weights_v.begin(), communication_schedule); - - // - // At this point, src_v, nbr_cluster_v and nbr_weights_v have been - // shuffled to the correct GPU. We can now compute the final - // value of delta_Q for each neigboring cluster - // - // Again, we'll combine edges that connect the same source to the same - // neighboring cluster and sum their weights. 
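The combining described above is morally a reduce_by_key over (src, neighbor_cluster) pairs; the cuco::static_map machinery exists to do it without a global sort. For intuition only, a sort-based single-GPU equivalent (illustrative helper, not the code path used here):

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>

// Sum the weights of duplicate (src, cluster) pairs: sort by pair, then
// reduce_by_key leaves one combined weight per unique pair. The *_out
// vectors must be pre-sized to src.size(); returns the unique-pair count.
template <typename vertex_t, typename weight_t>
size_t combine_src_cluster_weights(thrust::device_vector<vertex_t>& src,
                                   thrust::device_vector<vertex_t>& cluster,
                                   thrust::device_vector<weight_t>& wt,
                                   thrust::device_vector<vertex_t>& src_out,
                                   thrust::device_vector<vertex_t>& cluster_out,
                                   thrust::device_vector<weight_t>& wt_out)
{
  auto keys = thrust::make_zip_iterator(thrust::make_tuple(src.begin(), cluster.begin()));
  thrust::sort_by_key(keys, keys + src.size(), wt.begin());

  auto keys_out = thrust::make_zip_iterator(thrust::make_tuple(src_out.begin(), cluster_out.begin()));
  auto ends = thrust::reduce_by_key(keys, keys + src.size(), wt.begin(), keys_out, wt_out.begin());
  return static_cast<size_t>(ends.second - wt_out.begin());
}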
- // - detail::src_dst_equality_comparator_t compare2( - src_v, nbr_cluster_v, std::numeric_limits::max()); - detail::src_dst_hasher_t hasher2(src_v, nbr_cluster_v); - - auto skip_edge2 = [] __device__(auto) { return false; }; - - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher2, compare2, skip_edge2, nbr_weights_v.data().get(), src_v.size()); - - // - // Now local_cluster_edge_ids_v contains the edge ids of the src id/dest - // cluster id pairs, and nbr_weights_v contains the weight of edges - // going to that cluster id - // - // Now we can compute (locally) each delta_Q value - // - auto iter = thrust::make_zip_iterator( - thrust::make_tuple(local_cluster_edge_ids_v.begin(), nbr_weights_v.begin())); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - iter, - iter + local_cluster_edge_ids_v.size(), - nbr_weights_v.begin(), - [total_edge_weight, - resolution, - d_src = src_v.data().get(), - d_src_cluster = src_cluster_v.data().get(), - d_nbr_cluster = nbr_cluster_v.data().get(), - d_src_vertex_weights = src_vertex_weight_v.data().get(), - d_src_cluster_weights = src_cluster_weights_cache_v_.data().get(), - d_dst_cluster_weights = dst_cluster_weights_cache_v_.data().get(), - d_ocs = ocs_v.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(auto tuple) { - edge_t edge_id = thrust::get<0>(tuple); - vertex_t nbr_cluster = d_nbr_cluster[edge_id]; - weight_t new_cluster_sum = thrust::get<1>(tuple); - vertex_t old_cluster = d_src_cluster[edge_id]; - weight_t k_k = d_src_vertex_weights[edge_id]; - weight_t old_cluster_sum = d_ocs[edge_id]; - - weight_t a_old = d_src_cluster_weights[old_cluster - base_src_vertex_id]; - weight_t a_new = d_dst_cluster_weights[nbr_cluster - base_dst_vertex_id]; - - return 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - }); - - // - // Pick the largest delta_Q value for each vertex on this gpu. 
- // Then we will shuffle back to the gpu by vertex id - // - rmm::device_vector final_src_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_cluster_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_weights_v(local_cluster_edge_ids_v.size()); - - auto final_input_iter = thrust::make_zip_iterator(thrust::make_tuple( - thrust::make_permutation_iterator(src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_permutation_iterator(nbr_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - nbr_weights_v.begin())); - - auto final_output_iter = thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_cluster_v.begin(), final_nbr_weights_v.begin())); - - auto final_output_pos = - thrust::copy_if(rmm::exec_policy(stream_)->on(stream_), - final_input_iter, - final_input_iter + local_cluster_edge_ids_v.size(), - final_output_iter, - [] __device__(auto p) { return (thrust::get<2>(p) > weight_t{0}); }); - - final_src_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_cluster_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_weights_v.resize(thrust::distance(final_output_iter, final_output_pos)); - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted the first entry for each src value is the largest. 
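Since the sort above places the highest delta_Q first within each src run, "keep the first entry per src" is exactly what thrust::unique_by_key computes; a minimal sketch of that selection (the surrounding removed code builds the same result with an explicit run-boundary scan over edge ids):

#include <thrust/device_vector.h>
#include <thrust/unique.h>

// Keep only the first (i.e., best) candidate per source vertex; assumes
// entries are sorted by src with the largest delta_Q first in each run.
template <typename vertex_t, typename value_t>
void keep_best_per_src(thrust::device_vector<vertex_t>& src,
                       thrust::device_vector<value_t>& best)
{
  auto ends = thrust::unique_by_key(src.begin(), src.end(), best.begin());
  src.resize(ends.first - src.begin());
  best.resize(ends.second - best.begin());
}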
- // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + return thrust::make_tuple(subtract, sum); }, - stream_); + thrust::make_tuple(weight_t{0}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + old_cluster_sum_v.begin(), + [] __device__(auto p) { return thrust::get<1>(p); }); + + thrust::transform( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + cluster_subtract_v.begin(), + [] __device__(auto p) { return thrust::get<0>(p); }); + } - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_weights_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_weights_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_src_v = variable_shuffle( + void update_by_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + rmm::device_uvector &next_cluster_v, + bool up_down) + { +#ifdef CUCO_STATIC_MAP_DEFINED + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); + + compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); + + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + + vertex_t *map_key_first; + vertex_t *map_key_last; + weight_t *map_value_first; + + if (graph_t::is_multi_gpu) { + cugraph::experimental::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ + handle_.get_comms().get_size()}; + + 
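+ // Multi-GPU path: fetch, for every locally cached source-cluster id, that
+ // cluster's current weight from whichever rank owns the key (ownership is
+ // decided by hashing the key through vertex_to_gpu_id_op).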
src_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_src_cluster_cache_, + d_src_cluster_cache_ + src_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + handle_.get_stream()); + + map_key_first = cluster_keys_v_.begin(); + map_key_last = cluster_keys_v_.end(); + map_value_first = cluster_weights_v_.begin(); + } else { + thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin()); + + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + next_cluster_v.begin(), + next_cluster_v.end(), + src_cluster_weights_v.begin(), + [d_cluster_weights = cluster_weights_v_.data(), + d_cluster_keys = cluster_keys_v_.data(), + num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { + auto pos = thrust::lower_bound( + thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); + return d_cluster_weights[pos - d_cluster_keys]; + }); + + map_key_first = d_src_cluster_cache_; + map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); + map_value_first = src_cluster_weights_v.begin(); + } + + rmm::device_uvector src_old_cluster_sum_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + rmm::device_uvector src_cluster_subtract_v( + current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); + copy_to_adj_matrix_row( + handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + + copy_v_transform_reduce_key_aggregated_out_nbr( handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - // - // At this point... - // final_src_v contains the source indices - // final_nbr_cluster_v contains the neighboring clusters - // final_nbr_weights_v contains delta_Q for moving src to the neighboring - // - // They have been shuffled to the gpus responsible for their source vertex - // - // FIXME: Think about how this should work. - // I think Leiden is broken. I don't think that the code we have - // actually does anything. For now I'm going to ignore Leiden in - // MNMG, we can reconsider this later. - // - // If we ignore Leiden, I'd like to think about whether the reduction - // should occur now... 
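The single-GPU branch above resolves the same lookup with a device-side binary search over the sorted (cluster key, weight) arrays. That pattern in isolation, with raw pointers captured into a __device__ lambda as the code does (illustrative helper name):

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

// For each query key, return the value found by lower_bound on sorted keys.
template <typename vertex_t, typename weight_t>
void lookup_values(thrust::device_vector<vertex_t> const& sorted_keys,
                   thrust::device_vector<weight_t> const& values,
                   thrust::device_vector<vertex_t> const& queries,
                   thrust::device_vector<weight_t>& out)
{
  thrust::transform(queries.begin(), queries.end(), out.begin(),
                    [keys = thrust::raw_pointer_cast(sorted_keys.data()),
                     vals = thrust::raw_pointer_cast(values.data()),
                     n    = sorted_keys.size()] __device__(vertex_t q) {
                      auto pos = thrust::lower_bound(thrust::seq, keys, keys + n, q);
                      return vals[pos - keys];
                    });
}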
- // - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted (ascending), the last entry for each src value is the largest. - // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); - }, - stream_); - - rmm::device_vector cluster_increase_v(final_src_v.size()); - rmm::device_vector cluster_decrease_v(final_src_v.size()); - rmm::device_vector old_cluster_v(final_src_v.size()); - - // - // Then we can, on each gpu, do a local assignment for all of the - // vertices assigned to that gpu using the up_down logic - // - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - local_cluster_edge_ids_v.begin(), - local_cluster_edge_ids_v.end(), - [d_final_src = final_src_v.data().get(), - d_final_nbr_cluster = final_nbr_cluster_v.data().get(), - d_final_nbr_weights = final_nbr_weights_v.data().get(), - d_cluster_increase = cluster_increase_v.data().get(), - d_cluster_decrease = cluster_decrease_v.data().get(), - d_vertex_weights = src_vertex_weights_cache_v_.data().get(), - d_next_cluster = next_cluster_v.data().get(), - d_old_cluster = old_cluster_v.data().get(), - base_vertex_id = base_vertex_id_, - base_src_vertex_id = base_src_vertex_id_, - up_down] __device__(edge_t idx) { - vertex_t src = d_final_src[idx]; - vertex_t new_cluster = d_final_nbr_cluster[idx]; - vertex_t old_cluster = d_next_cluster[src - base_vertex_id]; - weight_t src_weight = d_vertex_weights[src - base_src_vertex_id]; - - if (d_final_nbr_weights[idx] <= weight_t{0}) return false; - if (new_cluster == old_cluster) return false; - if ((new_cluster > old_cluster) != up_down) return false; - - d_next_cluster[src - base_vertex_id] = new_cluster; - d_cluster_increase[idx] = src_weight; - d_cluster_decrease[idx] = src_weight; - d_old_cluster[idx] = old_cluster; - return true; + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), + d_src_vertex_weights_cache_, + src_cluster_subtract_v.begin(), + d_src_cluster_cache_, + src_cluster_weights_v.begin())), + + d_dst_cluster_cache_, + map_key_first, + map_key_last, + map_value_first, + [total_edge_weight, resolution] __device__( + auto src, auto neighbor_cluster, auto new_cluster_sum, 
auto src_info, auto a_new) { + auto old_cluster_sum = thrust::get<0>(src_info); + auto k_k = thrust::get<1>(src_info); + auto cluster_subtract = thrust::get<2>(src_info); + auto src_cluster = thrust::get<3>(src_info); + auto a_old = thrust::get<4>(src_info); + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, - stream_); + [] __device__(auto p1, auto p2) { + auto id1 = thrust::get<0>(p1); + auto id2 = thrust::get<0>(p2); + auto wt1 = thrust::get<1>(p1); + auto wt2 = thrust::get<1>(p2); - cluster_increase_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_increase_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - cluster_decrease_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_decrease_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - old_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.begin(), cluster_increase_v.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.end(), cluster_increase_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, weight); - }); - - thrust::for_each( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_v.begin(), cluster_decrease_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_v.end(), cluster_decrease_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = 
thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, -weight); + return (wt1 < wt2) ? p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); + }, + thrust::make_tuple(vertex_t{-1}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + next_cluster_v.begin(), + next_cluster_v.end(), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + next_cluster_v.begin(), + [up_down] __device__(vertex_t old_cluster, auto p) { + vertex_t new_cluster = thrust::get<0>(p); + weight_t delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) + : old_cluster; }); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); - } - - template - std::pair, rmm::device_vector> - combine_local_src_nbr_cluster_weights(hash_t hasher, - compare_t compare, - skip_edge_t skip_edge, - weight_t const *d_weights, - count_t num_weights) - { - rmm::device_vector relevant_edges_v; - rmm::device_vector relevant_edge_weights_v; - - if (num_weights > 0) { - std::size_t capacity{static_cast(num_weights / 0.7)}; - - cuco::static_map hash_map( - capacity, std::numeric_limits::max(), count_t{0}); - detail::create_cuco_pair_t create_cuco_pair; - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - hash_map.insert(thrust::make_transform_iterator(thrust::make_counting_iterator(0), - create_cuco_pair), - thrust::make_transform_iterator( - thrust::make_counting_iterator(num_weights), create_cuco_pair), - hasher, - compare); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - relevant_edges_v.resize(num_weights); - - relevant_edges_v = detail::remove_elements_from_vector( - relevant_edges_v, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_weights), - [d_hash_map = hash_map.get_device_view(), hasher, compare] __device__(count_t idx) { - auto pos = d_hash_map.find(idx, hasher, compare); - return (pos->first == idx); - }, - stream_); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - relevant_edges_v.size(), - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - d_relevant_edges = relevant_edges_v.data().get()] __device__(count_t idx) mutable { - count_t edge_id = d_relevant_edges[idx]; - auto pos = d_hash_map.find(edge_id, hasher, compare); - pos->second.store(idx); - }); - - relevant_edge_weights_v.resize(relevant_edges_v.size()); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), - relevant_edge_weights_v.begin(), - relevant_edge_weights_v.end(), - weight_t{0}); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - num_weights, - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - skip_edge, - d_relevant_edge_weights = relevant_edge_weights_v.data().get(), - d_weights] __device__(count_t idx) { - if (!skip_edge(idx)) { - auto pos = d_hash_map.find(idx, hasher, compare); - if (pos != d_hash_map.end()) { - atomicAdd(d_relevant_edge_weights + pos->second.load(cuda::std::memory_order_relaxed), - d_weights[idx]); - } - } - }); - } + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); - return 
std::make_pair(relevant_edges_v, relevant_edge_weights_v); - } + std::tie(cluster_keys_v_, cluster_weights_v_) = + cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( + handle_, + current_graph_view_, + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0), + d_src_cluster_cache_, + [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, + weight_t{0}); #endif + } void shrink_graph() { timer_start("shrinking graph"); - rmm::device_uvector numbering_map(0, stream_); + rmm::device_uvector numbering_map(0, handle_.get_stream()); std::tie(current_graph_, numbering_map) = coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); current_graph_view_ = current_graph_->view(); - local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - local_num_rows_ = current_graph_view_.get_number_of_local_adj_matrix_partition_rows(); - local_num_cols_ = current_graph_view_.get_number_of_local_adj_matrix_partition_cols(); - base_vertex_id_ = current_graph_view_.get_local_vertex_first(); - - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - current_graph_view_.get_number_of_local_adj_matrix_partitions()), - [this](auto indx) { - return current_graph_view_.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), numbering_indices.begin(), numbering_indices.end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); relabel( handle_, std::make_tuple(static_cast(numbering_map.begin()), static_cast(numbering_indices.begin())), - local_num_vertices_, + current_graph_view_.get_number_of_local_vertices(), dendrogram_->current_level_begin(), dendrogram_->current_level_size()); - timer_stop(stream_); + timer_stop(handle_.get_stream()); } protected: raft::handle_t const &handle_; - cudaStream_t stream_; std::unique_ptr> dendrogram_; - vertex_t number_of_vertices_; - vertex_t base_vertex_id_{0}; - vertex_t base_src_vertex_id_{0}; - vertex_t base_dst_vertex_id_{0}; - int rank_{0}; - - vertex_t local_num_vertices_; - vertex_t local_num_rows_; - vertex_t local_num_cols_; - edge_t local_num_edges_; - // - // Copy of graph + // Initially we run on the input graph view, + // but as we shrink the graph we'll keep the + // current graph here // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // - // For partitioning - // - detail::compute_partition_t compute_partition_; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector src_vertex_weights_cache_v_; + rmm::device_uvector src_cluster_cache_v_; + rmm::device_uvector dst_cluster_cache_v_; + rmm::device_uvector cluster_keys_v_; + rmm::device_uvector cluster_weights_v_; - rmm::device_vector src_indices_v_; - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v_; - rmm::device_vector src_vertex_weights_cache_v_{}; - rmm::device_vector dst_vertex_weights_cache_v_{}; - - rmm::device_vector cluster_weights_v_; - 
rmm::device_vector src_cluster_weights_cache_v_{}; - rmm::device_vector dst_cluster_weights_cache_v_{}; - - rmm::device_vector src_cluster_cache_v_{}; - rmm::device_vector dst_cluster_cache_v_{}; - - rmm::device_vector empty_cache_weight_v_{}; + weight_t *d_src_vertex_weights_cache_; + vertex_t *d_src_cluster_cache_; + vertex_t *d_dst_cluster_cache_; #ifdef TIMING HighResTimer hr_timer_; #endif -}; // namespace experimental +}; } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/random_walks.cuh b/cpp/src/experimental/random_walks.cuh new file mode 100644 index 00000000000..aea8f3d8420 --- /dev/null +++ b/cpp/src/experimental/random_walks.cuh @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Andrei Schaffer, aschaffer@nvidia.com +// +#pragma once + +#include + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +using device_vec_t = rmm::device_uvector; + +template +using device_v_it = typename device_vec_t::iterator; + +template +value_t* raw_ptr(device_vec_t& dv) +{ + return dv.data(); +} + +template +value_t const* raw_const_ptr(device_vec_t const& dv) +{ + return dv.data(); +} + +template +struct device_const_vector_view { + device_const_vector_view(value_t const* d_buffer, index_t size) : d_buffer_(d_buffer), size_(size) + { + } + + device_const_vector_view(device_const_vector_view const& other) = delete; + device_const_vector_view& operator=(device_const_vector_view const& other) = delete; + + device_const_vector_view(device_const_vector_view&& other) + { + d_buffer_ = other.d_buffer_; + size_ = other.size_; + } + device_const_vector_view& operator=(device_const_vector_view&& other) + { + d_buffer_ = other.d_buffer_; + size_ = other.size_; + + return *this; + } + + value_t const* begin(void) const { return d_buffer_; } + + value_t const* end() const { return d_buffer_ + size_; } + + index_t size(void) const { return size_; } + + private: + value_t const* d_buffer_{nullptr}; + index_t size_; +}; + +// raft random generator: +// (using upper-bound cached "map" +// giving out_deg(v) for each v in [0, |V|); +// and a pre-generated vector of float random values +// in [0,1] to be brought into [0, d_ub[v])) +// +template +struct rrandom_gen_t { + using seed_type = seed_t; + using real_type = real_t; + + rrandom_gen_t(raft::handle_t const& handle, + index_t num_paths, + device_vec_t& d_random, // scratch-pad, non-coalesced + device_vec_t const& d_crt_out_deg, // non-coalesced + seed_t seed = seed_t{}) + : handle_(handle), + seed_(seed), + num_paths_(num_paths), + d_ptr_out_degs_(raw_const_ptr(d_crt_out_deg)), + d_ptr_random_(raw_ptr(d_random)) + { + auto rnd_sz = d_random.size(); + + CUGRAPH_EXPECTS(rnd_sz >= 
static_cast(num_paths), + "Un-allocated random buffer."); + + // done in constructor; + // this must be done at each step, + // but this object is constructed at each step; + // + raft::random::Rng rng(seed_); + rng.uniform( + d_ptr_random_, num_paths, real_t{0.0}, real_t{1.0}, handle.get_stream()); + } + + // in place: + // for each v in [0, num_paths) { + // if out_deg(v) > 0 + // d_col_indx[v] = random index in [0, out_deg(v)) + //} + void generate_col_indices(device_vec_t& d_col_indx) const + { + thrust::transform_if( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_ptr_random_, + d_ptr_random_ + num_paths_, // input1 + d_ptr_out_degs_, // input2 + d_ptr_out_degs_, // also stencil + d_col_indx.begin(), + [] __device__(real_t rnd_vindx, edge_t crt_out_deg) { + real_t max_ub = static_cast(crt_out_deg - 1); + auto interp_vindx = rnd_vindx * max_ub + real_t{.5}; + vertex_t v_indx = static_cast(interp_vindx); + return (v_indx >= crt_out_deg ? crt_out_deg - 1 : v_indx); + }, + [] __device__(auto crt_out_deg) { return crt_out_deg > 0; }); + } + + private: + raft::handle_t const& handle_; + index_t num_paths_; + edge_t const* d_ptr_out_degs_; // device buffer with out-deg of current set of vertices (most + // recent vertex in each path); size = num_paths_ + real_t* d_ptr_random_; // device buffer with real random values; size = num_paths_ + seed_t seed_; // seed to be used for current batch +}; + +// seeding policy: time (clock) dependent, +// to avoid RW calls repeating same random data: +// +template +struct clock_seeding_t { + clock_seeding_t(void) = default; + + seed_t operator()(void) { return static_cast(std::time(nullptr)); } +}; + +// seeding policy: fixed for debug/testing repro +// +template +struct fixed_seeding_t { + // purposely no default cnstr. 
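+ // (requiring an explicit seed keeps runs exactly reproducible, which is
+ // the point of this policy; clock_seeding_t above is the non-repro one)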
+ + fixed_seeding_t(seed_t seed) : seed_(seed) {} + seed_t operator()(void) { return seed_; } + + private: + seed_t seed_; +}; + +// classes abstracting the next vertex extraction mechanism: +// +// primary template, purposely undefined +template +struct col_indx_extract_t; + +// specialization for single-gpu functionality: +// +template +struct col_indx_extract_t> { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + + col_indx_extract_t(raft::handle_t const& handle, + device_vec_t const& d_indices, + device_vec_t const& d_offsets, + device_vec_t const& d_values, + device_vec_t const& d_crt_out_degs, + device_vec_t const& d_sizes, + index_t num_paths, + index_t max_depth) + : handle_(handle), + col_indices_(raw_const_ptr(d_indices)), + row_offsets_(raw_const_ptr(d_offsets)), + values_(raw_const_ptr(d_values)), + out_degs_(raw_const_ptr(d_crt_out_degs)), + sizes_(raw_const_ptr(d_sizes)), + num_paths_(num_paths), + max_depth_(max_depth) + { + } + + col_indx_extract_t(raft::handle_t const& handle, + vertex_t const* p_d_indices, + edge_t const* p_d_offsets, + weight_t const* p_d_values, + edge_t const* p_d_crt_out_degs, + index_t const* p_d_sizes, + index_t num_paths, + index_t max_depth) + : handle_(handle), + col_indices_(p_d_indices), + row_offsets_(p_d_offsets), + values_(p_d_values), + out_degs_(p_d_crt_out_degs), + sizes_(p_d_sizes), + num_paths_(num_paths), + max_depth_(max_depth) + { + } + + col_indx_extract_t(raft::handle_t const& handle, + graph_t const& graph, + edge_t const* p_d_crt_out_degs, + index_t const* p_d_sizes, + index_t num_paths, + index_t max_depth) + : handle_(handle), + col_indices_(graph.indices()), + row_offsets_(graph.offsets()), + values_(graph.weights()), + out_degs_(p_d_crt_out_degs), + sizes_(p_d_sizes), + num_paths_(num_paths), + max_depth_(max_depth) + { + } + + // in-place extractor of next set of vertices and weights, + // (d_v_next_vertices, d_v_next_weights), + // given start set of vertices. 
d_v_src_vertices, + // and corresponding column index set, d_v_col_indx: + // + // for each indx in [0, num_paths){ + // v_indx = d_v_src_vertices[indx*max_depth + d_sizes[indx] - 1]; + // if( out_degs_[v_indx] > 0 ) { + // start_row = row_offsets_[v_indx]; + // delta = d_v_col_indx[indx]; + // d_v_next_vertices[indx] = col_indices_[start_row + delta]; + // } + // (use tranform_if() with transform iterator) + // + void operator()( + device_vec_t const& d_coalesced_src_v, // in: coalesced vector of vertices + device_vec_t const& + d_v_col_indx, // in: column indices, given by stepper's random engine + device_vec_t& d_v_next_vertices, // out: set of destination vertices, for next step + device_vec_t& + d_v_next_weights) // out: set of weights between src and destination vertices, for next step + const + { + thrust::transform_if( + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths_), // input1 + d_v_col_indx.begin(), // input2 + out_degs_, // stencil + thrust::make_zip_iterator( + thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())), // output + [max_depth = max_depth_, + ptr_d_sizes = sizes_, + ptr_d_coalesced_v = raw_const_ptr(d_coalesced_src_v), + row_offsets = row_offsets_, + col_indices = col_indices_, + values = values_] __device__(auto indx, auto col_indx) { + auto delta = ptr_d_sizes[indx] - 1; + auto v_indx = ptr_d_coalesced_v[indx * max_depth + delta]; + auto start_row = row_offsets[v_indx]; + return thrust::make_tuple(col_indices[start_row + col_indx], values[start_row + col_indx]); + }, + [] __device__(auto crt_out_deg) { return crt_out_deg > 0; }); + } + + private: + raft::handle_t const& handle_; + vertex_t const* col_indices_; + edge_t const* row_offsets_; + weight_t const* values_; + + edge_t const* out_degs_; + index_t const* sizes_; + index_t num_paths_; + index_t max_depth_; +}; + +/** + * @brief Class abstracting the RW initialization, stepping, and stopping functionality + * The outline of the algorithm is as follows: + * + * (1) vertex sets are coalesced into d_coalesced_v, + * weight sets are coalesced into d_coalesced_w; + * i.e., the 2 coalesced vectors are allocated to + * num_paths * max_depth, and num_paths * (max_depth -1), respectively + * (since each path has a number of edges equal one + * less than the number of vertices); + * d_coalesced_v is initialized for each i*max_depth entry + * (i=0,,,,num_paths-1) to the corresponding starting vertices; + * (2) d_sizes maintains the current size is for each path; + * Note that a path may end prematurely if it reaches a sink vertex; + * (3) d_crt_out_degs maintains the out-degree of each of the latest + * vertices in the path; i.e., if N(v) := set of destination + * vertices from v, then this vector stores |N(v)| + * for last v in each path; i.e., + * d_crt_out_degs[i] = + * out-degree( d_coalesced_v[i*max_depth + d_sizes[i]-1] ), + * for i in {0,..., num_paths-1}; + * (4) a set of num_paths floating point numbers between [0,1] + * are generated at each step; then they get translated into + * _indices_ k in {0,...d_crt_out_degs[i]-1}; + * (5) the next vertex v is then picked as the k-th out-neighbor: + * next(v) = N(v)[k]; + * (6) d_sizes are incremented accordingly; i.e., for those paths whose + * corresponding last vertex has out-degree > 0; + * (7) then next(v) and corresponding weight of (v, next(v)) are stored + * at appropriate location in their corresponding coalesced vectors; + * (8) the client of 
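// A minimal host-side sketch of the CSR lookup inside the extractor lambda
// above: the col_indx-th out-edge of vertex v lives at row_offsets[v] + col_indx.
// Hypothetical toy graph; std::vector stands in for device memory.
#include <cassert>
#include <vector>

int main()
{
  // Toy CSR: 0 -> {1, 2}, 1 -> {2}, 2 -> {} (a sink).
  std::vector<int>   row_offsets{0, 2, 3, 3};
  std::vector<int>   col_indices{1, 2, 2};
  std::vector<float> values{0.5f, 1.5f, 2.5f};

  int v = 0, col_indx = 1;  // pick the 2nd out-edge of vertex 0
  int start_row = row_offsets[v];
  assert(col_indices[start_row + col_indx] == 2);  // next vertex in the path
  assert(values[start_row + col_indx] == 1.5f);    // weight of (v, next)
  return 0;
}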
this class (the random_walks() function) then repeats + * this process max_depth times or until all paths + * have reached sinks; i.e., d_crt_out_degs = {0, 0,...,0}, + * whichever comes first; + * (9) in the end some post-processing is done (stop()) to remove + * unused entries from the 2 coalesced vectors; + * (10) the triplet made of the 2 coalesced vectors and d_sizes is then returned; + * + */ +template , + typename index_t = typename graph_t::edge_type> +struct random_walker_t { + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using seed_t = typename random_engine_t::seed_type; + using real_t = typename random_engine_t::real_type; + + random_walker_t(raft::handle_t const& handle, + graph_t const& graph, + index_t num_paths, + index_t max_depth) + : handle_(handle), + num_paths_(num_paths), + max_depth_(max_depth), + d_cached_out_degs_(graph.compute_out_degrees(handle_)) + { + } + + // for each i in [0..num_paths_) { + // d_paths_v_set[i*max_depth] = d_src_init_v[i]; + // + void start(device_const_vector_view& d_src_init_v, // in: start set + device_vec_t& d_paths_v_set, // out: coalesced v + device_vec_t& d_sizes) const // out: init sizes to {1,...} + { + // intialize path sizes to 1, as they contain at least one vertex each: + // the initial set: d_src_init_v; + // + thrust::copy_n(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + thrust::make_constant_iterator(1), + num_paths_, + d_sizes.begin()); + + // scatter d_src_init_v to coalesced vertex vector: + // + auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::scatter(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_src_init_v.begin(), + d_src_init_v.end(), + map_it_begin, + d_paths_v_set.begin()); + } + + // overload for start() with device_uvector d_v_start + // (handy for testing) + // + void start(device_vec_t const& d_start, // in: start set + device_vec_t& d_paths_v_set, // out: coalesced v + device_vec_t& d_sizes) const // out: init sizes to {1,...} + { + device_const_vector_view d_start_cview{d_start.data(), + static_cast(d_start.size())}; + + start(d_start_cview, d_paths_v_set, d_sizes); + } + + // in-place updates its arguments from one step to next + // (to avoid copying); all "crt" arguments are updated at each step() + // and passed as scratchpad space to avoid copying them + // from one step to another + // + // take one step in sync for all paths that have not reached sinks: + // + void step( + graph_t const& graph, + seed_t seed, + device_vec_t& d_coalesced_v, // crt coalesced vertex set + device_vec_t& d_coalesced_w, // crt coalesced weight set + device_vec_t& d_paths_sz, // crt paths sizes + device_vec_t& d_crt_out_degs, // crt out-degs for current set of vertices + device_vec_t& d_random, // crt set of random real values + device_vec_t& d_col_indx, // crt col col indices to be used for retrieving next step + device_vec_t& d_next_v, // crt set of destination vertices, for next step + device_vec_t& d_next_w) + const // set of weights between src and destination vertices, for next step + { + // update crt snapshot of out-degs, + // from cached out degs, using + // latest vertex in each path as source: + // + gather_from_coalesced( + d_coalesced_v, d_cached_out_degs_, d_paths_sz, 
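// A minimal host-side sketch of start() and the coalesced layout described in
// the comment block above: path i owns slots [i * max_depth, (i + 1) * max_depth)
// of the vertex vector, is seeded at its first slot, and starts with size 1.
// Names and values are hypothetical.
#include <cassert>
#include <vector>

int main()
{
  int num_paths = 3, max_depth = 4;
  std::vector<int> d_start{7, 0, 5};
  std::vector<int> coalesced_v(num_paths * max_depth, -1);  // -1 marks unused slots
  std::vector<int> sizes(num_paths, 1);  // every path holds its seed vertex

  for (int i = 0; i < num_paths; ++i)
    coalesced_v[i * max_depth] = d_start[i];  // the scatter with map i -> i * stride

  // coalesced_v == {7,-1,-1,-1,  0,-1,-1,-1,  5,-1,-1,-1}
  assert(coalesced_v[0] == 7 && coalesced_v[4] == 0 && coalesced_v[8] == 5);
  return 0;
}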
d_crt_out_degs, max_depth_, num_paths_); + + // generate random destination indices: + // + random_engine_t rgen(handle_, num_paths_, d_random, d_crt_out_degs, seed); + + rgen.generate_col_indices(d_col_indx); + + // dst extraction from dst indices: + // + col_indx_extract_t col_extractor(handle_, + graph, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_paths_sz), + num_paths_, + max_depth_); + + // The following steps update the next entry in each path, + // except the paths that reached sinks; + // + // for each indx in [0..num_paths) { + // v_indx = d_v_rnd_n_indx[indx]; + // + // -- get the `v_indx`-th out-vertex of d_v_paths_v_set[indx] vertex: + // -- also, note the size deltas increased by 1 in dst (d_sizes[]): + // + // d_coalesced_v[indx*num_paths + d_sizes[indx]] = + // get_out_vertex(graph, d_coalesced_v[indx*num_paths + d_sizes[indx] -1)], v_indx); + // d_coalesced_w[indx*(num_paths-1) + d_sizes[indx] - 1] = + // get_out_edge_weight(graph, d_coalesced_v[indx*num_paths + d_sizes[indx]-2], v_indx); + // + // (1) generate actual vertex destinations: + // + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + // (2) update path sizes: + // + update_path_sizes(d_crt_out_degs, d_paths_sz); + + // (3) actual coalesced updates: + // + scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_paths_sz); + scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_paths_sz); + } + + // returns true if all paths reached sinks: + // + bool all_paths_stopped(device_vec_t const& d_crt_out_degs) const + { + auto how_many_stopped = + thrust::count_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_crt_out_degs.begin(), + d_crt_out_degs.end(), + [] __device__(auto crt_out_deg) { return crt_out_deg == 0; }); + return (static_cast(how_many_stopped) == d_crt_out_degs.size()); + } + + // wrap-up, post-process: + // truncate v_set, w_set to actual space used + // + void stop(device_vec_t& d_coalesced_v, // coalesced vertex set + device_vec_t& d_coalesced_w, // coalesced weight set + device_vec_t const& d_sizes) const // paths sizes + { + assert(max_depth_ > 1); // else, no need to step; and no edges + + index_t const* ptr_d_sizes = d_sizes.data(); + + auto predicate_v = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) { + auto row_indx = indx / max_depth; + auto col_indx = indx % max_depth; + + return (col_indx >= ptr_d_sizes[row_indx]); + }; + + auto predicate_w = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) { + auto row_indx = indx / (max_depth - 1); + auto col_indx = indx % (max_depth - 1); + + return (col_indx >= ptr_d_sizes[row_indx] - 1); + }; + + auto new_end_v = + thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_coalesced_v.begin(), + d_coalesced_v.end(), + thrust::make_counting_iterator(0), + predicate_v); + + auto new_end_w = + thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_coalesced_w.begin(), + d_coalesced_w.end(), + thrust::make_counting_iterator(0), + predicate_w); + + CUDA_TRY(cudaStreamSynchronize(handle_.get_stream())); + + d_coalesced_v.resize(thrust::distance(d_coalesced_v.begin(), new_end_v), handle_.get_stream()); + d_coalesced_w.resize(thrust::distance(d_coalesced_w.begin(), new_end_w), handle_.get_stream()); + } + + // in-place non-static (needs handle_): + // for indx in [0, nelems): + // gather d_result[indx] = d_src[d_coalesced[indx*stride + d_sizes[indx] -1]] + // + template + void gather_from_coalesced( + device_vec_t const& 
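// A minimal host-side sketch of the stop() compaction above: keep a vertex
// slot only while its column index is below the path's size (the complement
// of the remove_if predicate); a path of s vertices keeps s - 1 weights.
// Hypothetical values; std::vector stands in for the device buffers.
#include <cassert>
#include <vector>

int main()
{
  int max_depth = 3;
  std::vector<int> sizes{2, 3};           // path 0 hit a sink after 2 vertices
  std::vector<int> v{7, 4, -1, 0, 5, 9};  // num_paths * max_depth slots

  std::vector<int> compact;
  for (int i = 0; i < static_cast<int>(v.size()); ++i) {
    int row = i / max_depth, col = i % max_depth;
    if (col < sizes[row]) compact.push_back(v[i]);
  }
  assert((compact == std::vector<int>{7, 4, 0, 5, 9}));
  return 0;
}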
d_coalesced, // |gather map| = stride*nelems + device_vec_t const& d_src, // |gather input| = nelems + device_vec_t const& d_sizes, // |paths sizes| = nelems, elems in [1, stride] + device_vec_t& d_result, // |output| = nelems + index_t stride, // stride = coalesce block size (typically max_depth) + index_t nelems) const // nelems = number of elements to gather (typically num_paths_) + { + vertex_t const* ptr_d_coalesced = raw_const_ptr(d_coalesced); + index_t const* ptr_d_sizes = raw_const_ptr(d_sizes); + + // delta = ptr_d_sizes[indx] - 1 + // + auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - 1; + return ptr_d_coalesced[indx * stride + delta]; + }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::gather(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + map_it_begin, + map_it_begin + nelems, + d_src.begin(), + d_result.begin()); + } + + // in-place non-static (needs handle_); + // pre-condition: path sizes are assumed updated + // to reflect new vertex additions; + // + // for indx in [0, nelems): + // if ( d_crt_out_degs[indx] > 0 ) + // d_coalesced[indx*stride + (d_sizes[indx] - adjust)- 1] = d_src[indx] + // + // adjust := 0 for coalesced vertices; 1 for weights + // (because |edges| = |vertices| - 1, in each path); + // + template + void scatter_to_coalesced( + device_vec_t const& d_src, // |scatter input| = nelems + device_vec_t& d_coalesced, // |scatter input| = stride*nelems + device_vec_t const& d_crt_out_degs, // |current set of vertex out degrees| = nelems, + // to be used as stencil (don't scatter if 0) + device_vec_t const& + d_sizes, // paths sizes used to provide delta in coalesced paths; + // pre-condition: assumed as updated to reflect new vertex additions; + // also, this is the number of _vertices_ in each path; + // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter + index_t + stride, // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights) + index_t nelems, // nelems = number of elements to gather (typically num_paths_) + index_t adjust = 0) + const // adjusting parameter for scattering vertices (0) or weights (1); see above for more; + { + index_t const* ptr_d_sizes = raw_const_ptr(d_sizes); + + auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - adjust - 1; + return indx * stride + delta; + }; + + // use the transform iterator as map: + // + auto map_it_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), dlambda); + + thrust::scatter_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + d_src.begin(), + d_src.end(), + map_it_begin, + d_crt_out_degs.begin(), + d_coalesced.begin(), + [] __device__(auto crt_out_deg) { + return crt_out_deg > 0; // predicate + }); + } + + // updates the entries in the corresponding coalesced vector, + // for which out_deg > 0 + // + void scatter_vertices(device_vec_t const& d_src, + device_vec_t& d_coalesced, + device_vec_t const& d_crt_out_degs, + device_vec_t const& d_sizes) const + { + scatter_to_coalesced(d_src, d_coalesced, d_crt_out_degs, d_sizes, max_depth_, num_paths_); + } + // + void scatter_weights(device_vec_t const& d_src, + device_vec_t& d_coalesced, + device_vec_t const& d_crt_out_degs, + device_vec_t const& d_sizes) const + { + scatter_to_coalesced( + d_src, d_coalesced, 
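// A minimal arithmetic check of the scatter_to_coalesced() destination index
// used above: with sizes already incremented, the new vertex of path i lands
// at i * max_depth + (sizes[i] - 1), and its weight (adjust == 1, stride one
// column narrower) at i * (max_depth - 1) + (sizes[i] - 2). Hypothetical values.
#include <cassert>

int main()
{
  int max_depth = 4;
  int i = 2, size_after_step = 3;  // path 2 now holds 3 vertices

  int v_slot = i * max_depth + (size_after_step - 0 - 1);        // adjust == 0
  int w_slot = i * (max_depth - 1) + (size_after_step - 1 - 1);  // adjust == 1
  assert(v_slot == 10);  // 2 * 4 + 2
  assert(w_slot == 7);   // 2 * 3 + 1
  return 0;
}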
d_crt_out_degs, d_sizes, max_depth_ - 1, num_paths_, 1);
+  }
+
+  // in-place update (increment) path sizes for paths
+  // that have not reached a sink; i.e., for which
+  // d_crt_out_degs[indx] > 0:
+  //
+  void update_path_sizes(device_vec_t<edge_t> const& d_crt_out_degs,
+                         device_vec_t<index_t>& d_sizes) const
+  {
+    thrust::transform_if(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      d_sizes.begin(),
+      d_sizes.end(),           // input
+      d_crt_out_degs.begin(),  // stencil
+      d_sizes.begin(),         // output: in-place
+      [] __device__(auto crt_sz) { return crt_sz + 1; },
+      [] __device__(auto crt_out_deg) { return crt_out_deg > 0; });
+  }
+
+  device_vec_t<edge_t> const& get_out_degs(void) const { return d_cached_out_degs_; }
+
+ private:
+  raft::handle_t const& handle_;
+  index_t num_paths_;
+  index_t max_depth_;
+  device_vec_t<edge_t> d_cached_out_degs_;
+};
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Single-GPU specialization.
+ *
+ * @tparam graph_t Type of graph (view).
+ * @tparam random_engine_t Type of random engine used to generate RW.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph object to generate RW on.
+ * @param d_v_start Device (view) set of starting vertex indices for the RW.
+ * number(paths) == d_v_start.size().
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>, seed> Quadruplet of coalesced RW paths, with corresponding edge weights
+ * for each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed
+ * to the Python layer. Also returning seed for testing / debugging repro. The meaning of
+ * "coalesced" here is that a 2D array of paths of different sizes is represented as a 1D array.
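+ *
+ * A hypothetical decoding of the returned quadruplet, to make the layout
+ * concrete (sizes = {2, 3} is an assumed example, not produced by this code):
+ * @code
+ *   // paths_v = {v00, v01,  v10, v11, v12}  // path p starts at sum(sizes[0..p))
+ *   // paths_w = {w00,       w10, w11}       // weight offset: sum(sizes[0..p)) - p
+ *   // sizes   = {2, 3}
+ * @endcode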
+ */ +template , + typename seeding_policy_t = clock_seeding_t, + typename index_t = typename graph_t::edge_type> +std::enable_if_t, + device_vec_t, + device_vec_t, + typename random_engine_t::seed_type>> +random_walks_impl(raft::handle_t const& handle, + graph_t const& graph, + device_const_vector_view& d_v_start, + index_t max_depth, + seeding_policy_t seeder = clock_seeding_t{}) +{ + using vertex_t = typename graph_t::vertex_type; + using edge_t = typename graph_t::edge_type; + using weight_t = typename graph_t::weight_type; + using seed_t = typename random_engine_t::seed_type; + using real_t = typename random_engine_t::real_type; + + vertex_t num_vertices = graph.get_number_of_vertices(); + + auto how_many_valid = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_v_start.begin(), + d_v_start.end(), + [num_vertices] __device__(auto crt_vertex) { + return (crt_vertex >= 0) && (crt_vertex < num_vertices); + }); + + CUGRAPH_EXPECTS(static_cast(how_many_valid) == d_v_start.size(), + "Invalid set of starting vertices."); + + auto num_paths = d_v_start.size(); + auto stream = handle.get_stream(); + + random_walker_t rand_walker{ + handle, graph, static_cast(num_paths), static_cast(max_depth)}; + + // pre-allocate num_paths * max_depth; + // + auto coalesced_sz = num_paths * max_depth; + device_vec_t d_coalesced_v(coalesced_sz, stream); // coalesced vertex set + device_vec_t d_coalesced_w(coalesced_sz, stream); // coalesced weight set + device_vec_t d_paths_sz(num_paths, stream); // paths sizes + device_vec_t d_crt_out_degs(num_paths, stream); // out-degs for current set of vertices + device_vec_t d_random(num_paths, stream); + device_vec_t d_col_indx(num_paths, stream); + device_vec_t d_next_v(num_paths, stream); + device_vec_t d_next_w(num_paths, stream); + + // abstracted out seed initialization: + // + seed_t seed0 = static_cast(seeder()); + + // very first vertex, for each path: + // + rand_walker.start(d_v_start, d_coalesced_v, d_paths_sz); + + // start from 1, as 0-th was initialized above: + // + for (decltype(max_depth) step_indx = 1; step_indx < max_depth; ++step_indx) { + // take one-step in-sync for each path in parallel: + // + rand_walker.step(graph, + seed0 + static_cast(step_indx), + d_coalesced_v, + d_coalesced_w, + d_paths_sz, + d_crt_out_degs, + d_random, + d_col_indx, + d_next_v, + d_next_w); + + // early exit: all paths have reached sinks: + // + if (rand_walker.all_paths_stopped(d_crt_out_degs)) break; + } + + // wrap-up, post-process: + // truncate v_set, w_set to actual space used + // + rand_walker.stop(d_coalesced_v, d_coalesced_w, d_paths_sz); + + // because device_uvector is not copy-cnstr-able: + // + return std::make_tuple(std::move(d_coalesced_v), + std::move(d_coalesced_w), + std::move(d_paths_sz), + seed0); // also return seed for repro +} + +/** + * @brief returns random walks (RW) from starting sources, where each path is of given maximum + * length. Multi-GPU specialization. + * + * @tparam graph_t Type of graph (view). + * @tparam random_engine_t Type of random engine used to generate RW. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph Graph object to generate RW on. + * @param d_v_start Device (view) set of starting vertex indices for the RW. number(RW) == + * d_v_start.size(). + * @param max_depth maximum length of RWs. 
+ * @return std::tuple, device_vec_t, + * device_vec_t, seed> Quadruplet of coalesced RW paths, with corresponding edge weights + * for each, and coresponding path sizes. This is meant to minimize the number of DF's to be passed + * to the Python layer. Also returning seed for testing / debugging repro. The meaning of + * "coalesced" here is that a 2D array of paths of different sizes is represented as a 1D array. + */ +template , + typename seeding_policy_t = clock_seeding_t, + typename index_t = typename graph_t::edge_type> +std::enable_if_t, + device_vec_t, + device_vec_t, + typename random_engine_t::seed_type>> +random_walks_impl(raft::handle_t const& handle, + graph_t const& graph, + device_const_vector_view& d_v_start, + index_t max_depth, + seeding_policy_t seeder = clock_seeding_t{}) +{ + CUGRAPH_FAIL("Not implemented yet."); +} + +} // namespace detail + +/** + * @brief returns random walks (RW) from starting sources, where each path is of given maximum + * length. Uniform distribution is assumed for the random engine. + * + * @tparam graph_t Type of graph (view). + * @tparam index_t Type used to store indexing and sizes. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph Graph object to generate RW on. + * @param ptr_d_start Device pointer to set of starting vertex indices for the RW. + * @param num_paths = number(paths). + * @param max_depth maximum length of RWs. + * @return std::tuple, device_vec_t, + * device_vec_t> Triplet of coalesced RW paths, with corresponding edge weights for + * each, and coresponding path sizes. This is meant to minimize the number of DF's to be passed to + * the Python layer. + */ +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector> +random_walks(raft::handle_t const& handle, + graph_t const& graph, + typename graph_t::vertex_type const* ptr_d_start, + index_t num_paths, + index_t max_depth) +{ + using vertex_t = typename graph_t::vertex_type; + + // 0-copy const device view: + // + detail::device_const_vector_view d_v_start{ptr_d_start, num_paths}; + + auto quad_tuple = detail::random_walks_impl(handle, graph, d_v_start, max_depth); + // ignore last element of the quad, seed, + // since it's meant for testing / debugging, only: + // + return std::make_tuple(std::move(std::get<0>(quad_tuple)), + std::move(std::get<1>(quad_tuple)), + std::move(std::get<2>(quad_tuple))); +} +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu index 62bd6951f71..8d8fb0322a8 100644 --- a/cpp/src/experimental/relabel.cu +++ b/cpp/src/experimental/relabel.cu @@ -42,6 +42,7 @@ namespace cugraph { namespace experimental { +// FIXME: think about requiring old_new_label_pairs to be pre-shuffled template void relabel(raft::handle_t const& handle, std::tuple old_new_label_pairs, @@ -120,7 +121,12 @@ void relabel(raft::handle_t const& handle, handle.get_stream())); // cuco::static_map currently does not take stream cuco::static_map relabel_map{ - static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(rx_label_pair_old_labels.size()) / load_factor), + rx_label_pair_old_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; @@ -130,7 +136,11 @@ void relabel(raft::handle_t const& handle, [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the + // grid size is 0; this leads to cudaErrorInvaildConfiguration. + if (rx_label_pair_old_labels.size() > 0) { + relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size()); + } rx_label_pair_old_labels.resize(0, handle.get_stream()); rx_label_pair_new_labels.resize(0, handle.get_stream()); @@ -152,19 +162,29 @@ void relabel(raft::handle_t const& handle, CUDA_TRY(cudaStreamSynchronize( handle.get_stream())); // cuco::static_map currently does not take stream - relabel_map.find( - rx_unique_old_labels.begin(), - rx_unique_old_labels.end(), - rx_unique_old_labels - .begin()); // now rx_unique_old_lables hold new labels for the corresponding old labels + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the + // grid size is 0; this leads to cudaErrorInvaildConfiguration. + if (rx_unique_old_labels.size() > 0) { + relabel_map.find( + rx_unique_old_labels.begin(), + rx_unique_old_labels.end(), + rx_unique_old_labels.begin()); // now rx_unique_old_lables hold new labels for the + // corresponding old labels + } std::tie(new_labels_for_unique_old_labels, std::ignore) = shuffle_values( handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream()); } } + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + cuco::static_map relabel_map( - static_cast(static_cast(unique_old_labels.size()) / load_factor), + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(unique_old_labels.size()) / load_factor), + unique_old_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -175,11 +195,21 @@ void relabel(raft::handle_t const& handle, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - relabel_map.insert(pair_first, pair_first + unique_old_labels.size()); - relabel_map.find(labels, labels + num_labels, labels); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (unique_old_labels.size() > 0) { + relabel_map.insert(pair_first, pair_first + unique_old_labels.size()); + } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); } } else { cuco::static_map relabel_map( - static_cast(static_cast(num_label_pairs) / load_factor), + // FIXME: std::max(..., ...) 
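// A minimal host-side sketch of the std::max capacity workaround applied to
// the cuco::static_map constructions in this patch: size / load_factor is not
// enough by itself for tiny inputs, so capacity is clamped to at least
// size + 1 slots. Hypothetical helper; numbers follow the load_factor of 0.7
// used here.
#include <algorithm>
#include <cassert>
#include <cstddef>

std::size_t map_capacity(std::size_t n, double load_factor = 0.7)
{
  return std::max(static_cast<std::size_t>(static_cast<double>(n) / load_factor), n + 1);
}

int main()
{
  assert(map_capacity(0) == 1);      // an empty input would otherwise request 0 slots
  assert(map_capacity(1) == 2);      // 1 / 0.7 truncates back to 1 == n
  assert(map_capacity(100) == 142);  // large inputs: n / load_factor dominates
  return 0;
}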
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(num_label_pairs) / load_factor), + static_cast(num_label_pairs) + 1), invalid_vertex_id::value, invalid_vertex_id::value); @@ -190,8 +220,12 @@ void relabel(raft::handle_t const& handle, return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - relabel_map.insert(pair_first, pair_first + num_label_pairs); - relabel_map.find(labels, labels + num_labels, labels); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_label_pairs > 0) { relabel_map.insert(pair_first, pair_first + num_label_pairs); } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_labels > 0) { relabel_map.find(labels, labels + num_labels, labels); } } if (do_expensive_check) { diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu index a8847167b87..127bd507271 100644 --- a/cpp/src/experimental/renumber_edgelist.cu +++ b/cpp/src/experimental/renumber_edgelist.cu @@ -50,62 +50,153 @@ rmm::device_uvector compute_renumber_map( raft::handle_t const& handle, vertex_t const* vertices, vertex_t num_local_vertices /* relevant only if vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { // FIXME: compare this sort based approach with hash based approach in both speed and memory // footprint // 1. 
acquire (unique major label, count) pairs - rmm::device_uvector tmp_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - tmp_labels.begin()); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end()); - rmm::device_uvector major_labels(tmp_labels.size(), handle.get_stream()); - rmm::device_uvector major_counts(major_labels.size(), handle.get_stream()); - auto major_pair_it = - thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - tmp_labels.begin(), - tmp_labels.end(), - thrust::make_constant_iterator(edge_t{1}), - major_labels.begin(), - major_counts.begin()); - tmp_labels.resize(0, handle.get_stream()); - tmp_labels.shrink_to_fit(handle.get_stream()); - major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(major_pair_it)), - handle.get_stream()); - major_counts.resize(major_labels.size(), handle.get_stream()); - major_labels.shrink_to_fit(handle.get_stream()); - major_counts.shrink_to_fit(handle.get_stream()); + rmm::device_uvector major_labels(0, handle.get_stream()); + rmm::device_uvector major_counts(0, handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector tmp_major_labels(0, handle.get_stream()); + rmm::device_uvector tmp_major_counts(0, handle.get_stream()); + { + rmm::device_uvector sorted_major_labels(edgelist_edge_counts[i], + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + sorted_major_labels.begin()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end()); + auto num_unique_labels = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_major_labels.size()), + [labels = sorted_major_labels.data()] __device__(auto i) { + return (i == 0) || (labels[i - 1] != labels[i]); + }); + tmp_major_labels.resize(num_unique_labels, handle.get_stream()); + tmp_major_counts.resize(tmp_major_labels.size(), handle.get_stream()); + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_labels.begin(), + sorted_major_labels.end(), + thrust::make_constant_iterator(edge_t{1}), + tmp_major_labels.begin(), + tmp_major_counts.begin()); + } + + if (multi_gpu) { + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector rx_major_labels(0, handle.get_stream()); + rmm::device_uvector rx_major_counts(0, handle.get_stream()); + auto rx_sizes = host_scalar_gather( + col_comm, tmp_major_labels.size(), static_cast(i), handle.get_stream()); + std::vector rx_displs{}; + if (static_cast(i) == col_comm_rank) { + rx_displs.assign(col_comm_size, size_t{0}); + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + rx_major_labels.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + rx_major_counts.resize(rx_major_labels.size(), handle.get_stream()); + } + device_gatherv(col_comm, + thrust::make_zip_iterator( + 
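// A minimal host-side sketch of step 1 above: per partition, sort the major
// vertices, then collapse equal runs into (unique label, occurrence count)
// pairs -- the serial analogue of the count_if + reduce_by_key sequence.
// Hypothetical input values.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  std::vector<int> majors{5, 2, 5, 5, 2, 9};
  std::sort(majors.begin(), majors.end());  // {2, 2, 5, 5, 5, 9}

  std::vector<int> labels, counts;
  for (std::size_t i = 0; i < majors.size(); ++i) {
    if (i == 0 || majors[i - 1] != majors[i]) {  // same test as the count_if lambda
      labels.push_back(majors[i]);
      counts.push_back(0);
    }
    ++counts.back();
  }
  assert((labels == std::vector<int>{2, 5, 9}));
  assert((counts == std::vector<int>{2, 3, 1}));
  return 0;
}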
thrust::make_tuple(tmp_major_labels.begin(), tmp_major_counts.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(rx_major_labels.begin(), rx_major_counts.begin())), + tmp_major_labels.size(), + rx_sizes, + rx_displs, + static_cast(i), + handle.get_stream()); + if (static_cast(i) == col_comm_rank) { + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin()); + major_labels.resize(rx_major_labels.size(), handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + auto pair_it = + thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_major_labels.begin(), + rx_major_labels.end(), + rx_major_counts.begin(), + major_labels.begin(), + major_counts.begin()); + major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(pair_it)), + handle.get_stream()); + major_counts.resize(major_labels.size(), handle.get_stream()); + major_labels.shrink_to_fit(handle.get_stream()); + major_counts.shrink_to_fit(handle.get_stream()); + } + } else { + tmp_major_labels.shrink_to_fit(handle.get_stream()); + tmp_major_counts.shrink_to_fit(handle.get_stream()); + major_labels = std::move(tmp_major_labels); + major_counts = std::move(tmp_major_counts); + } + } // 2. acquire unique minor labels - rmm::device_uvector minor_labels(num_edgelist_edges, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - minor_labels.begin()); + std::vector minor_displs(edgelist_minor_vertices.size(), edge_t{0}); + std::partial_sum( + edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); + rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), + handle.get_stream()); + for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + minor_labels.begin() + minor_displs[i]); + } thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), minor_labels.begin(), minor_labels.end()); - auto minor_label_it = - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - minor_labels.begin(), - minor_labels.end()); - minor_labels.resize(thrust::distance(minor_labels.begin(), minor_label_it), handle.get_stream()); + minor_labels.resize( + thrust::distance(minor_labels.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + minor_labels.begin(), + minor_labels.end())), + handle.get_stream()); + if (multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + + rmm::device_uvector rx_minor_labels(0, handle.get_stream()); + std::tie(rx_minor_labels, std::ignore) = groupby_gpuid_and_shuffle_values( + row_comm, + minor_labels.begin(), + minor_labels.end(), + [key_func = detail::compute_gpu_id_from_vertex_t{row_comm_size}] __device__( + auto val) { return key_func(val); }, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end()); + rx_minor_labels.resize( + thrust::distance( + rx_minor_labels.begin(), + 
thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_minor_labels.begin(), + rx_minor_labels.end())), + handle.get_stream()); + minor_labels = std::move(rx_minor_labels); + } minor_labels.shrink_to_fit(handle.get_stream()); // 3. merge major and minor labels and vertex labels rmm::device_uvector merged_labels(major_labels.size() + minor_labels.size(), handle.get_stream()); - rmm::device_uvector merged_counts(merged_labels.size(), handle.get_stream()); thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), major_labels.begin(), @@ -142,47 +233,7 @@ rmm::device_uvector compute_renumber_map( labels.shrink_to_fit(handle.get_stream()); counts.shrink_to_fit(handle.get_stream()); - // 4. if multi-GPU, shuffle and reduce (label, count) pairs - - if (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(labels.begin(), counts.begin())); - rmm::device_uvector rx_labels(0, handle.get_stream()); - rmm::device_uvector rx_counts(0, handle.get_stream()); - std::forward_as_tuple(std::tie(rx_labels, rx_counts), std::ignore) = - groupby_gpuid_and_shuffle_values( - comm, - pair_first, - pair_first + labels.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(thrust::get<0>(val)); }, - handle.get_stream()); - - labels.resize(rx_labels.size(), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin()); - pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - rx_labels.begin(), - rx_labels.end(), - rx_counts.begin(), - labels.begin(), - counts.begin()); - rx_labels.resize(0, handle.get_stream()); - rx_counts.resize(0, handle.get_stream()); - rx_labels.shrink_to_fit(handle.get_stream()); - rx_counts.shrink_to_fit(handle.get_stream()); - labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream()); - counts.resize(labels.size(), handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - labels.shrink_to_fit(handle.get_stream()); - } - - // 5. if vertices != nullptr, add isolated vertices + // 4. if vertices != nullptr, add isolated vertices rmm::device_uvector isolated_vertices(0, handle.get_stream()); if (vertices != nullptr) { @@ -232,10 +283,9 @@ void expensive_check_edgelist( raft::handle_t const& handle, vertex_t const* local_vertices, vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */, - vertex_t const* edgelist_major_vertices, - vertex_t const* edgelist_minor_vertices, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned /* relevant only if multi_gpu == true */) + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { rmm::device_uvector sorted_local_vertices( local_vertices != nullptr ? 
num_local_vertices : vertex_t{0}, handle.get_stream()); @@ -246,6 +296,12 @@ void expensive_check_edgelist( thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), sorted_local_vertices.begin(), sorted_local_vertices.end()); + CUGRAPH_EXPECTS(static_cast(thrust::distance( + sorted_local_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_local_vertices.begin(), + sorted_local_vertices.end()))) == sorted_local_vertices.size(), + "Invalid input argument: local_vertices should not have duplicates."); if (multi_gpu) { auto& comm = handle.get_comms(); @@ -253,8 +309,15 @@ void expensive_check_edgelist( auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + CUGRAPH_EXPECTS((edgelist_major_vertices.size() == edgelist_minor_vertices.size()) && + (edgelist_major_vertices.size() == static_cast(col_comm_size)), + "Invalid input argument: both edgelist_major_vertices.size() & " + "edgelist_minor_vertices.size() should coincide with col_comm_size."); CUGRAPH_EXPECTS( thrust::count_if( @@ -268,95 +331,127 @@ void expensive_check_edgelist( }) == 0, "Invalid input argument: local_vertices should be pre-shuffled."); - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices)); - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + num_edgelist_edges, - [comm_rank, - key_func = - detail::compute_gpu_id_from_edge_t{is_hypergraph_partitioned, - comm_size, - row_comm_size, - col_comm_size}] __device__(auto edge) { - return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank; - }) == 0, - "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " - "pre-shuffled."); - - if (local_vertices != nullptr) { - rmm::device_uvector unique_edge_vertices(num_edgelist_edges * 2, - handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - unique_edge_vertices.begin()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - unique_edge_vertices.begin() + num_edgelist_edges); - thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end()); - unique_edge_vertices.resize( - thrust::distance( - unique_edge_vertices.begin(), - thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end())), - handle.get_stream()); - - rmm::device_uvector rx_unique_edge_vertices(0, handle.get_stream()); - std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values( - handle.get_comms(), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}] __device__( - auto val) { return key_func(val); }, - handle.get_stream()); - - unique_edge_vertices = std::move(rx_unique_edge_vertices); - + for 
(size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); CUGRAPH_EXPECTS( thrust::count_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - unique_edge_vertices.begin(), - unique_edge_vertices.end(), - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); + edge_first, + edge_first + edgelist_edge_counts[i], + [comm_size, + comm_rank, + row_comm_rank, + col_comm_size, + col_comm_rank, + i, + gpu_id_key_func = + detail::compute_gpu_id_from_edge_t{comm_size, row_comm_size, col_comm_size}, + partition_id_key_func = + detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto edge) { + return (gpu_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank) || + (partition_id_key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != + row_comm_rank * col_comm_size + col_comm_rank + i * comm_size); }) == 0, - "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " - "invalid vertex ID(s)."); + "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be " + "pre-shuffled."); + + auto aggregate_vertexlist_size = host_scalar_allreduce( + comm, + local_vertices != nullptr ? num_local_vertices : vertex_t{0}, + handle.get_stream()); // local_vertices != nullptr is insufficient in multi-GPU as only a + // subset of GPUs may have a non-zero vertices + if (aggregate_vertexlist_size > 0) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + + rmm::device_uvector sorted_major_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(col_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_major_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(col_comm, + sorted_local_vertices.data(), + sorted_major_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_major_vertices.begin(), + sorted_major_vertices.end()); + } + + rmm::device_uvector sorted_minor_vertices(0, handle.get_stream()); + { + auto recvcounts = + host_scalar_allgather(row_comm, sorted_local_vertices.size(), handle.get_stream()); + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + sorted_minor_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + device_allgatherv(row_comm, + sorted_local_vertices.data(), + sorted_minor_vertices.data(), + recvcounts, + displacements, + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_minor_vertices.begin(), + sorted_minor_vertices.end()); + } + + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[i], edgelist_minor_vertices[i])); + CUGRAPH_EXPECTS( + thrust::count_if( + 
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[i], + [num_major_vertices = static_cast(sorted_major_vertices.size()), + sorted_major_vertices = sorted_major_vertices.data(), + num_minor_vertices = static_cast(sorted_minor_vertices.size()), + sorted_minor_vertices = sorted_minor_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_major_vertices, + sorted_major_vertices + num_major_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_minor_vertices, + sorted_minor_vertices + num_minor_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_mior_vertices have " + "invalid vertex ID(s)."); + } } } else { - if (local_vertices != nullptr) { - CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + assert(edgelist_major_vertices.size() == 1); + assert(edgelist_minor_vertices.size() == 1); + if (local_vertices != nullptr) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices[0], edgelist_minor_vertices[0])); CUGRAPH_EXPECTS( - thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - [num_local_vertices, - sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) { - return !thrust::binary_search( - thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v); - }) == 0, - "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s)."); + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + edgelist_edge_counts[0], + [num_local_vertices, + sorted_local_vertices = sorted_local_vertices.data()] __device__(auto e) { + return !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<0>(e)) || + !thrust::binary_search(thrust::seq, + sorted_local_vertices, + sorted_local_vertices + num_local_vertices, + thrust::get<1>(e)); + }) == 0, + "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have " + "invalid vertex ID(s)."); } } } @@ -368,15 +463,15 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of enumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED auto& comm = handle.get_comms(); @@ -389,14 +484,20 @@ renumber_edgelist(raft::handle_t const& handle, auto const col_comm_size = col_comm.get_size(); auto const col_comm_rank = col_comm.get_rank(); + std::vector 
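// A minimal host-side sketch of the expensive endpoint check above: with the
// vertex list sorted, an edge passes only if both of its endpoints
// binary-search successfully. Hypothetical values.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<int> sorted_vertices{1, 3, 5, 8};
  auto valid = [&](int major, int minor) {
    return std::binary_search(sorted_vertices.begin(), sorted_vertices.end(), major) &&
           std::binary_search(sorted_vertices.begin(), sorted_vertices.end(), minor);
  };
  assert(valid(3, 8));
  assert(!valid(3, 4));  // 4 never appears as a vertex -> invalid input flagged
  return 0;
}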
edgelist_const_major_vertices(edgelist_major_vertices.size()); + std::vector edgelist_const_minor_vertices(edgelist_const_major_vertices.size()); + for (size_t i = 0; i < edgelist_const_major_vertices.size(); ++i) { + edgelist_const_major_vertices[i] = edgelist_major_vertices[i]; + edgelist_const_minor_vertices[i] = edgelist_minor_vertices[i]; + } + if (do_expensive_check) { expensive_check_edgelist(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); } // 1. compute renumber map @@ -405,142 +506,129 @@ renumber_edgelist(raft::handle_t const& handle, detail::compute_renumber_map(handle, local_vertices, num_local_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + edgelist_const_major_vertices, + edgelist_const_minor_vertices, + edgelist_edge_counts); // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened // graph - auto vertex_partition_counts = host_scalar_allgather( + auto vertex_counts = host_scalar_allgather( comm, static_cast(renumber_map_labels.size()), handle.get_stream()); std::vector vertex_partition_offsets(comm_size + 1, 0); - std::partial_sum(vertex_partition_counts.begin(), - vertex_partition_counts.end(), - vertex_partition_offsets.begin() + 1); + std::partial_sum( + vertex_counts.begin(), vertex_counts.end(), vertex_partition_offsets.begin() + 1); - partition_t partition(vertex_partition_offsets, - is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); + partition_t partition( + vertex_partition_offsets, row_comm_size, col_comm_size, row_comm_rank, col_comm_rank); auto number_of_vertices = vertex_partition_offsets.back(); - auto number_of_edges = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream()); + auto number_of_edges = host_scalar_allreduce( + comm, + std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + handle.get_stream()); // 3. 
renumber edges - if (is_hypergraph_partitioned) { - CUGRAPH_FAIL("unimplemented."); - } else { - double constexpr load_factor = 0.7; + double constexpr load_factor = 0.7; - // FIXME: compare this hash based approach with a binary search based approach in both memory - // footprint and execution time + // FIXME: compare this hash based approach with a binary search based approach in both memory + // footprint and execution time - { - vertex_t major_first{}; - vertex_t major_last{}; - std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(0); - rmm::device_uvector renumber_map_major_labels(major_last - major_first, - handle.get_stream()); - std::vector recvcounts(row_comm_size); - for (int i = 0; i < row_comm_size; ++i) { - recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - } - std::vector displacements(row_comm_size, 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); - device_allgatherv(row_comm, - renumber_map_labels.begin(), - renumber_map_major_labels.begin(), - recvcounts, - displacements, - handle.get_stream()); - - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream - - cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_major_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_major_labels.begin(), - thrust::make_counting_iterator(major_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); - renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size()); - renumber_map.find(edgelist_major_vertices, - edgelist_major_vertices + num_edgelist_edges, - edgelist_major_vertices); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + rmm::device_uvector renumber_map_major_labels( + col_comm_rank == static_cast(i) ? vertex_t{0} + : partition.get_matrix_partition_major_size(i), + handle.get_stream()); + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); + + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream + + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + col_comm_rank == static_cast(i) ? renumber_map_labels.begin() + : renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. 
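// A minimal host-side sketch of the renumbering step above: pair each old
// label with a new id counted from the partition's first new vertex id, then
// rewrite edge endpoints through the map (the guarded insert + find calls
// below). std::unordered_map stands in for cuco::static_map; values are
// hypothetical.
#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

int main()
{
  std::vector<int> renumber_map_labels{42, 7, 13};  // old labels
  int partition_first = 100;                        // first new id of this partition

  std::unordered_map<int, int> renumber_map;
  for (std::size_t i = 0; i < renumber_map_labels.size(); ++i)
    renumber_map[renumber_map_labels[i]] = partition_first + static_cast<int>(i);

  std::vector<int> edge_majors{7, 42, 13};
  for (auto& v : edge_majors) v = renumber_map.at(v);
  assert((edge_majors == std::vector<int>{101, 100, 102}));
  return 0;
}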
+ if (partition.get_matrix_partition_major_size(i) > 0) { + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } + } - { - vertex_t minor_first{}; - vertex_t minor_last{}; - std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range(); - rmm::device_uvector renumber_map_minor_labels(minor_last - minor_first, - handle.get_stream()); - - // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph - // partitioning - auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; - auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; - // FIXME: this branch may be no longer necessary with NCCL backend - if (comm_src_rank == comm_rank) { - assert(comm_dst_rank == comm_rank); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - renumber_map_labels.begin(), - renumber_map_labels.end(), - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size))); - } else { - device_sendrecv(comm, - renumber_map_labels.begin(), - renumber_map_labels.size(), - comm_dst_rank, - renumber_map_minor_labels.begin() + - (partition.get_vertex_partition_first(comm_src_rank) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size)), - static_cast(partition.get_vertex_partition_size(comm_src_rank)), - comm_src_rank, - handle.get_stream()); - } - - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - for (int i = 0; i < col_comm_size; ++i) { - auto offset = partition.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - - partition.get_vertex_partition_first(row_comm_rank * col_comm_size); - auto count = partition.get_vertex_partition_size(row_comm_rank * col_comm_size + i); - device_bcast(col_comm, - renumber_map_minor_labels.begin() + offset, - renumber_map_minor_labels.begin() + offset, - count, - i, - handle.get_stream()); - } + { + rmm::device_uvector renumber_map_minor_labels( + partition.get_matrix_partition_minor_size(), handle.get_stream()); + std::vector recvcounts(row_comm_size); + for (int i = 0; i < row_comm_size; ++i) { + recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + } + std::vector displacements(recvcounts.size(), 0); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + device_allgatherv(row_comm, + renumber_map_labels.begin(), + renumber_map_minor_labels.begin(), + recvcounts, + displacements, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream - cuco::static_map renumber_map{ + cuco::static_map renumber_map{ + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( static_cast(static_cast(renumber_map_minor_labels.size()) / load_factor), - invalid_vertex_id::value, - invalid_vertex_id::value}; - auto pair_first = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(renumber_map_minor_labels.begin(), - thrust::make_counting_iterator(minor_first))), - [] __device__(auto val) { - return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); - }); + renumber_map_minor_labels.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value}; + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_minor_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_minor_first()))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (renumber_map_minor_labels.size()) { renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size()); - renumber_map.find(edgelist_minor_vertices, - edgelist_minor_vertices + num_edgelist_edges, - edgelist_minor_vertices); + } + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the + // grid size is 0; this leads to cudaErrorInvaildConfiguration. + if (edgelist_edge_counts[i]) { + renumber_map.find(edgelist_minor_vertices[i], + edgelist_minor_vertices[i] + edgelist_edge_counts[i], + edgelist_minor_vertices[i]); + } } } @@ -565,27 +653,28 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); #ifdef CUCO_STATIC_MAP_DEFINED if (do_expensive_check) { - expensive_check_edgelist(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges, - false); + expensive_check_edgelist( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); } - auto renumber_map_labels = - detail::compute_renumber_map(handle, - vertices, - num_vertices, - edgelist_major_vertices, - edgelist_minor_vertices, - num_edgelist_edges); + auto renumber_map_labels = detail::compute_renumber_map( + handle, + vertices, + num_vertices, + std::vector{edgelist_major_vertices}, + std::vector{edgelist_minor_vertices}, + std::vector{num_edgelist_edges}); double constexpr load_factor = 0.7; @@ -593,7 +682,11 @@ std::enable_if_t> renumber_edgelist( // footprint and execution time cuco::static_map renumber_map{ - static_cast(static_cast(renumber_map_labels.size()) / load_factor), + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast(static_cast(renumber_map_labels.size()) / load_factor), + renumber_map_labels.size() + 1), invalid_vertex_id::value, invalid_vertex_id::value}; auto pair_first = thrust::make_transform_iterator( @@ -602,11 +695,21 @@ std::enable_if_t> renumber_edgelist( [] __device__(auto val) { return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); }); - renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); - renumber_map.find( - edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices); - renumber_map.find( - edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (renumber_map_labels.size()) { + renumber_map.insert(pair_first, pair_first + renumber_map_labels.size()); + } + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_edgelist_edges > 0) { + renumber_map.find(edgelist_major_vertices, + edgelist_major_vertices + num_edgelist_edges, + edgelist_major_vertices); + renumber_map.find(edgelist_minor_vertices, + edgelist_minor_vertices + num_edgelist_edges, + edgelist_minor_vertices); + } return renumber_map_labels; #else @@ -620,22 +723,21 @@ template std::enable_if_t, partition_t, vertex_t, edge_t>> renumber_edgelist(raft::handle_t const& handle, - vertex_t* edgelist_major_vertices /* [INOUT] */, - vertex_t* edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0}, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -648,8 +750,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, static_cast(nullptr), vertex_t{0} /* dummy */, @@ -665,22 +768,21 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + 
handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, local_vertices, num_local_vertices, edgelist_major_vertices, edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, + edgelist_edge_counts, do_expensive_check); } @@ -695,8 +797,9 @@ std::enable_if_t> renumber_edgelist( bool do_expensive_check) { // FIXME: remove this check once we drop Pascal support - CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, - "Relabel not supported on Pascal and older architectures."); + CUGRAPH_EXPECTS( + handle.get_device_properties().major >= 7, + "This version of renumber_edgelist not supported on Pascal and older architectures."); return detail::renumber_edgelist(handle, vertices, num_vertices, @@ -711,12 +814,12 @@ std::enable_if_t> renumber_edgelist( // instantiations for // template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -726,14 +829,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int32_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -747,12 +850,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -762,14 +865,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int32_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int32_t const* local_vertices, - int32_t num_local_vertices, - int32_t* edgelist_major_vertices /* [INOUT] */, - int32_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + 
raft::handle_t const& handle, + int32_t const* local_vertices, + int32_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -783,12 +886,12 @@ template rmm::device_uvector renumber_edgelist // instantiations for // template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, @@ -798,14 +901,14 @@ template rmm::device_uvector renumber_edgelist bool do_expensive_check); template std::tuple, partition_t, int64_t, int64_t> -renumber_edgelist(raft::handle_t const& handle, - int64_t const* local_vertices, - int64_t num_local_vertices, - int64_t* edgelist_major_vertices /* [INOUT] */, - int64_t* edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, - bool do_expensive_check); +renumber_edgelist( + raft::handle_t const& handle, + int64_t const* local_vertices, + int64_t num_local_vertices, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + bool do_expensive_check); template rmm::device_uvector renumber_edgelist( raft::handle_t const& handle, diff --git a/cpp/src/experimental/renumber_utils.cu b/cpp/src/experimental/renumber_utils.cu new file mode 100644 index 00000000000..8f59683d9d6 --- /dev/null +++ b/cpp/src/experimental/renumber_utils.cu @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +template +void renumber_ext_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "renumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + rmm::device_uvector labels(local_int_vertex_last - local_int_vertex_first, + handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + renumber_map_labels, + renumber_map_labels + labels.size(), + labels.begin()); + thrust::sort( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), labels.begin(), labels.end()); + CUGRAPH_EXPECTS(thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels.begin(), + labels.end()) == labels.end(), + "Invalid input arguments: renumber_map_labels have duplicate elements."); + } + + auto renumber_map_ptr = std::make_unique>( + size_t{0}, invalid_vertex_id::value, invalid_vertex_id::value); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_ext_vertices(num_vertices, handle.get_stream()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_ext_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end()); + sorted_unique_ext_vertices.resize( + thrust::distance( + sorted_unique_ext_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end())), + handle.get_stream()); + + auto int_vertices_for_sorted_unique_ext_vertices = collect_values_for_unique_keys( + comm, + renumber_map_labels, + renumber_map_labels + (local_int_vertex_last - local_int_vertex_first), + thrust::make_counting_iterator(local_int_vertex_first), + sorted_unique_ext_vertices.begin(), + sorted_unique_ext_vertices.end(), + detail::compute_gpu_id_from_vertex_t{comm_size}, + handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) 
as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_ext_vertices.size()) / load_factor), + sorted_unique_ext_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto kv_pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple( + sorted_unique_ext_vertices.begin(), int_vertices_for_sorted_unique_ext_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (sorted_unique_ext_vertices.size()) { + renumber_map_ptr->insert(kv_pair_first, kv_pair_first + sorted_unique_ext_vertices.size()); + } + } else { + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + renumber_map_ptr.reset(); + + renumber_map_ptr = std::make_unique>( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max(static_cast( + static_cast(local_int_vertex_last - local_int_vertex_first) / load_factor), + static_cast(local_int_vertex_last - local_int_vertex_first) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(renumber_map_labels, thrust::make_counting_iterator(vertex_t{0}))), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if ((local_int_vertex_last - local_int_vertex_first) > 0) { + renumber_map_ptr->insert(pair_first, + pair_first + (local_int_vertex_last - local_int_vertex_first)); + } + } + + if (do_expensive_check) { + rmm::device_uvector contains(num_vertices, handle.get_stream()); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_vertices > 0) { + renumber_map_ptr->contains(vertices, vertices + num_vertices, contains.begin()); + } + auto vc_pair_first = thrust::make_zip_iterator(thrust::make_tuple(vertices, contains.begin())); + CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vc_pair_first, + vc_pair_first + num_vertices, + [] __device__(auto pair) { + auto v = thrust::get<0>(pair); + auto c = thrust::get<1>(pair); + return v == invalid_vertex_id::value + ? (c == true) + : (c == false); + }) == 0, + "Invalid input arguments: vertices have elements that are missing in " + "(aggregate) renumber_map_labels."); + } + + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = renumber_map_ptr->get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_vertices > 0) { renumber_map_ptr->find(vertices, vertices + num_vertices, vertices); } +#endif +#endif +} + +template +void unrenumber_local_int_vertices( + raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + bool do_expensive_check) +{ + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_local_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [local_int_vertex_first, local_int_vertex_last] __device__(auto v) { + return v != invalid_vertex_id::value && + (v < local_int_vertex_first || v >= local_int_vertex_last); + }) == 0, + "Invalid input arguments: there are non-local vertices in [vertices, vertices " + "+ num_vertices)."); + } + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return v == invalid_vertex_id::value + ? v + : renumber_map_labels[v - local_int_vertex_first]; + }); +#endif +} + +template +void unrenumber_int_vertices(raft::handle_t const& handle, + vertex_t* vertices /* [INOUT] */, + size_t num_vertices, + vertex_t const* renumber_map_labels, + vertex_t local_int_vertex_first, + vertex_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check) +{ + double constexpr load_factor = 0.7; + + // FIXME: remove this check once we drop Pascal support + CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7, + "unrenumber_vertices() not supported on Pascal and older architectures."); + +#ifdef CUCO_STATIC_MAP_DEFINED + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + [int_vertex_last = vertex_partition_lasts.back()] __device__(auto v) { + return v != invalid_vertex_id::value && + !is_valid_vertex(int_vertex_last, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in [vertices, vertices " + "+ num_vertices)."); + } + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + sorted_unique_int_vertices.begin(), + [] __device__(auto v) { return v != invalid_vertex_id::value; })), + handle.get_stream()); + thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end()); + sorted_unique_int_vertices.resize( + thrust::distance( + sorted_unique_int_vertices.begin(), + thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + 
sorted_unique_int_vertices.end())), + handle.get_stream()); + + rmm::device_uvector d_vertex_partition_lasts(vertex_partition_lasts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_lasts.data(), + vertex_partition_lasts.data(), + vertex_partition_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_tx_int_vertex_offsets(d_vertex_partition_lasts.size(), + handle.get_stream()); + thrust::lower_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + d_vertex_partition_lasts.begin(), + d_vertex_partition_lasts.end(), + d_tx_int_vertex_offsets.begin()); + std::vector h_tx_int_vertex_counts(d_tx_int_vertex_offsets.size()); + raft::update_host(h_tx_int_vertex_counts.data(), + d_tx_int_vertex_offsets.data(), + d_tx_int_vertex_offsets.size(), + handle.get_stream()); + handle.get_stream_view().synchronize(); + std::adjacent_difference( + h_tx_int_vertex_counts.begin(), h_tx_int_vertex_counts.end(), h_tx_int_vertex_counts.begin()); + + rmm::device_uvector rx_int_vertices(0, handle.get_stream()); + std::vector rx_int_vertex_counts{}; + std::tie(rx_int_vertices, rx_int_vertex_counts) = shuffle_values( + comm, sorted_unique_int_vertices.begin(), h_tx_int_vertex_counts, handle.get_stream()); + + auto tx_ext_vertices = std::move(rx_int_vertices); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + tx_ext_vertices.begin(), + tx_ext_vertices.end(), + tx_ext_vertices.begin(), + [renumber_map_labels, local_int_vertex_first] __device__(auto v) { + return renumber_map_labels[v - local_int_vertex_first]; + }); + + rmm::device_uvector rx_ext_vertices_for_sorted_unique_int_vertices( + 0, handle.get_stream()); + std::tie(rx_ext_vertices_for_sorted_unique_int_vertices, std::ignore) = + shuffle_values(comm, tx_ext_vertices.begin(), rx_int_vertex_counts, handle.get_stream()); + + handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream + + cuco::static_map unrenumber_map( + // FIXME: std::max(..., ...) as a temporary workaround for + // https://github.com/NVIDIA/cuCollections/issues/72 and + // https://github.com/NVIDIA/cuCollections/issues/73 + std::max( + static_cast(static_cast(sorted_unique_int_vertices.size()) / load_factor), + sorted_unique_int_vertices.size() + 1), + invalid_vertex_id::value, + invalid_vertex_id::value); + + auto pair_first = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(sorted_unique_int_vertices.begin(), + rx_ext_vertices_for_sorted_unique_int_vertices.begin())), + [] __device__(auto val) { + return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val)); + }); + // FIXME: a temporary workaround. cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (sorted_unique_int_vertices.size()) { + unrenumber_map.insert(pair_first, pair_first + sorted_unique_int_vertices.size()); + } + // FIXME: a temporary workaround for https://github.com/NVIDIA/cuCollections/issues/74 +#if 1 + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertices, + vertices + num_vertices, + vertices, + [view = unrenumber_map.get_device_view()] __device__(auto v) { + return v != invalid_vertex_id::value + ? view.find(v)->second.load(cuda::std::memory_order_relaxed) + : invalid_vertex_id::value; + }); +#else + // FIXME: a temporary workaround. 
cuco::static_map currently launches a kernel even if the grid + // size is 0; this leads to cudaErrorInvaildConfiguration. + if (num_vertices > 0) { unrenumber_map.find(vertices, vertices + num_vertices, vertices); } +#endif + } else { + unrenumber_local_int_vertices(handle, + vertices, + num_vertices, + renumber_map_labels, + local_int_vertex_first, + local_int_vertex_last, + do_expensive_check); + } +#endif +} + +// explicit instantiation + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void renumber_ext_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_local_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int32_t* vertices, + size_t num_vertices, + int32_t const* renumber_map_labels, + int32_t local_int_vertex_first, + int32_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +template void unrenumber_int_vertices(raft::handle_t const& handle, + int64_t* vertices, + size_t num_vertices, + int64_t const* renumber_map_labels, + int64_t local_int_vertex_first, + int64_t local_int_vertex_last, + std::vector& vertex_partition_lasts, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/shuffle.cuh b/cpp/src/experimental/shuffle.cuh deleted file mode 100644 index 40f3b510b10..00000000000 --- a/cpp/src/experimental/shuffle.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// -// FIXME: This implementation of variable_shuffle stages the data for transfer -// in host memory. It would be more efficient, I believe, to stage the -// data in device memory, but it would require actually instantiating -// the data in device memory which is already precious in the Louvain -// implementation. We should explore if it's actually more efficient -// through device memory and whether the improvement is worth the extra -// memory required. -// -template -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - // - // We need to compute the size of data movement - // - raft::comms::comms_t const &comms = handle.get_comms(); - - cudaStream_t stream = handle.get_stream(); - int num_gpus = comms.get_size(); - int my_gpu = comms.get_rank(); - - rmm::device_vector local_sizes_v(num_gpus, size_t{0}); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - partition_iter, - partition_iter + n_elements, - [num_gpus, d_local_sizes = local_sizes_v.data().get()] __device__(auto p) { - atomicAdd(d_local_sizes + p, size_t{1}); - }); - - std::vector h_local_sizes_v(num_gpus); - std::vector h_global_sizes_v(num_gpus); - std::vector h_input_v(n_elements); - std::vector h_partitions_v(n_elements); - - thrust::copy(local_sizes_v.begin(), local_sizes_v.end(), h_local_sizes_v.begin()); - thrust::copy(partition_iter, partition_iter + n_elements, h_partitions_v.begin()); - - std::vector requests(2 * num_gpus); - - int request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - if (gpu != my_gpu) { - comms.irecv(&h_global_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - comms.isend(&h_local_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - } else { - h_global_sizes_v[gpu] = h_local_sizes_v[gpu]; - } - } - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - // - // Now global_sizes contains all of the counts, we need to - // allocate an array of the appropriate size - // - int64_t receive_size = - thrust::reduce(thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end()); - - std::vector temp_data; - - if (receive_size > 0) temp_data.resize(receive_size); - - rmm::device_vector input_v(n_elements); - - auto input_start = input_v.begin(); - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - input_start = thrust::copy_if(rmm::exec_policy(stream)->on(stream), - data_iter, - data_iter + n_elements, - partition_iter, - input_start, - [gpu] __device__(int32_t p) { return p == gpu; }); - } - - thrust::copy(input_v.begin(), input_v.end(), h_input_v.begin()); - - std::vector temp_v(num_gpus + 1); - - thrust::exclusive_scan( - thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_global_sizes_v[num_gpus - 1]; - h_global_sizes_v = temp_v; - - thrust::exclusive_scan( - thrust::host, h_local_sizes_v.begin(), 
h_local_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_local_sizes_v[num_gpus - 1]; - h_local_sizes_v = temp_v; - - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - comms.barrier(); - - request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - size_t to_receive = h_global_sizes_v[gpu + 1] - h_global_sizes_v[gpu]; - size_t to_send = h_local_sizes_v[gpu + 1] - h_local_sizes_v[gpu]; - - if (gpu != my_gpu) { - if (to_receive > 0) { - comms.irecv( - temp_data.data() + h_global_sizes_v[gpu], to_receive, gpu, 0, &requests[request_pos]); - ++request_pos; - } - - if (to_send > 0) { - comms.isend( - h_input_v.data() + h_local_sizes_v[gpu], to_send, gpu, 0, &requests[request_pos]); - ++request_pos; - } - } else if (to_receive > 0) { - std::copy(h_input_v.begin() + h_local_sizes_v[gpu], - h_input_v.begin() + h_local_sizes_v[gpu + 1], - temp_data.begin() + h_global_sizes_v[gpu]); - } - } - - comms.barrier(); - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - return rmm::device_vector(temp_data); -} - -} // namespace detail - -/** - * @brief shuffle data to the desired partition - * - * MNMG algorithms require shuffling data between partitions - * to get the data to the right location for computation. - * - * This function operates dynamically, there is no - * a priori knowledge about where the data will need - * to be transferred. - * - * This function will be executed on each GPU. Each gpu - * has a portion of the data (specified by begin_data and - * end_data iterators) and an iterator that identifies - * (for each corresponding element) which GPU the data - * should be shuffled to. - * - * The return value will be a device vector containing - * the data received by this GPU. - * - * Note that this function accepts iterators as input. - * `partition_iterator` will be traversed multiple times. - * - * @tparam is_multi_gpu If true, multi-gpu - shuffle will occur - * If false, single GPU - simple copy will occur - * @tparam data_t Type of the data being shuffled - * @tparam iterator_t Iterator referencing data to be shuffled - * @tparam partition_iter_t Iterator identifying the destination partition - * - * @param handle Library handle (RAFT) - * @param n_elements Number of elements to transfer - * @param data_iter Iterator that returns the elements to be transfered - * @param partition_iter Iterator that returns the partition where elements - * should be transfered. 
- */ -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return detail::variable_shuffle(handle, n_elements, data_iter, partition_iter); -} - -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return rmm::device_vector(data_iter, data_iter + n_elements); -} - -} // namespace experimental -} // namespace cugraph diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu index 4996b3734cb..fc488794795 100644 --- a/cpp/src/experimental/sssp.cu +++ b/cpp/src/experimental/sssp.cu @@ -70,6 +70,9 @@ void sssp(raft::handle_t const &handle, CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), "Invalid input argument: source vertex out-of-range."); + CUGRAPH_EXPECTS(push_graph_view.is_weighted(), + "Invalid input argument: an unweighted graph is passed to SSSP, BFS is more " + "efficient for unweighted graphs."); if (do_expensive_check) { auto num_negative_edge_weights = @@ -122,15 +125,9 @@ void sssp(raft::handle_t const &handle, // 4. initialize SSSP frontier - enum class Bucket { cur_near, new_near, far, num_buckets }; - // FIXME: need to double check the bucket sizes are sufficient - std::vector bucket_sizes(static_cast(Bucket::num_buckets), - push_graph_view.get_number_of_local_vertices()); - VertexFrontier, - vertex_t, - GraphViewType::is_multi_gpu, - static_cast(Bucket::num_buckets)> - vertex_frontier(handle, bucket_sizes); + enum class Bucket { cur_near, next_near, far, num_buckets }; + VertexFrontier(Bucket::num_buckets)> + vertex_frontier(handle); // 5. SSSP iteration @@ -172,8 +169,9 @@ void sssp(raft::handle_t const &handle, update_frontier_v_push_if_out_nbr( handle, push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + vertex_frontier, + static_cast(Bucket::cur_near), + std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, row_distances, thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances, cutoff] __device__( @@ -188,58 +186,58 @@ void sssp(raft::handle_t const &handle, threshold = old_distance < threshold ? old_distance : threshold; } if (new_distance >= threshold) { push = false; } - return thrust::make_tuple(push, new_distance, src); + return thrust::make_tuple(push, thrust::make_tuple(new_distance, src)); }, reduce_op::min>(), distances, thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), - vertex_frontier, [near_far_threshold] __device__(auto v_val, auto pushed_val) { auto new_dist = thrust::get<0>(pushed_val); auto idx = new_dist < v_val - ? (new_dist < near_far_threshold ? static_cast(Bucket::new_near) + ? (new_dist < near_far_threshold ? 
static_cast(Bucket::next_near) : static_cast(Bucket::far)) - : VertexFrontier, vertex_t>::kInvalidBucketIdx; - return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + : VertexFrontier::kInvalidBucketIdx; + return thrust::make_tuple(idx, pushed_val); }); vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).clear(); - if (vertex_frontier.get_bucket(static_cast(Bucket::new_near)).aggregate_size() > 0) { + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).shrink_to_fit(); + if (vertex_frontier.get_bucket(static_cast(Bucket::next_near)).aggregate_size() > 0) { vertex_frontier.swap_buckets(static_cast(Bucket::cur_near), - static_cast(Bucket::new_near)); + static_cast(Bucket::next_near)); } else if (vertex_frontier.get_bucket(static_cast(Bucket::far)).aggregate_size() > 0) { // near queue is empty, split the far queue auto old_near_far_threshold = near_far_threshold; near_far_threshold += delta; - size_t new_near_size{0}; - size_t new_far_size{0}; + size_t near_size{0}; + size_t far_size{0}; while (true) { vertex_frontier.split_bucket( static_cast(Bucket::far), + std::vector{static_cast(Bucket::cur_near)}, [vertex_partition, distances, old_near_far_threshold, near_far_threshold] __device__( auto v) { auto dist = *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)); if (dist < old_near_far_threshold) { - return VertexFrontier, vertex_t>::kInvalidBucketIdx; + return VertexFrontier::kInvalidBucketIdx; } else if (dist < near_far_threshold) { return static_cast(Bucket::cur_near); } else { return static_cast(Bucket::far); } }); - new_near_size = + near_size = vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).aggregate_size(); - new_far_size = - vertex_frontier.get_bucket(static_cast(Bucket::far)).aggregate_size(); - if ((new_near_size > 0) || (new_far_size == 0)) { + far_size = vertex_frontier.get_bucket(static_cast(Bucket::far)).aggregate_size(); + if ((near_size > 0) || (far_size == 0)) { break; } else { near_far_threshold += delta; } } - if ((new_near_size == 0) && (new_far_size == 0)) { break; } + if ((near_size == 0) && (far_size == 0)) { break; } } else { break; } diff --git a/cpp/src/sampling/random_walks.cu b/cpp/src/sampling/random_walks.cu new file mode 100644 index 00000000000..88d5d9ed5c8 --- /dev/null +++ b/cpp/src/sampling/random_walks.cu @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+#include <algorithms.hpp>
+#include <sampling/random_walks.cuh>
+
+namespace cugraph {
+namespace experimental {
+// template explicit instantiation directives (EIDir's):
+//
+// SG FP32{
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>, rmm::device_uvector<int32_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int32_t, float, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int32_t num_paths,
+               int32_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int64_t, float, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int64_t, int64_t, float, false, false> const& gview,
+               int64_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+//}
+//
+// SG FP64{
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>, rmm::device_uvector<int32_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int32_t, double, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int32_t num_paths,
+               int32_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int64_t, double, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int64_t, int64_t, double, false, false> const& gview,
+               int64_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+//}
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/traversal/README.md b/cpp/src/traversal/README.md
new file mode 100644
index 00000000000..7f436926de8
--- /dev/null
+++ b/cpp/src/traversal/README.md
@@ -0,0 +1,56 @@
+# Traversal
+cuGraph traversal algorithms are contained in this directory.
+
+## SSSP
+
+The unit test code is the best place to look for examples of calling SSSP.
+
+ * [SG Implementation](../../tests/experimental/sssp_test.cpp)
+ * MG Implementation - TBD
+
+## Simple SSSP
+
+The example assumes that you create an SG or MG graph somehow. The caller must create the distances and predecessors vectors in device memory and pass the raw pointers to those vectors into the SSSP function.
+
+```cpp
+#include <algorithms.hpp>
+...
+using vertex_t = int32_t;        // or int64_t, whichever is appropriate
+using weight_t = float;          // or double, whichever is appropriate
+using result_t = weight_t;       // could specify float or double also
+raft::handle_t handle;           // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+vertex_t source;                 // Initialized by user
+
+rmm::device_uvector<result_t> distances_v(graph_view.get_number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.get_number_of_vertices(), handle.get_stream());
+
+cugraph::experimental::sssp(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, std::numeric_limits<weight_t>::max(), false);
+```
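To sanity-check the results, copy them back to the host; a minimal sketch, reusing the variables from the example above (`raft::update_host` is RAFT's device-to-host copy helper, also used throughout the C++ sources in this patch):

```cpp
std::vector<result_t> h_distances(graph_view.get_number_of_vertices());
std::vector<vertex_t> h_predecessors(graph_view.get_number_of_vertices());

raft::update_host(h_distances.data(), distances_v.data(), distances_v.size(), handle.get_stream());
raft::update_host(
  h_predecessors.data(), predecessors_v.data(), predecessors_v.size(), handle.get_stream());
handle.get_stream_view().synchronize();  // make sure the copies have completed

// Vertices the source cannot reach keep a distance of std::numeric_limits<weight_t>::max().
```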
+
+## BFS
+
+The unit test code is the best place to look for examples of calling BFS.
+
+ * [SG Implementation](../../tests/experimental/bfs_test.cpp)
+ * MG Implementation - TBD
+
+## Simple BFS
+
+The example assumes that you create an SG or MG graph somehow. The caller must create the distances and predecessors vectors in device memory and pass the raw pointers to those vectors into the BFS function. BFS distances are hop counts, so both output vectors use `vertex_t`.
+
+```cpp
+#include <algorithms.hpp>
+...
+using vertex_t = int32_t;        // or int64_t, whichever is appropriate
+raft::handle_t handle;           // Must be configured if MG
+auto graph_view = graph.view();  // assumes you have created a graph somehow
+vertex_t source;                 // Initialized by user
+
+rmm::device_uvector<vertex_t> distances_v(graph_view.get_number_of_vertices(), handle.get_stream());
+rmm::device_uvector<vertex_t> predecessors_v(graph_view.get_number_of_vertices(), handle.get_stream());
+
+cugraph::experimental::bfs(handle, graph_view, distances_v.begin(), predecessors_v.begin(), source, false, std::numeric_limits<vertex_t>::max(), false);
+```
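Both examples note that the handle "Must be configured if MG" without spelling that out. Below is a minimal sketch of one way to do it, assuming an MPI launch with one GPU per rank; `initialize_mpi_comms` is RAFT's MPI-backed comms initializer. cuGraph's MG tests additionally set up 2D sub-communicators for graph partitioning, which is omitted here:

```cpp
#include <raft/comms/mpi_comms.hpp>
#include <raft/handle.hpp>

#include <cuda_runtime.h>
#include <mpi.h>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);

  int rank{};
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int num_gpus{};
  cudaGetDeviceCount(&num_gpus);
  cudaSetDevice(rank % num_gpus);  // one GPU per MPI rank

  raft::handle_t handle;
  raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);

  // ... build the MG graph and call bfs()/sssp() as in the examples above ...

  MPI_Finalize();
  return 0;
}
```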
diff --git a/cpp/src/traversal/tsp.cu b/cpp/src/traversal/tsp.cu
index c669246bc49..a28ddbbaa3f 100644
--- a/cpp/src/traversal/tsp.cu
+++ b/cpp/src/traversal/tsp.cu
@@ -17,13 +17,15 @@
 #include
 #include
+#include <utilities/high_res_timer.hpp>
+
 #include "tsp.hpp"
 #include "tsp_solver.hpp"

 namespace cugraph {
 namespace detail {

-TSP::TSP(raft::handle_t &handle,
+TSP::TSP(raft::handle_t const &handle,
          int const *vtx_ptr,
          float const *x_pos,
          float const *y_pos,
@@ -50,56 +52,77 @@
     max_threads_(handle_.get_device_properties().maxThreadsPerBlock),
     warp_size_(handle_.get_device_properties().warpSize),
     sm_count_(handle_.get_device_properties().multiProcessorCount),
-    restart_batch_(4096)
+    restart_batch_(8192),
+    neighbors_vec_((k_ + 1) * nodes_, stream_),
+    work_vec_(restart_batch_ * ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_),
+              stream_),
+    best_x_pos_vec_(1, stream_),
+    best_y_pos_vec_(1, stream_),
+    best_route_vec_(1, stream_)
 {
-  allocate();
+  setup();
 }

-void TSP::allocate()
+void TSP::setup()
 {
-  // Scalars
-  mylock_    = mylock_scalar_.data();
-  best_tour_ = best_tour_scalar_.data();
-  climbs_    = climbs_scalar_.data();
+  mylock_ = mylock_scalar_.data();

-  // Vectors
-  neighbors_vec_.resize((k_ + 1) * nodes_);
+  neighbors_ = neighbors_vec_.data();

   // pre-allocate workspace for climbs, each block needs a separate permutation space and search
   // buffer. We allocate a work buffer that will store the computed distances, px, py and the route.
   // We align it on the warp size.
-  work_vec_.resize(sizeof(float) * restart_batch_ *
-                   ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
+  work_ = work_vec_.data();
+
+  results_.best_x_pos = best_x_pos_vec_.data();
+  results_.best_y_pos = best_y_pos_vec_.data();
+  results_.best_route = best_route_vec_.data();
+  results_.best_cost  = best_cost_scalar_.data();
+}

-  // Pointers
-  neighbors_ = neighbors_vec_.data().get();
-  work_      = work_vec_.data().get();
+void TSP::reset_batch()
+{
+  mylock_scalar_.set_value_zero(stream_);
+  auto const max{std::numeric_limits<int>::max()};
+  best_cost_scalar_.set_value(max, stream_);
+}
+
+void TSP::get_initial_solution(int const batch)
+{
+  if (!beam_search_) {
+    random_init<<<restart_batch_, best_thread_num_, 0, stream_>>>(
+      work_, x_pos_, y_pos_, vtx_ptr_, nstart_, nodes_, batch, restart_batch_);
+    CHECK_CUDA(stream_);
+  } else {
+    knn_init<<<restart_batch_, best_thread_num_, 0, stream_>>>(
+      work_, x_pos_, y_pos_, vtx_ptr_, neighbors_, nstart_, nodes_, k_, batch, restart_batch_);
+    CHECK_CUDA(stream_);
+  }
 }

 float TSP::compute()
 {
-  float valid_coo_dist = 0.f;
+  float final_cost        = 0.f;
   int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
   int restart_resid       = restarts_ - (num_restart_batches - 1) * restart_batch_;
-  int global_best = INT_MAX;
-  float *soln     = nullptr;
-  int *route_sol  = nullptr;
+  int global_best         = std::numeric_limits<int>::max();
   int best = 0;
+
   std::vector<float> h_x_pos;
   std::vector<float> h_y_pos;
+  std::vector<int> h_route;
   h_x_pos.reserve(nodes_ + 1);
   h_y_pos.reserve(nodes_ + 1);
-
-  // Stats
-  int n_timers      = 3;
-  long total_climbs = 0;
-  std::vector<float> h_times;
-  struct timeval starttime, endtime;
-
-  // KNN call
-  knn();
+  h_route.reserve(nodes_);
+  std::vector<float *> addr_best_x_pos(1);
+  std::vector<float *> addr_best_y_pos(1);
+  std::vector<int *> addr_best_route(1);
+  HighResTimer hr_timer;
+  auto create_timer = [&hr_timer, this](char const *name) {
+    return VerboseTimer(name, hr_timer, verbose_);
+  };

   if (verbose_) {
-    std::cout << "Doing " << num_restart_batches - 1 << " batches of size " << restart_batch_
+    std::cout << "Doing " << num_restart_batches << " batches of size " << restart_batch_
               << ", with " << restart_resid << " tail\n";
     std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restart\n";
     std::cout << "optimizing graph with kswap = " << kswaps << "\n";
@@ -107,82 +130,75 @@ float TSP::compute()

   // Tell the cache how we want it to behave
   cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual);
+  best_thread_num_ = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);

-  int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
-  if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
+  if (verbose_) std::cout << "Calculated best thread number = " << best_thread_num_ << "\n";

-  rmm::device_vector<float> times(n_timers * threads + n_timers);
-  h_times.reserve(n_timers * threads + n_timers);
+  if (beam_search_) {
+    auto timer = create_timer("knn");
+    knn();
+  }

-  gettimeofday(&starttime, NULL);
-  for (int b = 0; b < num_restart_batches; ++b) {
-    reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
-    CHECK_CUDA(stream_);
+  for (auto batch = 0; batch < num_restart_batches; ++batch) {
+    reset_batch();
+    if (batch == num_restart_batches - 1) restart_batch_ = restart_resid;

-    if (b == num_restart_batches - 1) restart_batch_ = restart_resid;
-
-    search_solution<<<restart_batch_, threads, 0, stream_>>>(mylock_,
-                                                             best_tour_,
-                                                             vtx_ptr_,
-                                                             beam_search_,
-                                                             k_,
-                                                             nodes_,
-                                                             neighbors_,
-                                                             x_pos_,
-                                                             y_pos_,
-                                                             work_,
-                                                             nstart_,
-                                                             times.data().get(),
-                                                             climbs_,
-                                                             threads);
+    {
+      auto timer = create_timer("initial_sol");
+
get_initial_solution(batch); + } - CHECK_CUDA(stream_); - cudaDeviceSynchronize(); + { + auto timer = create_timer("search_sol"); + search_solution<<>>( + results_, mylock_, vtx_ptr_, beam_search_, k_, nodes_, x_pos_, y_pos_, work_, nstart_); + CHECK_CUDA(stream_); + } + + { + auto timer = create_timer("optimal_tour"); + get_optimal_tour<<>>(results_, mylock_, work_, nodes_); + CHECK_CUDA(stream_); + } - CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); + best = best_cost_scalar_.value(stream_); + if (verbose_) std::cout << "Best reported by kernel = " << best << "\n"; if (best < global_best) { global_best = best; - CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *))); - cudaDeviceSynchronize(); - CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *))); - cudaDeviceSynchronize(); + + raft::update_host(addr_best_x_pos.data(), results_.best_x_pos, 1, stream_); + raft::update_host(addr_best_y_pos.data(), results_.best_y_pos, 1, stream_); + raft::update_host(addr_best_route.data(), results_.best_route, 1, stream_); + CUDA_TRY(cudaStreamSynchronize(stream_)); + + raft::copy(h_x_pos.data(), addr_best_x_pos[0], nodes_ + 1, stream_); + raft::copy(h_y_pos.data(), addr_best_y_pos[0], nodes_ + 1, stream_); + raft::copy(h_route.data(), addr_best_route[0], nodes_, stream_); + raft::copy(route_, addr_best_route[0], nodes_, stream_); + CHECK_CUDA(stream_); } - total_climbs += climbs_scalar_.value(stream_); - } - gettimeofday(&endtime, NULL); - double runtime = - endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6; - long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2; - - raft::copy(route_, route_sol, nodes_, stream_); - - CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - CUDA_TRY(cudaMemcpy( - h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - - for (int i = 0; i < nodes_; ++i) { - if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; } - valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1); } - CUDA_TRY(cudaMemcpy(h_times.data(), - times.data().get(), - sizeof(float) * n_timers * threads + n_timers, - cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); + for (auto i = 0; i < nodes_; ++i) { + if (verbose_) { std::cout << h_route[i] << ": " << h_x_pos[i] << " " << h_y_pos[i] << "\n"; } + final_cost += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1); + } if (verbose_) { - std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n"; + hr_timer.display(std::cout); std::cout << "Optimized tour length = " << global_best << "\n"; - print_times(h_times, n_timers, handle_.get_device(), threads); } - return valid_coo_dist; + return final_cost; } void TSP::knn() @@ -192,17 +208,17 @@ void TSP::knn() int dim = 2; bool row_major_order = false; - rmm::device_vector input(nodes_ * dim); - float *input_ptr = input.data().get(); + rmm::device_uvector input(nodes_ * dim, stream_); + float *input_ptr = input.data(); raft::copy(input_ptr, x_pos_, nodes_, stream_); raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_); - rmm::device_vector search_data(nodes_ * dim); - float *search_data_ptr = search_data.data().get(); + rmm::device_uvector search_data(nodes_ * dim, stream_); + float *search_data_ptr = search_data.data(); raft::copy(search_data_ptr, input_ptr, nodes_ * 
dim, stream_); - rmm::device_vector distances(nodes_ * (k_ + 1)); - float *distances_ptr = distances.data().get(); + rmm::device_uvector distances(nodes_ * (k_ + 1), stream_); + float *distances_ptr = distances.data(); std::vector input_vec; std::vector sizes_vec; @@ -226,7 +242,7 @@ void TSP::knn() } } // namespace detail -float traveling_salesperson(raft::handle_t &handle, +float traveling_salesperson(raft::handle_t const &handle, int const *vtx_ptr, float const *x_pos, float const *y_pos, diff --git a/cpp/src/traversal/tsp.hpp b/cpp/src/traversal/tsp.hpp index b065b779b96..1208f8c8790 100644 --- a/cpp/src/traversal/tsp.hpp +++ b/cpp/src/traversal/tsp.hpp @@ -16,18 +16,28 @@ #pragma once -#include -#include #include + +#include #include + +#include #include #include namespace cugraph { namespace detail { + +struct TSPResults { + float **best_x_pos; + float **best_y_pos; + int **best_route; + int *best_cost; +}; + class TSP { public: - TSP(raft::handle_t &handle, + TSP(raft::handle_t const &handle, int const *vtx_ptr, float const *x_pos, float const *y_pos, @@ -39,14 +49,16 @@ class TSP { bool verbose, int *route); - void allocate(); + void setup(); + void reset_batch(); + void get_initial_solution(int const batch); float compute(); void knn(); ~TSP(){}; private: // Config - raft::handle_t &handle_; + raft::handle_t const &handle_; cudaStream_t stream_; int max_blocks_; int max_threads_; @@ -54,6 +66,7 @@ class TSP { int sm_count_; // how large a grid we want to run, this is fixed int restart_batch_; + int best_thread_num_; // TSP int const *vtx_ptr_; @@ -69,20 +82,42 @@ class TSP { // Scalars rmm::device_scalar mylock_scalar_; - rmm::device_scalar best_tour_scalar_; - rmm::device_scalar climbs_scalar_; + rmm::device_scalar best_cost_scalar_; int *mylock_; - int *best_tour_; - int *climbs_; + int *best_cost_; // Vectors - rmm::device_vector neighbors_vec_; - rmm::device_vector work_vec_; + rmm::device_uvector neighbors_vec_; + rmm::device_uvector work_vec_; + rmm::device_uvector best_x_pos_vec_; + rmm::device_uvector best_y_pos_vec_; + rmm::device_uvector best_route_vec_; int64_t *neighbors_; int *work_; int *work_route_; + TSPResults results_; }; + +class VerboseTimer { + public: + VerboseTimer(char const *name, HighResTimer &hr_timer, bool verbose) + : name_(name), hr_timer_(hr_timer), verbose_(verbose) + { + if (verbose_) hr_timer_.start(name_); + } + + ~VerboseTimer() + { + if (verbose_) hr_timer_.stop(); + } + + private: + const char *name_; + HighResTimer &hr_timer_; + bool verbose_; +}; + } // namespace detail } // namespace cugraph diff --git a/cpp/src/traversal/tsp_solver.hpp b/cpp/src/traversal/tsp_solver.hpp index 20d826cac5c..c7b8cdaaf1d 100644 --- a/cpp/src/traversal/tsp_solver.hpp +++ b/cpp/src/traversal/tsp_solver.hpp @@ -29,29 +29,20 @@ namespace cugraph { namespace detail { -__device__ float *best_soln; -__device__ int *best_route; -extern __shared__ int shbuf[]; - -__global__ void reset(int *mylock, int *best_tour, int *climbs) -{ - *mylock = 0; - *best_tour = INT_MAX; - *climbs = 0; - best_soln = nullptr; - best_route = nullptr; -} - -// random permutation kernel -__device__ void random_init(float const *posx, +__global__ void random_init(int *work, + float const *posx, float const *posy, int const *vtx_ptr, - int *path, - float *px, - float *py, int const nstart, - int const nodes) + int const nodes, + int const batch, + int const restart_batch) { + int *buf = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)]; + float *px = (float *)(&buf[nodes]); + float *py = 
diff --git a/cpp/src/traversal/tsp_solver.hpp b/cpp/src/traversal/tsp_solver.hpp
index 20d826cac5c..c7b8cdaaf1d 100644
--- a/cpp/src/traversal/tsp_solver.hpp
+++ b/cpp/src/traversal/tsp_solver.hpp
@@ -29,29 +29,20 @@
 namespace cugraph {
 namespace detail {

-__device__ float *best_soln;
-__device__ int *best_route;
-extern __shared__ int shbuf[];
-
-__global__ void reset(int *mylock, int *best_tour, int *climbs)
-{
-  *mylock    = 0;
-  *best_tour = INT_MAX;
-  *climbs    = 0;
-  best_soln  = nullptr;
-  best_route = nullptr;
-}
-
-// random permutation kernel
-__device__ void random_init(float const *posx,
+__global__ void random_init(int *work,
+                            float const *posx,
                             float const *posy,
                             int const *vtx_ptr,
-                            int *path,
-                            float *px,
-                            float *py,
                             int const nstart,
-                            int const nodes)
+                            int const nodes,
+                            int const batch,
+                            int const restart_batch)
 {
+  int *buf  = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)];
+  float *px = (float *)(&buf[nodes]);
+  float *py = &px[nodes + 1];
+  int *path = (int *)(&py[nodes + 1]);
+
   // Fill values
   for (int i = threadIdx.x; i <= nodes; i += blockDim.x) {
     px[i] = posx[i];
@@ -60,15 +51,15 @@ __device__ void random_init(float const *posx,
   }

   __syncthreads();
-
-  if (threadIdx.x == 0) { /* serial permutation as starting point */
+  // serial permutation as starting point
+  if (threadIdx.x == 0) {
     // swap to start at nstart node
     raft::swapVals(px[0], px[nstart]);
     raft::swapVals(py[0], py[nstart]);
     raft::swapVals(path[0], path[nstart]);

     curandState rndstate;
-    curand_init(blockIdx.x, 0, 0, &rndstate);
+    curand_init(blockIdx.x + (restart_batch * batch), 0, 0, &rndstate);
     for (int i = 1; i < nodes; i++) {
       int j = curand(&rndstate) % (nodes - 1 - i) + i;
       if (i == j) continue;
@@ -76,34 +67,37 @@
       raft::swapVals(px[i], px[j]);
       raft::swapVals(py[i], py[j]);
       raft::swapVals(path[i], path[j]);
     }
-    px[nodes] = px[0]; /* close the loop now, avoid special cases later */
+    // close the loop now, avoid special cases later
+    px[nodes]   = px[0];
     py[nodes]   = py[0];
     path[nodes] = path[0];
   }
 }

-// Use KNN as a starting solution
-__device__ void knn_init(float const *posx,
+__global__ void knn_init(int *work,
+                         float const *posx,
                          float const *posy,
                          int const *vtx_ptr,
                          int64_t const *neighbors,
-                         int *buf,
-                         int *path,
-                         float *px,
-                         float *py,
                          int const nstart,
                          int const nodes,
-                         int const K)
+                         int const K,
+                         int const batch,
+                         int const restart_batch)
 {
+  int *buf  = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)];
+  float *px = (float *)(&buf[nodes]);
+  float *py = &px[nodes + 1];
+  int *path = (int *)(&py[nodes + 1]);
+
   for (int i = threadIdx.x; i < nodes; i += blockDim.x) buf[i] = 0;
   __syncthreads();

   if (threadIdx.x == 0) {
     curandState rndstate;
-    curand_init(blockIdx.x, 0, 0, &rndstate);
+    curand_init(blockIdx.x + (restart_batch * batch), 0, 0, &rndstate);
     int progress = 0;
-    int initlen  = 0;

     px[0] = posx[nstart];
     py[0] = posy[nstart];
@@ -114,7 +108,6 @@ __device__ void knn_init(float const *posx,
     while (progress < nodes - 1) {  // beam search as starting point
       for (int i = 1; i <= progress; i++) buf[i] = 0;
       progress = 0;  // reset current location in path and visited array
-      initlen  = 0;
       int randjumps = 0;
       while (progress < nodes - 1) {
         int nj = curand(&rndstate) % K;
@@ -146,13 +139,11 @@
         px[progress]   = posx[head];
         py[progress]   = posy[head];
         path[progress] = vtx_ptr[head];
-        initlen += __float2int_rn(euclidean_dist(px, py, progress, progress - 1));
       }
     }
     px[nodes]   = px[nstart];
     py[nodes]   = py[nstart];
     path[nodes] = path[nstart];
-    initlen += __float2int_rn(euclidean_dist(px, py, nodes, nstart));
   }
 }

@@ -211,10 +202,23 @@ __device__ void two_opt_search(
   }
 }

-// This function being runned for each block
-__device__ void hill_climbing(
-  float *px, float *py, int *buf, int *path, int *shbuf, int const nodes, int *climbs)
+__global__ __launch_bounds__(2048, 2) void search_solution(TSPResults results,
+                                                           int *mylock,
+                                                           int const *vtx_ptr,
+                                                           bool beam_search,
+                                                           int const K,
+                                                           int nodes,
+                                                           float const *posx,
+                                                           float const *posy,
+                                                           int *work,
+                                                           int const nstart)
 {
+  int *buf  = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)];
+  float *px = (float *)(&buf[nodes]);
+  float *py = &px[nodes + 1];
+  int *path = (int *)(&py[nodes + 1]);
+
+  __shared__ int shbuf[tilesize];
   __shared__ int best_change[kswaps];
   __shared__ int best_i[kswaps];
   __shared__ int best_j[kswaps];
@@ -249,9 +253,6 @@ __device__ void hill_climbing(
   two_opt_search(buf, px, py, shbuf, &minchange, &mini, &minj,
nodes); __syncthreads(); - // Stats only - if (threadIdx.x == 0) atomicAdd(climbs, 1); - shbuf[threadIdx.x] = minchange; int j = blockDim.x; // warp reduction to find best thread results @@ -338,77 +339,45 @@ __device__ void hill_climbing( } while (minchange < 0 && myswaps < 2 * nodes); } -__device__ void get_optimal_tour( - int *mylock, int *best_tour, float *px, float *py, int *path, int *shbuf, int const nodes) +__global__ void get_optimal_tour(TSPResults results, int *mylock, int *work, int const nodes) { + extern __shared__ int accumulator[]; + int climber_id = blockIdx.x; + int *buf = &work[climber_id * ((4 * nodes + 3 + 31) / 32 * 32)]; + float *px = (float *)(&buf[nodes]); + float *py = &px[nodes + 1]; + int *path = (int *)(&py[nodes + 1]); + // Now find actual length of the last tour, result of the climb int term = 0; for (int i = threadIdx.x; i < nodes; i += blockDim.x) { term += __float2int_rn(euclidean_dist(px, py, i, i + 1)); } - shbuf[threadIdx.x] = term; + accumulator[threadIdx.x] = term; __syncthreads(); int j = blockDim.x; // block level reduction do { int k = (j + 1) / 2; - if ((threadIdx.x + k) < j) { shbuf[threadIdx.x] += shbuf[threadIdx.x + k]; } + if ((threadIdx.x + k) < j) { accumulator[threadIdx.x] += accumulator[threadIdx.x + k]; } j = k; // divide active warp size in half __syncthreads(); } while (j > 1); - term = shbuf[0]; + term = accumulator[0]; if (threadIdx.x == 0) { - atomicMin(best_tour, term); + atomicMin(results.best_cost, term); while (atomicExch(mylock, 1) != 0) ; // acquire - if (best_tour[0] == term) { - best_soln = px; - best_route = path; + if (results.best_cost[0] == term) { + results.best_x_pos[0] = px; + results.best_y_pos[0] = py; + results.best_route[0] = path; } *mylock = 0; // release __threadfence(); } } -__global__ __launch_bounds__(2048, 2) void search_solution(int *mylock, - int *best_tour, - int const *vtx_ptr, - bool beam_search, - int const K, - int nodes, - int64_t const *neighbors, - float const *posx, - float const *posy, - int *work, - int const nstart, - float *times, - int *climbs, - int threads) -{ - int *buf = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)]; - float *px = (float *)(&buf[nodes]); - float *py = &px[nodes + 1]; - int *path = (int *)(&py[nodes + 1]); - __shared__ int shbuf[tilesize]; - clock_t start; - - start = clock64(); - if (!beam_search) - random_init(posx, posy, vtx_ptr, path, px, py, nstart, nodes); - else - knn_init(posx, posy, vtx_ptr, neighbors, buf, path, px, py, nstart, nodes, K); - __syncthreads(); - times[threadIdx.x] = clock64() - start; - - start = clock64(); - hill_climbing(px, py, buf, path, shbuf, nodes, climbs); - __syncthreads(); - times[threads + threadIdx.x + 1] = clock64() - start; - - start = clock64(); - get_optimal_tour(mylock, best_tour, px, py, path, shbuf, nodes); - times[2 * threads + threadIdx.x + 1] = clock64() - start; -} } // namespace detail } // namespace cugraph diff --git a/cpp/src/traversal/tsp_utils.hpp b/cpp/src/traversal/tsp_utils.hpp index 3faa2efea3b..2a3445f1c81 100644 --- a/cpp/src/traversal/tsp_utils.hpp +++ b/cpp/src/traversal/tsp_utils.hpp @@ -26,34 +26,11 @@ namespace cugraph { namespace detail { -__host__ __device__ inline float euclidean_dist(float *px, float *py, int a, int b) +constexpr float euclidean_dist(float *px, float *py, int a, int b) { return sqrtf((px[a] - px[b]) * (px[a] - px[b]) + (py[a] - py[b]) * (py[a] - py[b])); } -static std::vector device_func = {"Find First", "Hill Climbing", "Retrieve Path"}; - -void print_times(std::vector &h_times, 
int const n_timers, int device, int threads) -{ - int clock_rate; - cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device); - - double total = 0; - h_times[0] /= (float)clock_rate; - total += h_times[0]; - for (int i = 1; i < n_timers; ++i) { - h_times[i * threads + 1] /= (float)clock_rate; - total += h_times[i * threads + 1]; - } - std::cout << "Stats: \n"; - std::cout << device_func[0] << " time: " << h_times[0] * 1e-3 << " " - << (h_times[0] / total) * 100.0 << "%\n"; - for (int i = 1; i < n_timers; ++i) { - std::cout << device_func[i] << " time: " << h_times[i * threads + 1] * 1e-3 << " " - << (h_times[i * threads + 1] / total) * 100.0 << "%\n"; - } -} - // Get maximum number of threads we can run on based on number of nodes, // shared memory usage, max threads per block and SM, max blocks for SM and registers per SM. int best_thread_count(int nodes, int max_threads, int sm_count, int warp_size) diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 5382b4856f3..4a2b98ea815 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -20,22 +20,101 @@ #include #include #include -#include #include #include #include #include +#include +#include + #include +#include +#include #include +#include #include +#include +#include + +#include +#include namespace cugraph { namespace cython { namespace detail { -// FIXME: Add description of this function +// workaround for CUDA extended lambda restrictions +template +struct compute_local_partition_id_t { + vertex_t const* lasts{nullptr}; + size_t num_local_partitions{0}; + + __device__ size_t operator()(vertex_t v) + { + for (size_t i = 0; i < num_local_partitions; ++i) { + if (v < lasts[i]) { return i; } + } + return num_local_partitions; + } +}; + +// FIXME: this is unnecessary if edge_counts_ in the major_minor_weights_t object returned by +// call_shuffle() is passed back, better be fixed. this code assumes that the entire set of edges +// for each partition are consecutively stored. +template +std::vector compute_edge_counts(raft::handle_t const& handle, + graph_container_t const& graph_container) +{ + auto num_local_partitions = static_cast(graph_container.col_comm_size); + + std::vector partition_offsets_vector( + reinterpret_cast(graph_container.vertex_partition_offsets), + reinterpret_cast(graph_container.vertex_partition_offsets) + + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + + std::vector h_lasts(num_local_partitions); + for (size_t i = 0; i < h_lasts.size(); ++i) { + h_lasts[i] = partition_offsets_vector[graph_container.row_comm_size * (i + 1)]; + } + rmm::device_uvector d_lasts(h_lasts.size(), handle.get_stream()); + raft::update_device(d_lasts.data(), h_lasts.data(), h_lasts.size(), handle.get_stream()); + auto major_vertices = transposed + ? 
reinterpret_cast(graph_container.dst_vertices) + : reinterpret_cast(graph_container.src_vertices); + auto key_first = thrust::make_transform_iterator( + major_vertices, compute_local_partition_id_t{d_lasts.data(), num_local_partitions}); + rmm::device_uvector d_local_partition_ids(num_local_partitions, handle.get_stream()); + rmm::device_uvector d_edge_counts(d_local_partition_ids.size(), handle.get_stream()); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + key_first, + key_first + graph_container.num_local_edges, + thrust::make_constant_iterator(edge_t{1}), + d_local_partition_ids.begin(), + d_edge_counts.begin()); + if (static_cast(thrust::distance(d_local_partition_ids.begin(), thrust::get<0>(it))) < + num_local_partitions) { + rmm::device_uvector d_counts(num_local_partitions, handle.get_stream()); + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_counts.begin(), + d_counts.end(), + edge_t{0}); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_edge_counts.begin(), + thrust::get<1>(it), + d_local_partition_ids.begin(), + d_counts.begin()); + d_edge_counts = std::move(d_counts); + } + std::vector h_edge_counts(num_local_partitions, 0); + raft::update_host( + h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + return h_edge_counts; +} + template > create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) { - std::vector> edgelist( - {{reinterpret_cast(graph_container.src_vertices), - reinterpret_cast(graph_container.dst_vertices), - reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}}); + auto num_local_partitions = static_cast(graph_container.col_comm_size); std::vector partition_offsets_vector( reinterpret_cast(graph_container.vertex_partition_offsets), reinterpret_cast(graph_container.vertex_partition_offsets) + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + auto edge_counts = compute_edge_counts(handle, graph_container); + + std::vector displacements(edge_counts.size(), 0); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + + std::vector> edgelists( + num_local_partitions); + for (size_t i = 0; i < edgelists.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + reinterpret_cast(graph_container.src_vertices) + displacements[i], + reinterpret_cast(graph_container.dst_vertices) + displacements[i], + graph_container.graph_props.is_weighted + ? reinterpret_cast(graph_container.weights) + displacements[i] + : static_cast(nullptr), + edge_counts[i]}; + } + experimental::partition_t partition(partition_offsets_vector, - graph_container.hypergraph_partitioned, graph_container.row_comm_size, graph_container.col_comm_size, graph_container.row_comm_rank, @@ -65,14 +156,12 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai return std::make_unique>( handle, - edgelist, + edgelists, partition, static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - // FIXME: This currently fails if sorted_by_degree is true... 
- // graph_container.sorted_by_degree, - false, + true, graph_container.do_expensive_check); } @@ -89,7 +178,7 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai reinterpret_cast(graph_container.src_vertices), reinterpret_cast(graph_container.dst_vertices), reinterpret_cast(graph_container.weights), - static_cast(graph_container.num_partition_edges)}; + static_cast(graph_container.num_local_edges)}; return std::make_unique>( handle, edgelist, @@ -113,10 +202,11 @@ void populate_graph_container(graph_container_t& graph_container, numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) { @@ -124,7 +214,6 @@ void populate_graph_container(graph_container_t& graph_container, "populate_graph_container() can only be called on an empty container."); bool do_expensive_check{true}; - bool hypergraph_partitioned{false}; if (multi_gpu) { auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -143,7 +232,7 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.src_vertices = src_vertices; graph_container.dst_vertices = dst_vertices; graph_container.weights = weights; - graph_container.num_partition_edges = num_partition_edges; + graph_container.num_local_edges = num_local_edges; graph_container.num_global_vertices = num_global_vertices; graph_container.num_global_edges = num_global_edges; graph_container.vertexType = vertexType; @@ -151,11 +240,11 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.weightType = weightType; graph_container.transposed = transposed; graph_container.is_multi_gpu = multi_gpu; - graph_container.hypergraph_partitioned = hypergraph_partitioned; graph_container.sorted_by_degree = sorted_by_degree; graph_container.do_expensive_check = do_expensive_check; - experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + experimental::graph_properties_t graph_props{ + .is_symmetric = false, .is_multigraph = false, .is_weighted = is_weighted}; graph_container.graph_props = graph_props; graph_container.graph_type = graphTypeEnum::graph_t; @@ -177,7 +266,7 @@ void populate_graph_container_legacy(graph_container_t& graph_container, int* local_offsets) { CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, - "populate_graph_container() can only be called on an empty container."); + "populate_graph_container_legacy() can only be called on an empty container."); // FIXME: This is soon-to-be legacy code left in place until the new graph_t // class is supported everywhere else. Remove everything down to the comment @@ -696,6 +785,61 @@ std::unique_ptr call_egonet(raft::handle_t const& handle, } } +// Wrapper for random_walks() through a graph container +// to expose the API to cython. 
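As with the other cython entry points in this file, the weight type is only known at run time (from `graph_container.weightType`), so `call_random_walks` branches on the enum and instantiates a typed code path per case. A stripped-down, self-contained sketch of that dispatch idiom follows; the names here are illustrative stand-ins, not the cuGraph API:

```cpp
#include <cstdio>
#include <stdexcept>

enum class numberTypeEnum { floatType, doubleType };

template <typename weight_t>
void run_walks_typed()
{
  // Stand-in for the real work: create_graph<..., weight_t, ...>() followed
  // by cugraph::experimental::random_walks(...).
  std::printf("dispatched with sizeof(weight_t) = %zu\n", sizeof(weight_t));
}

void run_walks(numberTypeEnum weight_type)
{
  // Runtime type tag -> compile-time template parameter, mirroring the
  // weightType switch in call_random_walks below.
  switch (weight_type) {
    case numberTypeEnum::floatType: run_walks_typed<float>(); break;
    case numberTypeEnum::doubleType: run_walks_typed<double>(); break;
    default: throw std::runtime_error("Unsupported weight type.");
  }
}
```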
+// +template +std::enable_if_t::value, + std::unique_ptr> +call_random_walks(raft::handle_t const& handle, + graph_container_t const& graph_container, + vertex_t const* ptr_start_set, + edge_t num_paths, + edge_t max_depth) +{ + if (graph_container.weightType == numberTypeEnum::floatType) { + using weight_t = float; + + auto graph = + detail::create_graph(handle, graph_container); + + auto triplet = cugraph::experimental::random_walks( + handle, graph->view(), ptr_start_set, num_paths, max_depth); + + random_walk_ret_t rw_tri{std::get<0>(triplet).size(), + std::get<1>(triplet).size(), + static_cast(num_paths), + static_cast(max_depth), + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::make_unique(std::get<2>(triplet).release())}; + + return std::make_unique(std::move(rw_tri)); + + } else if (graph_container.weightType == numberTypeEnum::doubleType) { + using weight_t = double; + + auto graph = + detail::create_graph(handle, graph_container); + + auto triplet = cugraph::experimental::random_walks( + handle, graph->view(), ptr_start_set, num_paths, max_depth); + + random_walk_ret_t rw_tri{std::get<0>(triplet).size(), + std::get<1>(triplet).size(), + static_cast(num_paths), + static_cast(max_depth), + std::make_unique(std::get<0>(triplet).release()), + std::make_unique(std::get<1>(triplet).release()), + std::make_unique(std::get<2>(triplet).release())}; + + return std::make_unique(std::move(rw_tri)); + + } else { + CUGRAPH_FAIL("Unsupported weight type."); + } +} + // Wrapper for calling SSSP through a graph container template void call_sssp(raft::handle_t const& handle, @@ -747,23 +891,23 @@ void call_sssp(raft::handle_t const& handle, // wrapper for shuffling: // template -std::unique_ptr> call_shuffle( +std::unique_ptr> call_shuffle( raft::handle_t const& handle, vertex_t* edgelist_major_vertices, // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place vertex_t* edgelist_minor_vertices, // [IN / OUT] weight_t* edgelist_weights, // [IN / OUT] - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned) // = false + edge_t num_edgelist_edges) { - auto& comm = handle.get_comms(); - - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - - std::unique_ptr> ptr_ret = - std::make_unique>(handle); + std::unique_ptr> ptr_ret = + std::make_unique>(handle); if (edgelist_weights != nullptr) { auto zip_edge = thrust::make_zip_iterator( @@ -778,10 +922,7 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); @@ -797,15 +938,46 @@ std::unique_ptr> call_shuffle( zip_edge + num_edgelist_edges, [key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - is_hypergraph_partitioned, - 
comm.get_size(), - row_comm.get_size(), - col_comm.get_size()}] __device__(auto val) { + comm.get_size(), row_comm.get_size(), col_comm.get_size()}] __device__(auto val) { return key_func(thrust::get<0>(val), thrust::get<1>(val)); }, handle.get_stream()); } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(ptr_ret->get_major().data(), ptr_ret->get_minor().data())); + + auto edge_counts = + (edgelist_weights != nullptr) + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + ptr_ret->get_weights().data(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + ptr_ret->get_major().size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + ptr_ret->get_edge_counts().resize(h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + ptr_ret->get_edge_counts()[i] = static_cast(h_edge_counts[i]); + } + return ptr_ret; // RVO-ed } @@ -817,8 +989,7 @@ std::unique_ptr> call_renumber( raft::handle_t const& handle, vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */, vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - edge_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu) // bc. 
cython cannot take non-type template params { @@ -828,33 +999,31 @@ std::unique_ptr> call_renumber( std::make_unique>(handle); if (multi_gpu) { + std::vector displacements(edge_counts.size(), edge_t{0}); + std::partial_sum(edge_counts.begin(), edge_counts.end() - 1, displacements.begin() + 1); + std::vector major_ptrs(edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + for (size_t i = 0; i < edge_counts.size(); ++i) { + major_ptrs[i] = shuffled_edgelist_major_vertices + displacements[i]; + minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; + } + std::tie( p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) = cugraph::experimental::renumber_edgelist( - handle, - shuffled_edgelist_major_vertices, - shuffled_edgelist_minor_vertices, - num_edgelist_edges, - is_hypergraph_partitioned, - do_expensive_check); + handle, major_ptrs, minor_ptrs, edge_counts, do_expensive_check); } else { - auto ret_f = cugraph::experimental::renumber_edgelist( + p_ret->get_dv() = cugraph::experimental::renumber_edgelist( handle, shuffled_edgelist_major_vertices, shuffled_edgelist_minor_vertices, - num_edgelist_edges, + edge_counts[0], do_expensive_check); - auto tot_vertices = static_cast(ret_f.size()); - - p_ret->get_dv() = std::move(ret_f); - cugraph::experimental::partition_t part_sg( - std::vector{0, tot_vertices}, false, 1, 1, 0, 0); + p_ret->get_partition() = cugraph::experimental::partition_t{}; // dummy - p_ret->get_partition() = std::move(part_sg); - - p_ret->get_num_vertices() = tot_vertices; - p_ret->get_num_edges() = num_edgelist_edges; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; } return p_ret; // RVO-ed (copy ellision) @@ -1038,6 +1207,27 @@ template std::unique_ptr call_egonet( int64_t n_subgraphs, int64_t radius); +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t const* ptr_start_set, + int32_t num_paths, + int32_t max_depth); + +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int32_t const* ptr_start_set, + int64_t num_paths, + int64_t max_depth); + +template std::unique_ptr call_random_walks( + raft::handle_t const& handle, + graph_container_t const& graph_container, + int64_t const* ptr_start_set, + int64_t num_paths, + int64_t max_depth); + template void call_sssp(raft::handle_t const& handle, graph_container_t const& graph_container, int32_t* identifiers, @@ -1066,53 +1256,47 @@ template void call_sssp(raft::handle_t const& handle, int64_t* predecessors, const int64_t source_vertex); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int32_t num_edgelist_edges, 
- bool is_hypergraph_partitioned); + int32_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int32_t* edgelist_major_vertices, int32_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, float* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); -template std::unique_ptr> call_shuffle( +template std::unique_ptr> call_shuffle( raft::handle_t const& handle, int64_t* edgelist_major_vertices, int64_t* edgelist_minor_vertices, double* edgelist_weights, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned); + int64_t num_edgelist_edges); // TODO: add the remaining relevant EIDIr's: // @@ -1120,8 +1304,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int32_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1129,8 +1312,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int32_t* shuffled_edgelist_major_vertices /* [INOUT] */, int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); @@ -1138,8 +1320,7 @@ template std::unique_ptr> call_renumber( raft::handle_t const& handle, int64_t* shuffled_edgelist_major_vertices /* [INOUT] */, int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */, - int64_t num_edgelist_edges, - bool is_hypergraph_partitioned, + std::vector const& edge_counts, bool do_expensive_check, bool multi_gpu); diff --git a/cpp/src/utilities/path_retrieval.cu b/cpp/src/utilities/path_retrieval.cu new file mode 100644 index 00000000000..93ead5898f8 --- /dev/null +++ b/cpp/src/utilities/path_retrieval.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
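One idiom recurs throughout the `create_graph`, `call_shuffle`, and `call_renumber` changes above: per-partition edge counts are converted into starting offsets with `std::partial_sum` written one slot to the right, i.e. an exclusive prefix sum. A self-contained sketch (illustrative only; the element type is a placeholder for `edge_t`):

```cpp
#include <numeric>
#include <vector>

// counts = {c0, c1, c2} -> displacements = {0, c0, c0 + c1}; partition i then
// occupies [displacements[i], displacements[i] + counts[i]) in the edge list.
std::vector<long> displacements_from_counts(std::vector<long> const &counts)
{
  std::vector<long> displacements(counts.size(), 0);
  if (counts.size() > 1) {
    std::partial_sum(counts.begin(), counts.end() - 1, displacements.begin() + 1);
  }
  return displacements;
}
```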
+ */
+
+#include <utilities/error.hpp>
+#include <utilities/path_retrieval.hpp>
+
+#include <raft/handle.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/thrust_rmm_allocator.h>
+
+namespace cugraph {
+namespace detail {
+
+template <typename vertex_t, typename weight_t>
+__global__ void get_traversed_cost_kernel(vertex_t const *vertices,
+                                          vertex_t const *preds,
+                                          vertex_t const *vtx_map,
+                                          weight_t const *info_weights,
+                                          weight_t *out,
+                                          vertex_t stop_vertex,
+                                          vertex_t num_vertices)
+{
+  for (vertex_t i = threadIdx.x + blockIdx.x * blockDim.x; i < num_vertices;
+       i += gridDim.x * blockDim.x) {
+    weight_t sum  = info_weights[i];
+    vertex_t pred = preds[i];
+    while (pred != stop_vertex) {
+      vertex_t pos = vtx_map[pred];
+      sum += info_weights[pos];
+      pred = preds[pos];
+    }
+    out[i] = sum;
+  }
+}
+
+template <typename vertex_t, typename weight_t>
+void get_traversed_cost_impl(raft::handle_t const &handle,
+                             vertex_t const *vertices,
+                             vertex_t const *preds,
+                             weight_t const *info_weights,
+                             weight_t *out,
+                             vertex_t stop_vertex,
+                             vertex_t num_vertices)
+{
+  auto stream          = handle.get_stream();
+  vertex_t max_blocks  = handle.get_device_properties().maxGridSize[0];
+  vertex_t max_threads = handle.get_device_properties().maxThreadsPerBlock;
+
+  dim3 nthreads, nblocks;
+  nthreads.x = std::min(num_vertices, max_threads);
+  nthreads.y = 1;
+  nthreads.z = 1;
+  nblocks.x  = std::min((num_vertices + nthreads.x - 1) / nthreads.x, max_blocks);
+  nblocks.y  = 1;
+  nblocks.z  = 1;
+
+  rmm::device_uvector<vertex_t> vtx_map_v(num_vertices, stream);
+  rmm::device_uvector<vertex_t> vtx_keys_v(num_vertices, stream);
+  vertex_t *vtx_map  = vtx_map_v.data();
+  vertex_t *vtx_keys = vtx_keys_v.data();
+  raft::copy(vtx_keys, vertices, num_vertices, stream);
+
+  thrust::sequence(rmm::exec_policy(stream)->on(stream), vtx_map, vtx_map + num_vertices);
+
+  thrust::stable_sort_by_key(
+    rmm::exec_policy(stream)->on(stream), vtx_keys, vtx_keys + num_vertices, vtx_map);
+
+  get_traversed_cost_kernel<<<nblocks, nthreads>>>(
+    vertices, preds, vtx_map, info_weights, out, stop_vertex, num_vertices);
+}
+}  // namespace detail
+
+template <typename vertex_t, typename weight_t>
+void get_traversed_cost(raft::handle_t const &handle,
+                        vertex_t const *vertices,
+                        vertex_t const *preds,
+                        weight_t const *info_weights,
+                        weight_t *out,
+                        vertex_t stop_vertex,
+                        vertex_t num_vertices)
+{
+  CUGRAPH_EXPECTS(num_vertices > 0, "num_vertices should be strictly positive");
+  CUGRAPH_EXPECTS(out != nullptr, "out should be of size num_vertices");
+  cugraph::detail::get_traversed_cost_impl(
+    handle, vertices, preds, info_weights, out, stop_vertex, num_vertices);
+}
+
+template void get_traversed_cost(raft::handle_t const &handle,
+                                 int32_t const *vertices,
+                                 int32_t const *preds,
+                                 float const *info_weights,
+                                 float *out,
+                                 int32_t stop_vertex,
+                                 int32_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const &handle,
+                                 int32_t const *vertices,
+                                 int32_t const *preds,
+                                 double const *info_weights,
+                                 double *out,
+                                 int32_t stop_vertex,
+                                 int32_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const &handle,
+                                 int64_t const *vertices,
+                                 int64_t const *preds,
+                                 float const *info_weights,
+                                 float *out,
+                                 int64_t stop_vertex,
+                                 int64_t num_vertices);
+
+template void get_traversed_cost(raft::handle_t const &handle,
+                                 int64_t const *vertices,
+                                 int64_t const *preds,
+                                 double const *info_weights,
+                                 double *out,
+                                 int64_t stop_vertex,
+                                 int64_t num_vertices);
+}  // namespace cugraph
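For clarity, here is an illustrative host-side rendering of what `get_traversed_cost` computes (a sketch, not part of the patch): each output is the sum of `info_weights` along the predecessor chain back to `stop_vertex`, and `vtx_map` is the table that turns a predecessor's vertex id into its position in the input arrays. Like the kernel, this assumes the vertex ids are a permutation of 0..n-1, which is exactly what makes the `vtx_map[pred]` lookup valid.

```cpp
#include <algorithm>
#include <numeric>
#include <vector>

// Host-side reference of the kernel above (illustrative only).
template <typename vertex_t, typename weight_t>
std::vector<weight_t> traversed_cost_reference(std::vector<vertex_t> const &vertices,
                                               std::vector<vertex_t> const &preds,
                                               std::vector<weight_t> const &info_weights,
                                               vertex_t stop_vertex)
{
  size_t n = vertices.size();

  // vtx_map[v] = position of vertex id v in 'vertices'; the sequence +
  // stable_sort_by_key pair in get_traversed_cost_impl builds the same table.
  std::vector<vertex_t> vtx_map(n);
  std::iota(vtx_map.begin(), vtx_map.end(), vertex_t{0});
  std::stable_sort(vtx_map.begin(), vtx_map.end(), [&](vertex_t a, vertex_t b) {
    return vertices[a] < vertices[b];
  });

  std::vector<weight_t> out(n);
  for (size_t i = 0; i < n; ++i) {
    weight_t sum  = info_weights[i];
    vertex_t pred = preds[i];
    while (pred != stop_vertex) {
      vertex_t pos = vtx_map[pred];  // position of 'pred' in the input arrays
      sum += info_weights[pos];
      pred = preds[pos];
    }
    out[i] = sum;
  }
  return out;
}
```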
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 5571cf5f124..89975f673ae 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -20,9 +20,10 @@
# - common test utils -----------------------------------------------------------------------------

 add_library(cugraphtestutil STATIC
-            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/matrix_market_file_utilities.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/rmat_utilities.cu"
+            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/generate_graph_from_edgelist.cu"
+            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/thrust_wrapper.cu"
             "${CMAKE_CURRENT_SOURCE_DIR}/utilities/misc_utilities.cpp"
             "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c")

@@ -318,6 +319,13 @@ set(MST_TEST_SRC

 ConfigureTest(MST_TEST "${MST_TEST_SRC}")

+###################################################################################################
+# - Experimental stream tests ----------------------------------------------------
+
+set(EXPERIMENTAL_STREAM_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/streams.cu")
+
+ConfigureTest(EXPERIMENTAL_STREAM "${EXPERIMENTAL_STREAM_SRCS}" "")

 ###################################################################################################
 # - Experimental R-mat graph generation tests -----------------------------------------------------

@@ -375,6 +383,15 @@ set(EXPERIMENTAL_BFS_TEST_SRCS

 ConfigureTest(EXPERIMENTAL_BFS_TEST "${EXPERIMENTAL_BFS_TEST_SRCS}")

+###################################################################################################
+# - Experimental multi-source BFS tests ------------------------------------------------------------
+
+set(EXPERIMENTAL_MSBFS_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/ms_bfs_test.cpp")
+
+ConfigureTest(EXPERIMENTAL_MSBFS_TEST "${EXPERIMENTAL_MSBFS_TEST_SRCS}")
+
+
 ###################################################################################################
 # - Experimental SSSP tests -----------------------------------------------------------------------

@@ -391,14 +408,6 @@ set(EXPERIMENTAL_PAGERANK_TEST_SRCS

 ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}")

-###################################################################################################
-# - Experimental LOUVAIN tests -------------------------------------------------------------------
-
-set(EXPERIMENTAL_LOUVAIN_TEST_SRCS
-    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/louvain_test.cu")
-
-ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}")
-
 ###################################################################################################
 # - Experimental KATZ_CENTRALITY tests ------------------------------------------------------------

@@ -407,6 +416,20 @@ set(EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS

 ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}")

+###################################################################################################
+# - Experimental RANDOM_WALKS tests ------------------------------------------------------------
+
+set(EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/random_walks_test.cu")
+
+ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_TEST "${EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS}")
+
+###################################################################################################
+# - Experimental RANDOM_WALKS low-level tests ------------------------------------------------------
+
+set(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/rw_low_level_test.cu")
+
+ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_TEST "${EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS}")
+
 ###################################################################################################
 # - MG tests 
-------------------------------------------------------------------------------------- @@ -422,6 +445,43 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_PAGERANK_TEST "${MG_PAGERANK_TEST_SRCS}") target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + ########################################################################################### + # - MG KATZ CENTRALITY tests -------------------------------------------------------------- + + set(MG_KATZ_CENTRALITY_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_katz_centrality_test.cpp") + + ConfigureTest(MG_KATZ_CENTRALITY_TEST "${MG_KATZ_CENTRALITY_TEST_SRCS}") + target_link_libraries(MG_KATZ_CENTRALITY_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG BFS tests -------------------------------------------------------------------------- + + set(MG_BFS_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_bfs_test.cpp") + + ConfigureTest(MG_BFS_TEST "${MG_BFS_TEST_SRCS}") + target_link_libraries(MG_BFS_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG SSSP tests ------------------------------------------------------------------------- + + set(MG_SSSP_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/mg_sssp_test.cpp") + + ConfigureTest(MG_SSSP_TEST "${MG_SSSP_TEST_SRCS}") + target_link_libraries(MG_SSSP_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + + ########################################################################################### + # - MG LOUVAIN tests ---------------------------------------------------------------------- + + set(MG_LOUVAIN_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_helper.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_test.cpp") + + ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") + target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") endif(MPI_CXX_FOUND) diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu index a9224b42bc1..d61080c685e 100644 --- a/cpp/tests/community/egonet_test.cu +++ b/cpp/tests/community/egonet_test.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -129,8 +128,10 @@ class Tests_InducedEgo : public ::testing::TestWithParam { ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] <= h_cugraph_ego_edge_offsets[i + 1]); auto n_vertices = graph_view.get_number_of_vertices(); for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) { - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); - ASSERT_TRUE(cugraph::test::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_src[i])); + ASSERT_TRUE( + cugraph::experimental::is_valid_vertex(n_vertices, h_cugraph_ego_edgelist_dst[i])); } /* @@ -182,150 +183,141 @@ INSTANTIATE_TEST_CASE_P( // For perf analysis /* INSTANTIATE_TEST_CASE_P( -simple_test, -Tests_InducedEgo, -::testing::Values( -InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 1, false), -InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 2, false), -InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 3, false), -InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 4, false), 
-InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 5, false), -InducedEgo_Usecase( -"test/datasets/soc-LiveJournal1.mtx", std::vector{363617}, 2, false), -InducedEgo_Usecase( -"test/datasets/soc-LiveJournal1.mtx", -std::vector{ - 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755}, - 2, - false), - InducedEgo_Usecase( - "test/datasets/soc-LiveJournal1.mtx", - std::vector{ - 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, - 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, - 3341686, 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, - 1213033, 4840102, 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, - 320953, 2388331, 520808, 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, - 847662, 3277365, 3957318, 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, - 1163406, 3109528, 3221856, 4714426, 2382774, 37828, 4433616, 3283229, 591911, - 4200188, 442522, 872207, 2437601, 741003, 266241, 914618, 3626195, 2021080, - 4679624, 777476, 2527796, 1114017, 640142, 49259, 4069879, 3869098, 1105040, - 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, 2029646, 4575891, 1488598, 79105, - 4827273, 3795434, 4647518, 4733397, 3980718, 1184627}, - 2, - false), - InducedEgo_Usecase( - "test/datasets/soc-LiveJournal1.mtx", - std::vector{ - 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, - 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, - 3341686, 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, - 1213033, 4840102, 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, - 320953, 2388331, 520808, 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, - 847662, 3277365, 3957318, 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, - 1163406, 3109528, 3221856, 4714426, 2382774, 37828, 4433616, 3283229, 591911, - 4200188, 442522, 872207, 2437601, 741003, 266241, 914618, 3626195, 2021080, - 4679624, 777476, 2527796, 1114017, 640142, 49259, 4069879, 3869098, 1105040, - 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, 2029646, 4575891, 1488598, 79105, - 4827273, 3795434, 4647518, 4733397, 3980718, 1184627, 984983, 3114832, 1967741, - 1599818, 144593, 2698770, 2889449, 2495550, 1053813, 1193622, 686026, 3989015, - 2040719, 4693428, 3190376, 2926728, 3399030, 1664419, 662429, 4526841, 2186957, - 3752558, 2440046, 2930226, 3633006, 4058166, 3137060, 3499296, 2126343, 148971, - 2199672, 275811, 2813976, 2274536, 1189239, 1335942, 2465624, 2596042, 829684, 193400, - 2682845, 3691697, 4022437, 4051170, 4195175, 2876420, 3984220, 2174475, 326134, - 2606530, 2493046, 4706121, 1498980, 4576225, 1271339, 44832, 1875673, 4664940, - 134931, 736397, 4333554, 2751031, 2163610, 2879676, 3174153, 3317403, 2052464, - 1881883, 4757859, 3596257, 2358088, 2578758, 447504, 590720, 1717038, 1869795, - 1133885, 3027521, 840312, 2818881, 3654321, 2730947, 353585, 1134903, 2223378, - 1508824, 3662521, 1363776, 2712071, 288441, 1204581, 3502242, 4645567, 2767267, - 1514366, 3956099, 1422145, 1216608, 2253360, 189132, 4238225, 1345783, 451571, 1599442, - 3237284, 4711405, 929446, 1857675, 150759, 1277633, 761210, 138628, 1026833, - 2599544, 2464737, 989203, 3399615, 2144292, 216142, 637312, 2044964, 716256, 1660632, - 1762919, 4784357, 2213415, 2764769, 291806, 609772, 3264819, 1870953, 1516385, - 235647, 1045474, 2664957, 819095, 1824119, 
4045271, 4448109, 1676788, 4285177, - 1580502, 3546548, 2771971, 3927086, 1339779, 3156204, 1730998, 1172522, 2433024, - 4533449, 479930, 2010695, 672994, 3542039, 3176455, 26352, 2137735, 866910, - 4410835, 2623982, 3603159, 2555625, 2765653, 267865, 2015523, 1009052, 4713994, - 1600667, 2176195, 3179631, 4570390, 2018424, 3356384, 1784287, 894861, 3622099, - 1647273, 3044136, 950354, 1491760, 3416929, 3757300, 2244912, 4129215, 1600848, - 3867343, 72329, 919189, 992521, 3445975, 4712557, 4680974, 188419, 2612093, - 1991268, 3566207, 2281468, 3859078, 2492806, 3398628, 763441, 2679107, 2554420, - 2130132, 4664374, 1182901, 3890770, 4714667, 4209303, 4013060, 3617653, 2040022, - 3296519, 4190671, 1693353, 2678411, 3788834, 2781815, 191965, 1083926, 503974, 3529226, - 1650522, 1900976, 542080, 3423929, 3418905, 878165, 4701703, 3022790, 4316365, 76365, - 4053672, 1358185, 3830478, 4445661, 3210024, 1895915, 4541133, 2938808, 562788, - 3920065, 1458776, 4052046, 2967475, 1092809, 3203538, 159626, 3399464, 214467, - 3343982, 1811854, 3189045, 4272117, 4701563, 424807, 4341116, 760545, 4674683, - 1538018, 386762, 194237, 2162719, 1694433, 943728, 2389036, 2196653, 3085571, - 1513424, 3689413, 3278747, 4197291, 3324063, 3651090, 1737936, 2768803, 2768889, - 3108096, 4311775, 3569480, 886705, 733256, 2477493, 1735412, 2960895, 1983781, - 1861797, 3566460, 4537673, 1164093, 3499764, 4553071, 3518985, 847658, 918948, - 2922351, 1056144, 652895, 1013195, 780505, 1702928, 3562838, 1432719, 2405207, - 1054920, 641647, 2240939, 3617702, 383165, 652641, 879593, 1810739, 2096385, - 4497865, 4768530, 1743968, 3582014, 1025009, 3002122, 2422190, 527647, 1251821, - 2571153, 4095874, 3705333, 3637407, 1385567, 4043855, 4041930, 2433139, 1710383, - 1127734, 4362316, 711588, 817839, 3214775, 910077, 1313768, 2382229, 16864, 2081770, - 3095420, 3195272, 548711, 2259860, 1167323, 2435974, 425238, 2085179, 2630042, - 2632881, 2867923, 3703565, 1037695, 226617, 4379130, 1541468, 3581937, 605965, - 1137674, 4655221, 4769963, 1394370, 4425315, 2990132, 2364485, 1561137, 2713384, - 481509, 2900382, 934766, 2986774, 1767669, 298593, 2502539, 139296, 3794229, - 4002180, 4718138, 2909238, 423691, 3023810, 2784924, 2760160, 1971980, 316683, - 3828090, 3253691, 4839313, 1203624, 584938, 3901482, 1747543, 1572737, 3533226, - 774708, 1691195, 1037110, 1557763, 225120, 4424243, 3524086, 1717663, 4332507, - 3513592, 4274932, 1232118, 873498, 1416042, 2488925, 111391, 4704545, 4492545, - 445317, 1584812, 2187737, 2471948, 3731678, 219255, 2282627, 2589971, 2372185, - 4609096, 3673961, 2524410, 12823, 2437155, 3015974, 4188352, 3184084, 3690756, - 1222341, 1278376, 3652030, 4162647, 326548, 3930062, 3926100, 1551222, 2722165, - 4526695, 3997534, 4815513, 3139056, 2547644, 3028915, 4149092, 3656554, 2691582, - 2676699, 1878842, 260174, 3129900, 4379993, 182347, 2189338, 3783616, 2616666, - 2596952, 243007, 4179282, 2730, 1939894, 2332032, 3335636, 182332, 3112260, - 2174584, 587481, 4527368, 3154106, 3403059, 673206, 2150292, 446521, 1600204, - 4819428, 2591357, 48490, 2917012, 2285923, 1072926, 2824281, 4364250, 956033, 311938, - 37251, 3729300, 2726300, 644966, 1623020, 1419070, 4646747, 2417222, 2680238, - 2561083, 1793801, 2349366, 339747, 611366, 4684147, 4356907, 1277161, 4510381, - 3218352, 4161658, 3200733, 1172372, 3997786, 3169266, 3353418, 2248955, 2875885, - 2365369, 498208, 2968066, 2681505, 2059048, 2097106, 3607540, 1121504, 2016789, - 1762605, 3138431, 866081, 3705757, 3833066, 2599788, 760816, 4046672, 1544367, 
- 2983906, 4842911, 209599, 1250954, 3333704, 561212, 4674336, 2831841, 3690724, - 2929360, 4830834, 1177524, 2487687, 3525137, 875283, 651241, 2110742, 1296646, - 1543739, 4349417, 2384725, 1931751, 1519208, 1520034, 3385008, 3219962, 734912, 170230, - 1741419, 729913, 2860117, 2362381, 1199807, 2424230, 177824, 125948, 2722701, - 4687548, 1140771, 3232742, 4522020, 4376360, 1125603, 590312, 2481884, 138951, - 4086775, 615155, 3395781, 4587272, 283209, 568470, 4296185, 4344150, 2454321, - 2672602, 838828, 4051647, 1709120, 3074610, 693235, 4356087, 3018806, 239410, - 2431497, 691186, 766276, 4462126, 859155, 2370304, 1571808, 1938673, 1694955, - 3871296, 4245059, 3987376, 301524, 2512461, 3410437, 3300380, 684922, 4581995, - 3599557, 683515, 1850634, 3704678, 1937490, 2035591, 3718533, 2065879, 3160765, - 1467884, 1912241, 2501509, 3668572, 3390469, 2501150, 612319, 713633, 1976262, 135946, - 3641535, 632083, 13414, 4217765, 4137712, 2550250, 3281035, 4179598, 961045, - 2020694, 4380006, 1345936, 289162, 1359035, 770872, 4509911, 3947317, 4719693, - 248568, 2625660, 1237232, 2153208, 4814282, 1259954, 3677369, 861222, 2883506, - 3339149, 3998335, 491017, 1609022, 2648112, 742132, 649609, 4206953, 3131106, - 3504814, 3344486, 611721, 3215620, 2856233, 4447505, 1949222, 1868345, 712710, 6966, - 4730666, 3181872, 2972889, 3038521, 3525444, 4385208, 1845613, 1124187, 2030476, - 4468651, 2478792, 3473580, 3783357, 1852991, 1648485, 871319, 1670723, 4458328, - 3218600, 1811100, 3443356, 2233873, 3035207, 2548692, 3337891, 3773674, 1552957, - 4782811, 3144712, 3523466, 1491315, 3955852, 1838410, 3164028, 1092543, 776459, - 2959379, 2541744, 4064418, 3908320, 2854145, 3960709, 1348188, 977678, 853619, - 1304291, 2848702, 1657913, 1319826, 3322665, 788037, 2913686, 4471279, 1766285, 348304, - 56570, 1892118, 4017244, 401006, 3524539, 4310134, 1624693, 4081113, 957511, 849400, - 129975, 2616130, 378537, 1556787, 3916162, 1039980, 4407778, 2027690, 4213675, - 839863, 683134, 75805, 2493150, 4215796, 81587, 751845, 1255588, 1947964, - 1950470, 859401, 3077088, 3931110, 2316256, 1523761, 4527477, 4237511, 1123513, - 4209796, 3584772, 4250563, 2091754, 1618766, 2139944, 4525352, 382159, 2955887, 41760, - 2313998, 496912, 3791570, 3904792, 3613654, 873959, 127076, 2537797, 2458107, - 4543265, 3661909, 26828, 271816, 17854, 2461269, 1776042, 1573899, 3409957, - 4335712, 4534313, 3392751, 1230124, 2159031, 4444015, 3373087, 3848014, 2026600, - 1382747, 3537242, 4536743, 4714155, 3788371, 3570849, 173741, 211962, 4377778, - 119369, 2856973, 2945854, 1508054, 4503932, 3141566, 1842177, 3448683, 3384614, - 2886508, 1573965, 990618, 3053734, 2918742, 4508753, 1032149, 60943, 4291620, - 722607, 2883224, 169359, 4356585, 3725543, 3678729, 341673, 3592828, 4077251, - 3382936, 3885685, 4630994, 1286698, 4449616, 1138430, 3113385, 4660578, 2539973, - 4562286, 4085089, 494737, 3967610, 2130702, 1823755, 1369324, 3796951, 956299, 141730, - 935144, 4381893, 4412545, 1382250, 3024476, 2364546, 3396164, 3573511, 314081, 577688, - 4154135, 1567018, 4047761, 2446220, 1148833, 4842497, 3967186, 1175290, 3749667, - 1209593, 3295627, 3169065, 2460328, 1838486, 1436923, 2843887, 3676426, 2079145, - 2975635, 535071, 4287509, 3281107, 39606, 3115500, 3204573, 722131, 3124073}, -2, -false)));*/ + simple_test, + Tests_InducedEgo, + ::testing::Values( + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 1, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 2, false), + 
InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 3, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 4, false), + InducedEgo_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{0}, 5, false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", std::vector{363617}, 2, false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755}, + 2, + false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627}, + 2, + false), + InducedEgo_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627, + 984983, 3114832, 1967741, 1599818, 144593, 2698770, 2889449, 2495550, 1053813, 1193622, + 686026, 3989015, 2040719, 4693428, 3190376, 2926728, 3399030, 1664419, 662429, 4526841, + 2186957, 3752558, 2440046, 2930226, 3633006, 4058166, 3137060, 3499296, 2126343, 148971, + 2199672, 275811, 2813976, 2274536, 1189239, 1335942, 2465624, 2596042, 829684, 193400, + 2682845, 3691697, 4022437, 4051170, 4195175, 2876420, 3984220, 2174475, 326134, 2606530, + 2493046, 4706121, 1498980, 4576225, 1271339, 44832, 1875673, 4664940, 134931, 736397, + 4333554, 2751031, 2163610, 2879676, 3174153, 3317403, 2052464, 1881883, 4757859, 3596257, + 2358088, 2578758, 447504, 590720, 1717038, 1869795, 1133885, 3027521, 840312, 2818881, + 3654321, 2730947, 353585, 1134903, 2223378, 1508824, 3662521, 1363776, 2712071, 288441, + 1204581, 3502242, 4645567, 2767267, 1514366, 3956099, 1422145, 1216608, 2253360, 189132, + 4238225, 1345783, 451571, 1599442, 3237284, 4711405, 929446, 1857675, 150759, 1277633, + 761210, 138628, 1026833, 2599544, 2464737, 989203, 3399615, 2144292, 
216142, 637312, + 2044964, 716256, 1660632, 1762919, 4784357, 2213415, 2764769, 291806, 609772, 3264819, + 1870953, 1516385, 235647, 1045474, 2664957, 819095, 1824119, 4045271, 4448109, 1676788, + 4285177, 1580502, 3546548, 2771971, 3927086, 1339779, 3156204, 1730998, 1172522, 2433024, + 4533449, 479930, 2010695, 672994, 3542039, 3176455, 26352, 2137735, 866910, 4410835, + 2623982, 3603159, 2555625, 2765653, 267865, 2015523, 1009052, 4713994, 1600667, 2176195, + 3179631, 4570390, 2018424, 3356384, 1784287, 894861, 3622099, 1647273, 3044136, 950354, + 1491760, 3416929, 3757300, 2244912, 4129215, 1600848, 3867343, 72329, 919189, 992521, + 3445975, 4712557, 4680974, 188419, 2612093, 1991268, 3566207, 2281468, 3859078, 2492806, + 3398628, 763441, 2679107, 2554420, 2130132, 4664374, 1182901, 3890770, 4714667, 4209303, + 4013060, 3617653, 2040022, 3296519, 4190671, 1693353, 2678411, 3788834, 2781815, 191965, + 1083926, 503974, 3529226, 1650522, 1900976, 542080, 3423929, 3418905, 878165, 4701703, + 3022790, 4316365, 76365, 4053672, 1358185, 3830478, 4445661, 3210024, 1895915, 4541133, + 2938808, 562788, 3920065, 1458776, 4052046, 2967475, 1092809, 3203538, 159626, 3399464, + 214467, 3343982, 1811854, 3189045, 4272117, 4701563, 424807, 4341116, 760545, 4674683, + 1538018, 386762, 194237, 2162719, 1694433, 943728, 2389036, 2196653, 3085571, 1513424, + 3689413, 3278747, 4197291, 3324063, 3651090, 1737936, 2768803, 2768889, 3108096, 4311775, + 3569480, 886705, 733256, 2477493, 1735412, 2960895, 1983781, 1861797, 3566460, 4537673, + 1164093, 3499764, 4553071, 3518985, 847658, 918948, 2922351, 1056144, 652895, 1013195, + 780505, 1702928, 3562838, 1432719, 2405207, 1054920, 641647, 2240939, 3617702, 383165, + 652641, 879593, 1810739, 2096385, 4497865, 4768530, 1743968, 3582014, 1025009, 3002122, + 2422190, 527647, 1251821, 2571153, 4095874, 3705333, 3637407, 1385567, 4043855, 4041930, + 2433139, 1710383, 1127734, 4362316, 711588, 817839, 3214775, 910077, 1313768, 2382229, + 16864, 2081770, 3095420, 3195272, 548711, 2259860, 1167323, 2435974, 425238, 2085179, + 2630042, 2632881, 2867923, 3703565, 1037695, 226617, 4379130, 1541468, 3581937, 605965, + 1137674, 4655221, 4769963, 1394370, 4425315, 2990132, 2364485, 1561137, 2713384, 481509, + 2900382, 934766, 2986774, 1767669, 298593, 2502539, 139296, 3794229, 4002180, 4718138, + 2909238, 423691, 3023810, 2784924, 2760160, 1971980, 316683, 3828090, 3253691, 4839313, + 1203624, 584938, 3901482, 1747543, 1572737, 3533226, 774708, 1691195, 1037110, 1557763, + 225120, 4424243, 3524086, 1717663, 4332507, 3513592, 4274932, 1232118, 873498, 1416042, + 2488925, 111391, 4704545, 4492545, 445317, 1584812, 2187737, 2471948, 3731678, 219255, + 2282627, 2589971, 2372185, 4609096, 3673961, 2524410, 12823, 2437155, 3015974, 4188352, + 3184084, 3690756, 1222341, 1278376, 3652030, 4162647, 326548, 3930062, 3926100, 1551222, + 2722165, 4526695, 3997534, 4815513, 3139056, 2547644, 3028915, 4149092, 3656554, 2691582, + 2676699, 1878842, 260174, 3129900, 4379993, 182347, 2189338, 3783616, 2616666, 2596952, + 243007, 4179282, 2730, 1939894, 2332032, 3335636, 182332, 3112260, 2174584, 587481, + 4527368, 3154106, 3403059, 673206, 2150292, 446521, 1600204, 4819428, 2591357, 48490, + 2917012, 2285923, 1072926, 2824281, 4364250, 956033, 311938, 37251, 3729300, 2726300, + 644966, 1623020, 1419070, 4646747, 2417222, 2680238, 2561083, 1793801, 2349366, 339747, + 611366, 4684147, 4356907, 1277161, 4510381, 3218352, 4161658, 3200733, 1172372, 3997786, + 3169266, 3353418, 2248955, 2875885, 
2365369, 498208, 2968066, 2681505, 2059048, 2097106, + 3607540, 1121504, 2016789, 1762605, 3138431, 866081, 3705757, 3833066, 2599788, 760816, + 4046672, 1544367, 2983906, 4842911, 209599, 1250954, 3333704, 561212, 4674336, 2831841, + 3690724, 2929360, 4830834, 1177524, 2487687, 3525137, 875283, 651241, 2110742, 1296646, + 1543739, 4349417, 2384725, 1931751, 1519208, 1520034, 3385008, 3219962, 734912, 170230, + 1741419, 729913, 2860117, 2362381, 1199807, 2424230, 177824, 125948, 2722701, 4687548, + 1140771, 3232742, 4522020, 4376360, 1125603, 590312, 2481884, 138951, 4086775, 615155, + 3395781, 4587272, 283209, 568470, 4296185, 4344150, 2454321, 2672602, 838828, 4051647, + 1709120, 3074610, 693235, 4356087, 3018806, 239410, 2431497, 691186, 766276, 4462126, + 859155, 2370304, 1571808, 1938673, 1694955, 3871296, 4245059, 3987376, 301524, 2512461, + 3410437, 3300380, 684922, 4581995, 3599557, 683515, 1850634, 3704678, 1937490, 2035591, + 3718533, 2065879, 3160765, 1467884, 1912241, 2501509, 3668572, 3390469, 2501150, 612319, + 713633, 1976262, 135946, 3641535, 632083, 13414, 4217765, 4137712, 2550250, 3281035, + 4179598, 961045, 2020694, 4380006, 1345936, 289162, 1359035, 770872, 4509911, 3947317, + 4719693, 248568, 2625660, 1237232, 2153208, 4814282, 1259954, 3677369, 861222, 2883506, + 3339149, 3998335, 491017, 1609022, 2648112, 742132, 649609, 4206953, 3131106, 3504814, + 3344486, 611721, 3215620, 2856233, 4447505, 1949222, 1868345, 712710, 6966, 4730666, + 3181872, 2972889, 3038521, 3525444, 4385208, 1845613, 1124187, 2030476, 4468651, 2478792, + 3473580, 3783357, 1852991, 1648485, 871319, 1670723, 4458328, 3218600, 1811100, 3443356, + 2233873, 3035207, 2548692, 3337891, 3773674, 1552957, 4782811, 3144712, 3523466, 1491315, + 3955852, 1838410, 3164028, 1092543, 776459, 2959379, 2541744, 4064418, 3908320, 2854145, + 3960709, 1348188, 977678, 853619, 1304291, 2848702, 1657913, 1319826, 3322665, 788037, + 2913686, 4471279, 1766285, 348304, 56570, 1892118, 4017244, 401006, 3524539, 4310134, + 1624693, 4081113, 957511, 849400, 129975, 2616130, 378537, 1556787, 3916162, 1039980, + 4407778, 2027690, 4213675, 839863, 683134, 75805, 2493150, 4215796, 81587, 751845, + 1255588, 1947964, 1950470, 859401, 3077088, 3931110, 2316256, 1523761, 4527477, 4237511, + 1123513, 4209796, 3584772, 4250563, 2091754, 1618766, 2139944, 4525352, 382159, 2955887, + 41760, 2313998, 496912, 3791570, 3904792, 3613654, 873959, 127076, 2537797, 2458107, + 4543265, 3661909, 26828, 271816, 17854, 2461269, 1776042, 1573899, 3409957, 4335712, + 4534313, 3392751, 1230124, 2159031, 4444015, 3373087, 3848014, 2026600, 1382747, 3537242, + 4536743, 4714155, 3788371, 3570849, 173741, 211962, 4377778, 119369, 2856973, 2945854, + 1508054, 4503932, 3141566, 1842177, 3448683, 3384614, 2886508, 1573965, 990618, 3053734, + 2918742, 4508753, 1032149, 60943, 4291620, 722607, 2883224, 169359, 4356585, 3725543, + 3678729, 341673, 3592828, 4077251, 3382936, 3885685, 4630994, 1286698, 4449616, 1138430, + 3113385, 4660578, 2539973, 4562286, 4085089, 494737, 3967610, 2130702, 1823755, 1369324, + 3796951, 956299, 141730, 935144, 4381893, 4412545, 1382250, 3024476, 2364546, 3396164, + 3573511, 314081, 577688, 4154135, 1567018, 4047761, 2446220, 1148833, 4842497, 3967186, + 1175290, 3749667, 1209593, 3295627, 3169065, 2460328, 1838486, 1436923, 2843887, 3676426, + 2079145, 2975635, 535071, 4287509, 3281107, 39606, 3115500, 3204573, 722131, 3124073}, + 2, + false))); +*/ CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/louvain_test.cpp 
b/cpp/tests/community/louvain_test.cpp
index d3024282be3..2ebf9a85902 100644
--- a/cpp/tests/community/louvain_test.cpp
+++ b/cpp/tests/community/louvain_test.cpp
@@ -9,15 +9,157 @@
 *
 */
 #include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
 #include
-#include
-#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+struct Louvain_Usecase {
+  std::string graph_file_full_path_{};
+  bool test_weighted_{false};
+  int expected_level_{0};
+  float expected_modularity_{0};
+
+  Louvain_Usecase(std::string const& graph_file_path,
+                  bool test_weighted,
+                  int expected_level,
+                  float expected_modularity)
+    : test_weighted_(test_weighted),
+      expected_level_(expected_level),
+      expected_modularity_(expected_modularity)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path_ = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path_ = graph_file_path;
+    }
+  };
+};
+
+class Tests_Louvain : public ::testing::TestWithParam<Louvain_Usecase> {
+ public:
+  Tests_Louvain() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_legacy_test(Louvain_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    bool directed{false};
+
+    auto graph = cugraph::test::generate_graph_csr_from_mm(
+      directed, configuration.graph_file_full_path_);
+    auto graph_view = graph->view();
+
+    // FIXME: remove this check once we drop support for Pascal
+    //
+    // Calling louvain on Pascal will throw an exception, we'll check that
+    // this is the behavior while we still support Pascal (device_prop.major < 7)
+    //
+    cudaDeviceProp device_prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0));
+
+    if (device_prop.major < 7) {
+      EXPECT_THROW(louvain(graph_view,
+                           graph_view.get_number_of_vertices(),
+                           configuration.expected_level_,
+                           configuration.expected_modularity_),
+                   cugraph::logic_error);
+    } else {
+      louvain(graph_view,
+              graph_view.get_number_of_vertices(),
+              configuration.expected_level_,
+              configuration.expected_modularity_);
+    }
+  }
 
-#include
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(Louvain_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false> graph(handle);
+    std::tie(graph, std::ignore) =
+      cugraph::test::read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, false, false>(
+        handle, configuration.graph_file_full_path_, configuration.test_weighted_, false);
+
+    auto graph_view = graph.view();
+
+    // FIXME: remove this check once we drop support for Pascal
+    //
+    // Calling louvain on Pascal will throw an exception, we'll check that
+    // this is the behavior while we still support Pascal (device_prop.major < 7)
+    //
+    cudaDeviceProp device_prop;
+    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0));
+
+    if (device_prop.major < 7) {
+      EXPECT_THROW(louvain(graph_view,
+                           graph_view.get_number_of_local_vertices(),
+                           configuration.expected_level_,
+                           configuration.expected_modularity_),
+                   cugraph::logic_error);
+    } else {
+      louvain(graph_view,
+              graph_view.get_number_of_local_vertices(),
+              configuration.expected_level_,
+              configuration.expected_modularity_);
+    }
+  }
+
+  template <typename graph_t>
+  void louvain(graph_t const& graph_view,
+               typename graph_t::vertex_type num_vertices,
+               int expected_level,
+               float expected_modularity)
+  {
+    using vertex_t = typename graph_t::vertex_type;
+    using weight_t = typename graph_t::weight_type;
+
+    raft::handle_t handle{};
+
+    rmm::device_uvector<vertex_t> clustering_v(num_vertices, handle.get_stream());
+    size_t level;
+    weight_t modularity;
+
+    std::tie(level, modularity) =
+      cugraph::louvain(handle, graph_view, clustering_v.data(), size_t{100}, weight_t{1});
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    float compare_modularity = static_cast<float>(modularity);
 
-TEST(louvain, success)
+    ASSERT_FLOAT_EQ(compare_modularity, expected_modularity);
+    ASSERT_EQ(level, expected_level);
+  }
+};
+
+// FIXME: add tests for type combinations
+
+TEST(louvain_legacy, success)
 {
   raft::handle_t handle;
@@ -84,15 +226,13 @@ TEST(louvain, success)
   int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  std::cout << "modularity = " << modularity << std::endl;
-
   ASSERT_GE(min, 0);
-  ASSERT_GE(modularity, 0.402777 * 0.95);
+  ASSERT_FLOAT_EQ(modularity, 0.408695);
 
   ASSERT_EQ(cluster_id, result_h);
 }
}
 
-TEST(louvain_renumbered, success)
+TEST(louvain_legacy_renumbered, success)
 {
   raft::handle_t handle;
@@ -157,11 +297,25 @@ TEST(louvain_renumbered, success)
   int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  std::cout << "modularity = " << modularity << std::endl;
-
   ASSERT_GE(min, 0);
-  ASSERT_GE(modularity, 0.402777 * 0.95);
+  ASSERT_FLOAT_EQ(modularity, 0.41880345);
 }
}
 
+TEST_P(Tests_Louvain, CheckInt32Int32FloatFloatLegacy)
+{
+  run_legacy_test<int32_t, int32_t, float, float>(GetParam());
+}
+
+TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float>(GetParam());
+}
+
+// FIXME: Expand testing once we evaluate RMM memory use
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_Louvain,
+  ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 3, 0.408695)));
+
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu
new file mode 100644
index 00000000000..661065ca65b
--- /dev/null
+++ b/cpp/tests/community/mg_louvain_helper.cu
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mg_louvain_helper.hpp"
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace cugraph {
+namespace test {
+
+template <typename T>
+rmm::device_uvector<T> gather_distributed_vector(raft::handle_t const &handle,
+                                                 T const *d_input,
+                                                 size_t size)
+{
+  auto rx_sizes =
+    cugraph::experimental::host_scalar_gather(handle.get_comms(), size, 0, handle.get_stream());
+  std::vector<size_t> rx_displs(static_cast<size_t>(handle.get_comms().get_rank()) == 0 ?
+                                  handle.get_comms().get_size()
+                                  : int{0},
+                                size_t{0});
+  if (static_cast<size_t>(handle.get_comms().get_rank()) == 0) {
+    std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
+  }
+
+  auto total_size = thrust::reduce(thrust::host, rx_sizes.begin(), rx_sizes.end());
+  rmm::device_uvector<T> gathered_v(total_size, handle.get_stream());
+
+  cugraph::experimental::device_gatherv(handle.get_comms(),
+                                        d_input,
+                                        gathered_v.data(),
+                                        size,
+                                        rx_sizes,
+                                        rx_displs,
+                                        0,
+                                        handle.get_stream());
+
+  return gathered_v;
+}
+
+template <typename vertex_t>
+bool compare_renumbered_vectors(raft::handle_t const &handle,
+                                rmm::device_uvector<vertex_t> const &v1,
+                                rmm::device_uvector<vertex_t> const &v2)
+{
+  vertex_t max = 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    v1.begin(),
+                                    v1.end(),
+                                    vertex_t{0});
+
+  rmm::device_uvector<vertex_t> map(max, size_t{0});
+
+  auto iter = thrust::make_zip_iterator(thrust::make_tuple(v1.begin(), v2.begin()));
+
+  thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   iter,
+                   iter + v1.size(),
+                   [d_map = map.data()] __device__(auto pair) {
+                     vertex_t e1 = thrust::get<0>(pair);
+                     vertex_t e2 = thrust::get<1>(pair);
+
+                     d_map[e1] = e2;
+                   });
+
+  auto error_count =
+    thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     iter,
+                     iter + v1.size(),
+                     [d_map = map.data()] __device__(auto pair) {
+                       vertex_t e1 = thrust::get<0>(pair);
+                       vertex_t e2 = thrust::get<1>(pair);
+
+                       return (d_map[e1] != e2);
+                     });
+
+  return (error_count == 0);
+}
+
+template <typename vertex_t>
+void single_gpu_renumber_edgelist_given_number_map(
+  raft::handle_t const &handle,
+  rmm::device_uvector<vertex_t> &edgelist_rows_v,
+  rmm::device_uvector<vertex_t> &edgelist_cols_v,
+  rmm::device_uvector<vertex_t> &renumber_map_gathered_v)
+{
+  rmm::device_uvector<vertex_t> index_v(renumber_map_gathered_v.size(), handle.get_stream());
+
+  thrust::for_each(
+    rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    thrust::make_counting_iterator<size_t>(0),
+    thrust::make_counting_iterator(renumber_map_gathered_v.size()),
+    [d_renumber_map_gathered = renumber_map_gathered_v.data(), d_index = index_v.data()] __device__(
+      auto idx) { d_index[d_renumber_map_gathered[idx]] = idx; });
+
+  thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    edgelist_rows_v.begin(),
+                    edgelist_rows_v.end(),
+                    edgelist_rows_v.begin(),
+                    [d_index = index_v.data()] __device__(auto v) { return d_index[v]; });
+
+  thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    edgelist_cols_v.begin(),
+                    edgelist_cols_v.end(),
+                    edgelist_cols_v.begin(),
+                    [d_index = index_v.data()] __device__(auto v) { return d_index[v]; });
+}
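For reference, the check performed by compare_renumbered_vectors above treats two clusterings as equal "up to renumbering": every occurrence of a given label in v1 must pair with one and the same label in v2. A host-side analogue, as an illustrative sketch only (not part of the patch; compare_relabeled is a hypothetical name):

#include <cstddef>
#include <unordered_map>
#include <vector>

bool compare_relabeled(std::vector<int> const& v1, std::vector<int> const& v2)
{
  std::unordered_map<int, int> map{};  // label in v1 -> label in v2
  for (std::size_t i = 0; i < v1.size(); ++i) {
    auto [it, inserted] = map.emplace(v1[i], v2[i]);
    if (!inserted && it->second != v2[i]) { return false; }  // inconsistent pairing
  }
  return true;
}

// e.g. compare_relabeled({0, 0, 1, 2}, {5, 5, 3, 7}) == true,
//      compare_relabeled({0, 0, 1},    {5, 4, 3})    == false.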
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
+  compressed_sparse_to_edgelist(edge_t const *compressed_sparse_offsets,
+                                vertex_t const *compressed_sparse_indices,
+                                weight_t const *compressed_sparse_weights,
+                                vertex_t major_first,
+                                vertex_t major_last,
+                                cudaStream_t stream)
+{
+  edge_t number_of_edges{0};
+  raft::update_host(
+    &number_of_edges, compressed_sparse_offsets + (major_last - major_first), 1, stream);
+  CUDA_TRY(cudaStreamSynchronize(stream));
+  rmm::device_uvector<vertex_t> edgelist_major_vertices(number_of_edges, stream);
+  rmm::device_uvector<vertex_t> edgelist_minor_vertices(number_of_edges, stream);
+  rmm::device_uvector<weight_t> edgelist_weights(
+    compressed_sparse_weights != nullptr ? number_of_edges : 0, stream);
+
+  // FIXME: this is highly inefficient for very high-degree vertices; for better performance, we can
+  // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA
+  // warp per vertex, and low-degree vertices using one CUDA thread per vertex
+  thrust::for_each(rmm::exec_policy(stream)->on(stream),
+                   thrust::make_counting_iterator(major_first),
+                   thrust::make_counting_iterator(major_last),
+                   [compressed_sparse_offsets,
+                    major_first,
+                    p_majors = edgelist_major_vertices.begin()] __device__(auto v) {
+                     auto first = compressed_sparse_offsets[v - major_first];
+                     auto last  = compressed_sparse_offsets[v - major_first + 1];
+                     thrust::fill(thrust::seq, p_majors + first, p_majors + last, v);
+                   });
+  thrust::copy(rmm::exec_policy(stream)->on(stream),
+               compressed_sparse_indices,
+               compressed_sparse_indices + number_of_edges,
+               edgelist_minor_vertices.begin());
+  if (compressed_sparse_weights != nullptr) {
+    thrust::copy(rmm::exec_policy(stream)->on(stream),
+                 compressed_sparse_weights,
+                 compressed_sparse_weights + number_of_edges,
+                 edgelist_weights.data());
+  }
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+template <typename vertex_t, typename weight_t>
+void sort_and_coarsen_edgelist(rmm::device_uvector<vertex_t> &edgelist_major_vertices /* [INOUT] */,
+                               rmm::device_uvector<vertex_t> &edgelist_minor_vertices /* [INOUT] */,
+                               rmm::device_uvector<weight_t> &edgelist_weights /* [INOUT] */,
+                               cudaStream_t stream)
+{
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+
+  size_t number_of_edges{0};
+  if (edgelist_weights.size() > 0) {
+    thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
+                        pair_first,
+                        pair_first + edgelist_major_vertices.size(),
+                        edgelist_weights.begin());
+
+    rmm::device_uvector<vertex_t> tmp_edgelist_major_vertices(edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<vertex_t> tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<weight_t> tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream);
+    auto it = thrust::reduce_by_key(
+      rmm::exec_policy(stream)->on(stream),
+      pair_first,
+      pair_first + edgelist_major_vertices.size(),
+      edgelist_weights.begin(),
+      thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(),
+                                                   tmp_edgelist_minor_vertices.begin())),
+      tmp_edgelist_weights.begin());
+    number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it));
+
+    edgelist_major_vertices = std::move(tmp_edgelist_major_vertices);
+    edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices);
+    edgelist_weights        = std::move(tmp_edgelist_weights);
+  } else {
+    thrust::sort(rmm::exec_policy(stream)->on(stream),
+                 pair_first,
+                 pair_first + edgelist_major_vertices.size());
+    auto it = thrust::unique(rmm::exec_policy(stream)->on(stream),
+                             pair_first,
+                             pair_first + edgelist_major_vertices.size());
+    number_of_edges = thrust::distance(pair_first, it);
+  }
+
+  edgelist_major_vertices.resize(number_of_edges, stream);
+  edgelist_minor_vertices.resize(number_of_edges, stream);
+  edgelist_weights.resize(number_of_edges, stream);
+  edgelist_major_vertices.shrink_to_fit(stream);
+  edgelist_minor_vertices.shrink_to_fit(stream);
+  edgelist_weights.shrink_to_fit(stream);
+}
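To make the intent of sort_and_coarsen_edgelist concrete: in the weighted case, parallel edges between the same (major, minor) pair collapse into a single edge carrying the summed weight. A minimal host-side analogue, as an illustrative sketch only (not part of the patch; coarsen_toy is a hypothetical name, with std::map playing the role of thrust::sort_by_key + thrust::reduce_by_key):

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

// inputs are parallel arrays of already relabeled edge endpoints plus per-edge weights
std::map<std::pair<int, int>, float> coarsen_toy(std::vector<int> const& majors,
                                                 std::vector<int> const& minors,
                                                 std::vector<float> const& weights)
{
  std::map<std::pair<int, int>, float> coarse{};
  for (std::size_t i = 0; i < majors.size(); ++i) {
    coarse[{majors[i], minors[i]}] += weights[i];  // duplicate pairs reduce by summation
  }
  return coarse;
}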
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
+  compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+    edge_t const *compressed_sparse_offsets,
+    vertex_t const *compressed_sparse_indices,
+    weight_t const *compressed_sparse_weights,
+    vertex_t const *p_major_labels,
+    vertex_t const *p_minor_labels,
+    vertex_t major_first,
+    vertex_t major_last,
+    vertex_t minor_first,
+    vertex_t minor_last,
+    cudaStream_t stream)
+{
+  // FIXME: it might be possible to directly create the relabeled & coarsened edgelist from the
+  // compressed sparse format to save memory
+
+  rmm::device_uvector<vertex_t> edgelist_major_vertices(0, stream);
+  rmm::device_uvector<vertex_t> edgelist_minor_vertices(0, stream);
+  rmm::device_uvector<weight_t> edgelist_weights(0, stream);
+  std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) =
+    compressed_sparse_to_edgelist(compressed_sparse_offsets,
+                                  compressed_sparse_indices,
+                                  compressed_sparse_weights,
+                                  major_first,
+                                  major_last,
+                                  stream);
+
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+  thrust::transform(
+    rmm::exec_policy(stream)->on(stream),
+    pair_first,
+    pair_first + edgelist_major_vertices.size(),
+    pair_first,
+    [p_major_labels, p_minor_labels, major_first, minor_first] __device__(auto val) {
+      return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first],
+                                p_minor_labels[thrust::get<1>(val) - minor_first]);
+    });
+
+  sort_and_coarsen_edgelist(
+    edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream);
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+// single-GPU version
+template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+std::unique_ptr<cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>>
+coarsen_graph(
+  raft::handle_t const &handle,
+  cugraph::experimental::graph_view_t<vertex_t, edge_t, weight_t, store_transposed, false> const
+    &graph_view,
+  vertex_t const *labels)
+{
+  rmm::device_uvector<vertex_t> coarsened_edgelist_major_vertices(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> coarsened_edgelist_minor_vertices(0, handle.get_stream());
+  rmm::device_uvector<weight_t> coarsened_edgelist_weights(0, handle.get_stream());
+  std::tie(coarsened_edgelist_major_vertices,
+           coarsened_edgelist_minor_vertices,
+           coarsened_edgelist_weights) =
+    compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+      graph_view.offsets(),
+      graph_view.indices(),
+      graph_view.weights(),
+      labels,
+      labels,
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      handle.get_stream());
+
+  cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{};
+  edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data()
+                                             : coarsened_edgelist_major_vertices.data();
+  edgelist.p_dst_vertices = store_transposed ?
coarsened_edgelist_major_vertices.data() + : coarsened_edgelist_minor_vertices.data(); + edgelist.p_edge_weights = coarsened_edgelist_weights.data(); + edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + + vertex_t new_number_of_vertices = + 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + graph_view.get_number_of_vertices(), + vertex_t{0}, + thrust::maximum()); + + return std::make_unique< + cugraph::experimental::graph_t>( + handle, + edgelist, + new_number_of_vertices, + cugraph::experimental::graph_properties_t{ + graph_view.is_symmetric(), false, graph_view.is_weighted()}, + true); +} + +// explicit instantiation + +template void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + int const *d_input, + size_t size); + +template bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const &graph_view, + int32_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp new file mode 100644 index 00000000000..43eb294cd13 --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + T const *d_input, + size_t size); + +template +bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template +void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template +std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const + &graph_view, + vertex_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp new file mode 100644 index 00000000000..8a1a3010a6f --- /dev/null +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_louvain_helper.hpp" + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include + +void compare(float mg_modularity, float sg_modularity) +{ + ASSERT_FLOAT_EQ(mg_modularity, sg_modularity); +} +void compare(double mg_modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(mg_modularity, sg_modularity); +} + +//////////////////////////////////////////////////////////////////////////////// +// Test param object. This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_CASE_P() +// +struct Louvain_Usecase { + std::string graph_file_full_path{}; + bool weighted{false}; + size_t max_level; + double resolution; + + // FIXME: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. + // + Louvain_Usecase(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) + : weighted(weighted), max_level(max_level), resolution(resolution) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +class Louvain_MG_Testfixture : public ::testing::TestWithParam { + public: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + // Run once for each test instance + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of MNMG Louvain with the results of running + // each step of SG Louvain, renumbering the coarsened graphs based + // on the MNMG renumbering. 
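+  // In other words: each MG dendrogram level is gathered to rank 0, SG Louvain
+  // is run for exactly one level on the current SG graph, the two labelings are
+  // compared up to renumbering, and the SG graph is then coarsened with the
+  // gathered MG labels before moving on to the next level.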
+ template + void compare_sg_results(raft::handle_t const& handle, + std::string const& graph_filename, + rmm::device_uvector& d_renumber_map_gathered_v, + cugraph::Dendrogram const& dendrogram, + weight_t resolution, + int rank, + weight_t mg_modularity) + { + auto sg_graph = + std::make_unique>( + handle); + rmm::device_uvector d_clustering_v(0, handle.get_stream()); + weight_t sg_modularity{-1.0}; + + if (rank == 0) { + // Create initial SG graph, renumbered according to the MNMG renumber map + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + vertex_t number_of_vertices{}; + bool is_symmetric{}; + + std::tie( + d_edgelist_rows, d_edgelist_cols, d_edgelist_weights, number_of_vertices, is_symmetric) = + cugraph::test::read_edgelist_from_matrix_market_file( + handle, graph_filename, true); + + rmm::device_uvector d_vertices(number_of_vertices, handle.get_stream()); + std::vector h_vertices(number_of_vertices); + + d_clustering_v.resize(d_vertices.size(), handle.get_stream()); + + thrust::sequence(thrust::host, h_vertices.begin(), h_vertices.end(), vertex_t{0}); + raft::update_device( + d_vertices.data(), h_vertices.data(), d_vertices.size(), handle.get_stream()); + + // renumber using d_renumber_map_gathered_v + cugraph::test::single_gpu_renumber_edgelist_given_number_map( + handle, d_edgelist_rows, d_edgelist_cols, d_renumber_map_gathered_v); + + std::tie(*sg_graph, std::ignore) = + cugraph::test::generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + is_symmetric, + true, + false); + } + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(dendrogram.num_levels()), + [&dendrogram, &sg_graph, &d_clustering_v, &sg_modularity, &handle, resolution, rank]( + size_t i) { + auto d_dendrogram_gathered_v = cugraph::test::gather_distributed_vector( + handle, dendrogram.get_level_ptr_nocheck(i), dendrogram.get_level_size_nocheck(i)); + + if (rank == 0) { + auto graph_view = sg_graph->view(); + + d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); + + std::tie(std::ignore, sg_modularity) = + cugraph::louvain(handle, graph_view, d_clustering_v.data(), size_t{1}, resolution); + + EXPECT_TRUE(cugraph::test::compare_renumbered_vectors( + handle, d_clustering_v, d_dendrogram_gathered_v)); + + sg_graph = + cugraph::test::coarsen_graph(handle, graph_view, d_dendrogram_gathered_v.data()); + } + }); + + if (rank == 0) compare(mg_modularity, sg_modularity); + } + + // Compare the results of running louvain on multiple GPUs to that of a + // single-GPU run for the configuration in param. Note that MNMG Louvain + // and single GPU Louvain are ONLY deterministic through a single + // iteration of the outer loop. Renumbering of the partitions when coarsening + // the graph is a function of the number of GPUs in the GPU cluster. 
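+  // Hence the level-by-level structure of compare_sg_results above: each level
+  // is validated on its own, seeded with the MG partition for that level,
+  // rather than comparing the results of two full, independent runs.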
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_test(const Louvain_Usecase& param)
+  {
+    raft::handle_t handle;
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    const auto& comm = handle.get_comms();
+
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast<int>(sqrt(static_cast<double>(comm_size)));
+    while (comm_size % row_comm_size != 0) { --row_comm_size; }
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    cudaStream_t stream = handle.get_stream();
+
+    cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, true> mg_graph(handle);
+
+    rmm::device_uvector<vertex_t> d_renumber_map_labels(0, handle.get_stream());
+
+    std::tie(mg_graph, d_renumber_map_labels) =
+      cugraph::test::read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, false, true>(
+        handle, param.graph_file_full_path, true, true);
+
+    auto mg_graph_view = mg_graph.view();
+
+    std::unique_ptr<cugraph::Dendrogram<vertex_t>> dendrogram;
+    weight_t mg_modularity;
+
+    std::tie(dendrogram, mg_modularity) =
+      cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution);
+
+    SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path);
+
+    auto d_renumber_map_gathered_v = cugraph::test::gather_distributed_vector(
+      handle, d_renumber_map_labels.data(), d_renumber_map_labels.size());
+
+    compare_sg_results<vertex_t, edge_t, weight_t>(handle,
+                                                   param.graph_file_full_path,
+                                                   d_renumber_map_gathered_v,
+                                                   *dendrogram,
+                                                   param.resolution,
+                                                   comm_rank,
+                                                   mg_modularity);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float)
+{
+  run_test<int32_t, int32_t, float>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Louvain_MG_Testfixture,
+  ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1)
+                    //,Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1)
+                    ));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
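The row_comm_size computation in run_test above picks the largest integer no greater than sqrt(P) that divides the number of ranks P, so the 2D partitioning always forms a complete row-by-column process grid. A standalone sketch of just that step (illustrative only, not part of the patch; pick_row_comm_size is a hypothetical name):

#include <cmath>

int pick_row_comm_size(int comm_size)
{
  int row = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row != 0) { --row; }  // walk down to the nearest divisor
  return row;  // e.g. 16 -> 4 (4 x 4 grid), 12 -> 3 (3 x 4 grid), 7 -> 1 (1 x 7 grid)
}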
diff --git a/cpp/tests/components/scc_test.cu b/cpp/tests/components/scc_test.cu
index 9d5b55f34c6..a74b5a0ad27 100644
--- a/cpp/tests/components/scc_test.cu
+++ b/cpp/tests/components/scc_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
@@ -24,6 +24,9 @@
 #include
+#include
+#include
+#include
 #include
 #include
@@ -57,41 +60,48 @@ struct Usecase {
   std::string matrix_file;
 };
 
-// checker of counts of labels for each component
-// expensive, for testing purposes only;
+// counts the number of vertices in each component
+// (i.e., vertices sharing the same label);
+// potentially expensive, for testing purposes only;
 //
 // params:
-// p_d_labels: device array of labels of size nrows;
-// nrows: |V| for graph G(V, E);
-// d_v_counts: #labels for each component; (_not_ pre-allocated!)
+// in: p_d_labels: device array of labels of size nrows;
+// in: nrows: |V| for graph G(V, E);
+// out: d_num_vs_per_component: number of vertices in each component; (_not_ pre-allocated!)
+// return: number of components;
 //
 template
-size_t get_component_sizes(const IndexT* p_d_labels, size_t nrows, DVector& d_v_counts)
+size_t get_component_sizes(const IndexT* p_d_labels,
+                           size_t nrows,
+                           DVector& d_num_vs_per_component)
 {
   DVector d_sorted_l(p_d_labels, p_d_labels + nrows);
   thrust::sort(d_sorted_l.begin(), d_sorted_l.end());
 
-  size_t counts =
-    thrust::distance(d_sorted_l.begin(), thrust::unique(d_sorted_l.begin(), d_sorted_l.end()));
+  auto pair_it = thrust::reduce_by_key(d_sorted_l.begin(),
+                                       d_sorted_l.end(),
+                                       thrust::make_constant_iterator(1),
+                                       thrust::make_discard_iterator(),  // ignore...
+                                       d_num_vs_per_component.begin());
 
-  IndexT* p_d_srt_l = d_sorted_l.data().get();
-
-  d_v_counts.resize(counts);
-  thrust::transform(
-    thrust::device,
-    d_sorted_l.begin(),
-    d_sorted_l.begin() + counts,
-    d_v_counts.begin(),
-    [p_d_srt_l, counts] __device__(IndexT indx) {
-      return thrust::count_if(
-        thrust::seq, p_d_srt_l, p_d_srt_l + counts, [indx](IndexT label) { return label == indx; });
-    });
-
-  // sort the counts:
-  thrust::sort(d_v_counts.begin(), d_v_counts.end());
+  size_t counts = thrust::distance(d_num_vs_per_component.begin(), pair_it.second);
+  d_num_vs_per_component.resize(counts);
 
   return counts;
 }
+
+template <typename DVector>
+DVector byte_matrix_to_int(const DVector& d_adj_byte_matrix)
+{
+  auto n2 = d_adj_byte_matrix.size();
+  thrust::device_vector<int> d_vec_matrix(n2, 0);
+  thrust::transform(d_adj_byte_matrix.begin(),
+                    d_adj_byte_matrix.end(),
+                    d_vec_matrix.begin(),
+                    [] __device__(auto byte_v) { return static_cast<int>(byte_v); });
+  return d_vec_matrix;
+}
+
 }  // namespace
 
 struct Tests_Strongly_CC : ::testing::TestWithParam {
@@ -154,8 +164,8 @@ struct Tests_Strongly_CC : ::testing::TestWithParam {
   // Allocate memory on host
   std::vector cooRowInd(nnz);
   std::vector cooColInd(nnz);
-  std::vector labels(m);  // for G(V, E), m := |V|
-  std::vector verts(m);
+  std::vector labels(nrows);  // for G(V, E), nrows := |V|
+  std::vector verts(nrows);
 
   // Read: COO Format
   //
@@ -166,11 +176,11 @@
       << "\n";
   ASSERT_EQ(fclose(fpin), 0);
 
-  cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, m, nnz);
+  cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz);
   auto G_unique = cugraph::coo_to_csr(G_coo);
   cugraph::GraphCSRView G = G_unique->view();
 
-  rmm::device_vector d_labels(m);
+  rmm::device_vector d_labels(nrows);
 
   size_t count = 0;
 
@@ -190,7 +200,7 @@
   }
   strongly_cc_counts.push_back(count);
 
-  DVector d_counts;
+  DVector d_counts(nrows);
   auto count_labels = get_component_sizes(d_labels.data().get(), nrows, d_counts);
 }
};
@@ -208,4 +218,211 @@ INSTANTIATE_TEST_CASE_P(
     Usecase("test/datasets/cage6.mtx")  // DG "small" enough to meet SCC GPU memory requirements
     ));
 
+struct SCCSmallTest : public ::testing::Test {
+};
+
+// FIXME: we should take advantage of gtest parameterization over copy-and-paste reuse.
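A host-side analogue of the rewritten get_component_sizes above (an illustrative sketch, not part of the patch): sort the labels, then measure each equal run, which is exactly what thrust::sort followed by thrust::reduce_by_key over a constant iterator computes on the device.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> component_sizes(std::vector<int> labels)  // by value: sorted locally
{
  std::sort(labels.begin(), labels.end());
  std::vector<std::size_t> counts{};
  for (std::size_t i = 0; i < labels.size();) {
    std::size_t j = i;
    while (j < labels.size() && labels[j] == labels[i]) { ++j; }
    counts.push_back(j - i);  // run length == size of this component
    i = j;
  }
  return counts;  // counts.size() == number of components
}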
+// +TEST_F(SCCSmallTest, CustomGraphSimpleLoops) +{ + using IndexT = int; + + size_t nrows = 5; + size_t n2 = 2 * nrows * nrows; + + cudaDeviceProp prop; + int device = 0; + cudaGetDeviceProperties(&prop, device); + + ASSERT_TRUE(n2 < prop.totalGlobalMem); + + // Allocate memory on host + std::vector cooRowInd{0, 1, 2, 3, 3, 4}; + std::vector cooColInd{1, 0, 0, 1, 4, 3}; + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + EXPECT_EQ(count_components, static_cast(3)); + + std::vector v_counts(d_counts.size()); + + cudaMemcpy(v_counts.data(), + d_counts.data().get(), + sizeof(size_t) * v_counts.size(), + cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + + std::vector v_counts_exp{2, 1, 2}; + + EXPECT_EQ(v_counts, v_counts_exp); +} + +TEST_F(SCCSmallTest, /*DISABLED_*/ CustomGraphWithSelfLoops) +{ + using IndexT = int; + + size_t nrows = 5; + size_t n2 = 2 * nrows * nrows; + + cudaDeviceProp prop; + int device = 0; + cudaGetDeviceProperties(&prop, device); + + ASSERT_TRUE(n2 < prop.totalGlobalMem); + + // Allocate memory on host + std::vector cooRowInd{0, 0, 1, 1, 2, 2, 3, 3, 4}; + std::vector cooColInd{0, 1, 0, 1, 0, 2, 1, 3, 4}; + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + EXPECT_EQ(count_components, static_cast(4)); + + std::vector v_counts(d_counts.size()); + + cudaMemcpy(v_counts.data(), + d_counts.data().get(), + sizeof(size_t) * v_counts.size(), + cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + + std::vector v_counts_exp{2, 1, 1, 1}; + + EXPECT_EQ(v_counts, v_counts_exp); +} + +TEST_F(SCCSmallTest, SmallGraphWithSelfLoops1) +{ + using IndexT = int; + + size_t nrows = 3; + + std::vector cooRowInd{0, 0, 1, 2}; + std::vector cooColInd{0, 1, 0, 0}; + + std::vector v_counts_exp{2, 1}; + + std::vector labels(nrows); + std::vector verts(nrows); + + size_t nnz = cooRowInd.size(); + + EXPECT_EQ(nnz, cooColInd.size()); + + cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, nrows, nnz); + auto G_unique = cugraph::coo_to_csr(G_coo); + cugraph::GraphCSRView G = G_unique->view(); + + rmm::device_vector d_labels(nrows); + + cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get()); + + DVector d_counts(nrows); + auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts); + + // std::cout << "vertex labels:\n"; + // print_v(d_labels, std::cout); + + decltype(count_components) num_components_exp = 2; + + EXPECT_EQ(count_components, num_components_exp); +} + +TEST_F(SCCSmallTest, 
SmallGraphWithIsolated)
+{
+  using IndexT = int;
+
+  size_t nrows = 3;
+
+  std::vector<IndexT> cooRowInd{0, 0, 1};
+  std::vector<IndexT> cooColInd{0, 1, 0};
+
+  std::vector<size_t> v_counts_exp{2, 1};
+
+  std::vector<IndexT> labels(nrows);
+  std::vector<IndexT> verts(nrows);
+
+  size_t nnz = cooRowInd.size();
+
+  EXPECT_EQ(nnz, cooColInd.size());
+
+  // Note: there seems to be a BUG in coo_to_csr() or view():
+  // the COO format doesn't account for isolated vertices;
+  //
+  // cugraph::GraphCOOView G_coo(&cooRowInd[0], &cooColInd[0], nullptr, nrows,
+  // nnz);
+  // auto G_unique = cugraph::coo_to_csr(G_coo);
+  // cugraph::GraphCSRView G = G_unique->view();
+  //
+  //
+  // size_t num_vertices = G.number_of_vertices;
+  // size_t num_edges = G.number_of_edges;
+  //
+  // EXPECT_EQ(num_vertices, nrows);  // fails when G was constructed from COO
+  // EXPECT_EQ(num_edges, nnz);
+
+  std::vector<IndexT> ro{0, 2, 3, 3};
+  std::vector<IndexT> ci{0, 1, 0};
+
+  nnz = ci.size();
+
+  thrust::device_vector<IndexT> d_ro(ro);
+  thrust::device_vector<IndexT> d_ci(ci);
+
+  cugraph::GraphCSRView G{
+    d_ro.data().get(), d_ci.data().get(), nullptr, static_cast<IndexT>(nrows), static_cast<IndexT>(nnz)};
+
+  size_t num_vertices = G.number_of_vertices;
+  size_t num_edges    = G.number_of_edges;
+
+  EXPECT_EQ(num_vertices, nrows);
+  EXPECT_EQ(num_edges, nnz);
+
+  rmm::device_vector<IndexT> d_labels(nrows);
+
+  cugraph::connected_components(G, cugraph::cugraph_cc_t::CUGRAPH_STRONG, d_labels.data().get());
+
+  DVector d_counts(nrows);
+  auto count_components = get_component_sizes(d_labels.data().get(), nrows, d_counts);
+
+  // std::cout << "vertex labels:\n";
+  // print_v(d_labels, std::cout);
+
+  decltype(count_components) num_components_exp = 2;
+
+  EXPECT_EQ(count_components, num_components_exp);
+}
+
 CUGRAPH_TEST_PROGRAM_MAIN()
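The hand-built CSR in the test above is worth spelling out (an illustrative note, not part of the patch): a COO edge list has no way to mention a vertex with no edges, while CSR encodes it as an empty offset range, which is why the test constructs the offsets {0, 2, 3, 3} directly instead of going through coo_to_csr().

#include <vector>

// degree of vertex v in a CSR graph, given its offsets array
int degree(std::vector<int> const& offsets, int v) { return offsets[v + 1] - offsets[v]; }

// offsets {0, 2, 3, 3}: vertex 0 owns indices [0, 2), vertex 1 owns [2, 3),
// and vertex 2 owns the empty range [3, 3) -- an isolated vertex.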
diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp
index ad9ece99ef9..ded57dd1855 100644
--- a/cpp/tests/experimental/bfs_test.cpp
+++ b/cpp/tests/experimental/bfs_test.cpp
@@ -14,11 +14,14 @@
 * limitations under the License.
 */
 
+#include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
@@ -28,10 +31,16 @@
 #include
+#include
 #include
 #include
 #include
+// do the perf measurements
+// enabled by the command line parameter '--perf'
+//
+static int PERF = 0;
+
 template
 void bfs_reference(edge_t const* offsets,
                    vertex_t const* indices,
@@ -74,9 +83,12 @@
 typedef struct BFS_Usecase_t {
   cugraph::test::input_graph_specifier_t input_graph_specifier{};
 
-  size_t source{false};
-  BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source)
+  size_t source{0};
+  bool check_correctness{false};
+
+  BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
   {
     std::string graph_file_full_path{};
     if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -88,13 +100,43 @@
     input_graph_specifier.graph_file_full_path = graph_file_full_path;
   };
 
-  BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source)
+  BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+                size_t source,
+                bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
   {
     input_graph_specifier.tag         = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
     input_graph_specifier.rmat_params = rmat_params;
   }
 } BFS_Usecase;
 
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false>,
+           rmm::device_uvector<vertex_t>>
+read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber)
+{
+  return configuration.input_graph_specifier.tag ==
+             cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+           ? cugraph::test::
+               read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, false, false>(
+                 handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber)
+           : cugraph::test::
+               generate_graph_from_rmat_params<vertex_t, edge_t, weight_t, false, false>(
+                 handle,
+                 configuration.input_graph_specifier.rmat_params.scale,
+                 configuration.input_graph_specifier.rmat_params.edge_factor,
+                 configuration.input_graph_specifier.rmat_params.a,
+                 configuration.input_graph_specifier.rmat_params.b,
+                 configuration.input_graph_specifier.rmat_params.c,
+                 configuration.input_graph_specifier.rmat_params.seed,
+                 configuration.input_graph_specifier.rmat_params.undirected,
+                 configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
+                 false,
+                 renumber,
+                 std::vector<size_t>{0},
+                 size_t{1});
+}
+
 class Tests_BFS : public ::testing::TestWithParam {
  public:
   Tests_BFS() {}
@@ -107,108 +149,164 @@ class Tests_BFS : public ::testing::TestWithParam {
   template
   void run_current_test(BFS_Usecase const& configuration)
   {
+    constexpr bool renumber = true;
+
     using weight_t = float;
 
     raft::handle_t handle{};
+    HighResClock hr_clock{};
 
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
     cugraph::experimental::graph_t graph(handle);
-    std::tie(graph, std::ignore) =
-      configuration.input_graph_specifier.tag ==
-          cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
-        ?
cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, false, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - false, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "read_graph took " << elapsed_time * 1e-6 << " s.\n"; + } auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - bfs_reference(h_offsets.data(), - h_indices.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source), - std::numeric_limits::max()); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()) + << "Invalid starting source."; rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::bfs(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), false, - std::numeric_limits::max(), - false); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(std::equal( - h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) - << 
"distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + std::numeric_limits::max()); + + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "BFS took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) - << "distance to this vertex != distance to the predecessor vertex + 1."; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - found = 
true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } - ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; } } } @@ -221,12 +319,17 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_BFS, ::testing::Values( + // enable correctness checks BFS_Usecase("test/datasets/karate.mtx", 0), BFS_Usecase("test/datasets/polbooks.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 0), BFS_Usecase("test/datasets/netscience.mtx", 100), BFS_Usecase("test/datasets/wiki2003.mtx", 1000), BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000), - BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp index 789619f2cd9..0fc0634bbbc 100644 --- a/cpp/tests/experimental/coarsen_graph_test.cpp +++ b/cpp/tests/experimental/coarsen_graph_test.cpp @@ -54,13 +54,14 @@ void check_coarsened_graph_results(edge_t* org_offsets, ASSERT_TRUE(std::count_if(org_indices, org_indices + org_offsets[num_org_vertices], [num_org_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_org_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_org_vertices, nbr); }) == 0); ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices)); ASSERT_TRUE(std::count_if(coarse_indices, coarse_indices + coarse_offsets[num_coarse_vertices], [num_coarse_vertices](auto nbr) { - return !cugraph::test::is_valid_vertex(num_coarse_vertices, nbr); + return !cugraph::experimental::is_valid_vertex(num_coarse_vertices, + nbr); }) == 0); ASSERT_TRUE(num_coarse_vertices <= num_org_vertices); diff --git a/cpp/tests/experimental/generate_rmat_test.cpp b/cpp/tests/experimental/generate_rmat_test.cpp index 249a1a3c6c8..221accea4f7 100644 --- a/cpp/tests/experimental/generate_rmat_test.cpp +++ b/cpp/tests/experimental/generate_rmat_test.cpp @@ -14,9 +14,11 @@ * limitations under the License. 
*/ +#include #include #include +#include #include #include @@ -200,17 +202,19 @@ class Tests_GenerateRmat : public ::testing::TestWithParam (h_cugraph_srcs.size() == (size_t{1} << configuration.scale) * configuration.edge_factor) && (h_cugraph_dsts.size() == (size_t{1} << configuration.scale) * configuration.edge_factor)) << "Returned an invalid number of R-mat graph edges."; - ASSERT_TRUE( - std::count_if(h_cugraph_srcs.begin(), - h_cugraph_srcs.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_srcs.begin(), + h_cugraph_srcs.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid source vertex IDs."; - ASSERT_TRUE( - std::count_if(h_cugraph_dsts.begin(), - h_cugraph_dsts.end(), - [num_vertices = static_cast(size_t{1} << configuration.scale)]( - auto v) { return !cugraph::test::is_valid_vertex(num_vertices, v); }) == 0) + ASSERT_TRUE(std::count_if(h_cugraph_dsts.begin(), + h_cugraph_dsts.end(), + [num_vertices = static_cast( + size_t{1} << configuration.scale)](auto v) { + return !cugraph::experimental::is_valid_vertex(num_vertices, v); + }) == 0) << "Returned R-mat graph edges have invalid destination vertex IDs."; if (!scramble) { @@ -281,5 +285,90 @@ INSTANTIATE_TEST_CASE_P(simple_test, GenerateRmat_Usecase(20, 16, 0.57, 0.19, 0.19, false), GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, true), GenerateRmat_Usecase(20, 16, 0.45, 0.22, 0.22, false))); +typedef struct GenerateRmats_Usecase_t { + size_t n_edgelists{0}; + size_t min_scale{0}; + size_t max_scale{0}; + size_t edge_factor{0}; + cugraph::experimental::generator_distribution_t component_distribution; + cugraph::experimental::generator_distribution_t edge_distribution; + + GenerateRmats_Usecase_t(size_t n_edgelists, + size_t min_scale, + size_t max_scale, + size_t edge_factor, + cugraph::experimental::generator_distribution_t component_distribution, + cugraph::experimental::generator_distribution_t edge_distribution) + : n_edgelists(n_edgelists), + min_scale(min_scale), + max_scale(max_scale), + component_distribution(component_distribution), + edge_distribution(edge_distribution), + edge_factor(edge_factor){}; +} GenerateRmats_Usecase; +class Tests_GenerateRmats : public ::testing::TestWithParam { + public: + Tests_GenerateRmats() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + template + void run_current_test(GenerateRmats_Usecase const& configuration) + { + raft::handle_t handle{}; + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + auto outputs = + cugraph::experimental::generate_rmat_edgelists(handle, + configuration.n_edgelists, + configuration.min_scale, + configuration.max_scale, + configuration.edge_factor, + configuration.component_distribution, + configuration.edge_distribution, + uint64_t{0}); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + ASSERT_EQ(configuration.n_edgelists, outputs.size()); + for (auto i = outputs.begin(); i != outputs.end(); ++i) { + ASSERT_EQ(std::get<0>(*i).size(), std::get<1>(*i).size()); + ASSERT_TRUE((configuration.min_scale * configuration.edge_factor) <= std::get<0>(*i).size()); + ASSERT_TRUE((configuration.max_scale * configuration.edge_factor) >= 
std::get<0>(*i).size());
+    }
+  }
+};
+
+TEST_P(Tests_GenerateRmats, CheckInt32) { run_current_test<int32_t>(GetParam()); }
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_GenerateRmats,
+  ::testing::Values(
+    GenerateRmats_Usecase(8,
+                          1,
+                          16,
+                          32,
+                          cugraph::experimental::generator_distribution_t::UNIFORM,
+                          cugraph::experimental::generator_distribution_t::UNIFORM),
+    GenerateRmats_Usecase(8,
+                          1,
+                          16,
+                          32,
+                          cugraph::experimental::generator_distribution_t::UNIFORM,
+                          cugraph::experimental::generator_distribution_t::POWER_LAW),
+    GenerateRmats_Usecase(8,
+                          3,
+                          16,
+                          32,
+                          cugraph::experimental::generator_distribution_t::POWER_LAW,
+                          cugraph::experimental::generator_distribution_t::UNIFORM),
+    GenerateRmats_Usecase(8,
+                          3,
+                          16,
+                          32,
+                          cugraph::experimental::generator_distribution_t::POWER_LAW,
+                          cugraph::experimental::generator_distribution_t::POWER_LAW)));
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/graph_test.cpp b/cpp/tests/experimental/graph_test.cpp
index 949f6d2e08e..6ce32e0c836 100644
--- a/cpp/tests/experimental/graph_test.cpp
+++ b/cpp/tests/experimental/graph_test.cpp
@@ -139,7 +139,7 @@ class Tests_Graph : public ::testing::TestWithParam {
       handle,
       edgelist,
       number_of_vertices,
-      cugraph::experimental::graph_properties_t{is_symmetric, false},
+      cugraph::experimental::graph_properties_t{is_symmetric, false, configuration.test_weighted},
       false,
       true);
diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp
index 776bb60716c..c7756699acd 100644
--- a/cpp/tests/experimental/katz_centrality_test.cpp
+++ b/cpp/tests/experimental/katz_centrality_test.cpp
@@ -14,11 +14,14 @@
 * limitations under the License.
 */
 
+#include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
@@ -34,6 +37,11 @@
 #include
 #include
+// do the perf measurements
+// enabled by the command line parameter '--perf'
+//
+static int PERF = 0;
+
 template
 void katz_centrality_reference(edge_t const* offsets,
                                vertex_t const* indices,
@@ -92,9 +100,12 @@
 typedef struct KatzCentrality_Usecase_t {
   cugraph::test::input_graph_specifier_t input_graph_specifier{};
 
   bool test_weighted{false};
+  bool check_correctness{false};
 
-  KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted)
-    : test_weighted(test_weighted)
+  KatzCentrality_Usecase_t(std::string const& graph_file_path,
+                           bool test_weighted,
+                           bool check_correctness = true)
+    : test_weighted(test_weighted), check_correctness(check_correctness)
   {
     std::string graph_file_full_path{};
     if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
@@ -107,15 +118,45 @@
   KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params,
-                           double personalization_ratio,
-                           bool test_weighted)
-    : test_weighted(test_weighted)
+                           bool test_weighted,
+                           bool check_correctness = true)
+    : test_weighted(test_weighted), check_correctness(check_correctness)
   {
     input_graph_specifier.tag         = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
     input_graph_specifier.rmat_params = rmat_params;
   }
 } KatzCentrality_Usecase;
 
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, true, false>,
+           rmm::device_uvector<vertex_t>>
+read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber)
+{
+  return configuration.input_graph_specifier.tag ==
+             cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+           ?
 class Tests_KatzCentrality : public ::testing::TestWithParam {
  public:
   Tests_KatzCentrality() {}
   static void SetupTestCase() {}
   static void TearDownTestCase() {}

   virtual void SetUp() {}
   virtual void TearDown() {}

@@ -128,117 +169,145 @@ class Tests_KatzCentrality : public ::testing::TestWithParam
   template 
   void run_current_test(KatzCentrality_Usecase const& configuration)
   {
+    constexpr bool renumber = true;
+
     raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }

     cugraph::experimental::graph_t graph(handle);
-    std::tie(graph, std::ignore) =
-      configuration.input_graph_specifier.tag ==
-          cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
-        ? cugraph::test::read_graph_from_matrix_market_file(
-            handle,
-            configuration.input_graph_specifier.graph_file_full_path,
-            configuration.test_weighted,
-            false)
-        : cugraph::test::generate_graph_from_rmat_params(
-            handle,
-            configuration.input_graph_specifier.rmat_params.scale,
-            configuration.input_graph_specifier.rmat_params.edge_factor,
-            configuration.input_graph_specifier.rmat_params.a,
-            configuration.input_graph_specifier.rmat_params.b,
-            configuration.input_graph_specifier.rmat_params.c,
-            configuration.input_graph_specifier.rmat_params.seed,
-            configuration.input_graph_specifier.rmat_params.undirected,
-            configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
-            configuration.test_weighted,
-            false);
-    auto graph_view = graph.view();
-
-    std::vector h_offsets(graph_view.get_number_of_vertices() + 1);
-    std::vector h_indices(graph_view.get_number_of_edges());
-    std::vector h_weights{};
-    raft::update_host(h_offsets.data(),
-                      graph_view.offsets(),
-                      graph_view.get_number_of_vertices() + 1,
-                      handle.get_stream());
-    raft::update_host(h_indices.data(),
-                      graph_view.indices(),
-                      graph_view.get_number_of_edges(),
-                      handle.get_stream());
-    if (graph_view.is_weighted()) {
-      h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0});
-      raft::update_host(h_weights.data(),
-                        graph_view.weights(),
-                        graph_view.get_number_of_edges(),
-                        handle.get_stream());
+    rmm::device_uvector d_renumber_map_labels(0, handle.get_stream());
+    std::tie(graph, d_renumber_map_labels) = read_graph(handle, configuration, renumber);
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "read_graph took " << elapsed_time * 1e-6 << " s.\n";
     }
-    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
-
-    std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices());
+    auto graph_view = graph.view();

-    std::vector tmps(h_offsets.size());
-    std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin());
-    auto max_it = std::max_element(tmps.begin(), tmps.end());
+    auto degrees = graph_view.compute_in_degrees(handle);
+    std::vector h_degrees(degrees.size());
+    raft::update_host(h_degrees.data(), degrees.data(), degrees.size(), handle.get_stream());
+    handle.get_stream_view().synchronize();
+    auto max_it = std::max_element(h_degrees.begin(), h_degrees.end());

     result_t const alpha = result_t{1.0} / static_cast(*max_it + 1);
     result_t constexpr beta{1.0};
     result_t constexpr epsilon{1e-6};
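    // A note on the choice of alpha above: the Katz series converges only when
    // alpha < 1 / lambda_max of the adjacency matrix, and that spectral radius is bounded
    // above by the maximum (in-)degree, so alpha = 1 / (max_degree + 1) is always safe.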
-    katz_centrality_reference(
-      h_offsets.data(),
-      h_indices.data(),
-      h_weights.size() > 0 ? h_weights.data() : static_cast(nullptr),
-      static_cast(nullptr),
-      h_reference_katz_centralities.data(),
-      graph_view.get_number_of_vertices(),
-      alpha,
-      beta,
-      epsilon,
-      std::numeric_limits::max(),
-      false,
-      true);
-
     rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(),
                                             handle.get_stream());

-    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }

     cugraph::experimental::katz_centrality(handle,
                                            graph_view,
                                            static_cast(nullptr),
-                                           d_katz_centralities.begin(),
+                                           d_katz_centralities.data(),
                                            alpha,
                                            beta,
                                            epsilon,
                                            std::numeric_limits::max(),
                                            false,
-                                           true,
-                                           false);
-
-    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-
-    std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices());
-
-    raft::update_host(h_cugraph_katz_centralities.data(),
-                      d_katz_centralities.data(),
-                      d_katz_centralities.size(),
-                      handle.get_stream());
-    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
-
-    auto threshold_ratio = 1e-3;
-    auto threshold_magnitude =
-      (1.0 / static_cast(graph_view.get_number_of_vertices())) *
-      threshold_ratio;  // skip comparison for low Katz Centrality verties (lowly ranked vertices)
-    auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
-      return std::abs(lhs - rhs) <
-             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
-    };
-
-    ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(),
-                           h_reference_katz_centralities.end(),
-                           h_cugraph_katz_centralities.begin(),
-                           nearly_equal))
-      << "Katz centrality values do not match with the reference values.";
+                                           true);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "Katz Centrality took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    if (configuration.check_correctness) {
+      cugraph::experimental::graph_t unrenumbered_graph(handle);
+      if (renumber) {
+        std::tie(unrenumbered_graph, std::ignore) = read_graph(handle, configuration, false);
+      }
+      auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view;
+
+      std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1);
+      std::vector h_indices(unrenumbered_graph_view.get_number_of_edges());
+      std::vector h_weights{};
+      raft::update_host(h_offsets.data(),
+                        unrenumbered_graph_view.offsets(),
+                        unrenumbered_graph_view.get_number_of_vertices() + 1,
+                        handle.get_stream());
+      raft::update_host(h_indices.data(),
+                        unrenumbered_graph_view.indices(),
+                        unrenumbered_graph_view.get_number_of_edges(),
+                        handle.get_stream());
+      if (unrenumbered_graph_view.is_weighted()) {
+        h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0});
+        raft::update_host(h_weights.data(),
+                          unrenumbered_graph_view.weights(),
+                          unrenumbered_graph_view.get_number_of_edges(),
+                          handle.get_stream());
+      }
+
+      handle.get_stream_view().synchronize();
+
+      std::vector h_reference_katz_centralities(
+        unrenumbered_graph_view.get_number_of_vertices());
+
+      katz_centrality_reference(
+        h_offsets.data(),
+        h_indices.data(),
+        h_weights.size() > 0 ? h_weights.data() : static_cast(nullptr),
+        static_cast(nullptr),
+        h_reference_katz_centralities.data(),
+        unrenumbered_graph_view.get_number_of_vertices(),
+        alpha,
+        beta,
+        epsilon,
+        std::numeric_limits::max(),
+        false,
+        true);
+
+      std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices());
+      if (renumber) {
+        auto d_unrenumbered_katz_centralities =
+          cugraph::test::sort_by_key(handle,
+                                     d_renumber_map_labels.data(),
+                                     d_katz_centralities.data(),
+                                     d_renumber_map_labels.size());
+        raft::update_host(h_cugraph_katz_centralities.data(),
+                          d_unrenumbered_katz_centralities.data(),
+                          d_unrenumbered_katz_centralities.size(),
+                          handle.get_stream());
+      } else {
+        raft::update_host(h_cugraph_katz_centralities.data(),
+                          d_katz_centralities.data(),
+                          d_katz_centralities.size(),
+                          handle.get_stream());
+      }
+
+      handle.get_stream_view().synchronize();
+
+      auto threshold_ratio = 1e-3;
+      auto threshold_magnitude =
+        (1.0 / static_cast(graph_view.get_number_of_vertices())) *
+        threshold_ratio;  // skip comparison for low Katz Centrality vertices (lowly ranked vertices)
+      auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+        return std::abs(lhs - rhs) <
+               std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+      };
+
+      ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(),
+                             h_reference_katz_centralities.end(),
+                             h_cugraph_katz_centralities.begin(),
+                             nearly_equal))
+        << "Katz centrality values do not match with the reference values.";
+    }
   }
 };
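// Why the comparison above combines two thresholds: the relative term
// (max(lhs, rhs) * threshold_ratio) is the right test for well-ranked vertices, while the
// absolute floor (threshold_ratio / num_vertices) keeps floating-point noise on near-zero
// scores of lowly ranked vertices from failing the check.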
@@ -252,6 +321,7 @@ INSTANTIATE_TEST_CASE_P(
   simple_test,
   Tests_KatzCentrality,
   ::testing::Values(
+    // enable correctness checks
     KatzCentrality_Usecase("test/datasets/karate.mtx", false),
     KatzCentrality_Usecase("test/datasets/karate.mtx", true),
     KatzCentrality_Usecase("test/datasets/web-Google.mtx", false),
@@ -261,16 +331,15 @@ INSTANTIATE_TEST_CASE_P(
     KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false),
     KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true),
     KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
-                           0.0,
                            false),
     KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
-                           0.5,
-                           false),
-    KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
-                           0.0,
                            true),
-    KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false},
-                           0.5,
-                           true)));
+    // disable correctness checks for large graphs
+    KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false},
+                           false,
+                           false),
+    KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false},
+                           true,
+                           false)));

 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu
deleted file mode 100644
index 56fb2c109bf..00000000000
--- a/cpp/tests/experimental/louvain_test.cu
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governin_from_mtxg permissions and
- * limitations under the License.
- */
-
-#include 
-#include 
-
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#include 
-
-#include 
-#include 
-#include 
-
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-typedef struct Louvain_Usecase_t {
-  std::string graph_file_full_path{};
-  bool test_weighted{false};
-
-  Louvain_Usecase_t(std::string const& graph_file_path, bool test_weighted)
-    : test_weighted(test_weighted)
-  {
-    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
-      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
-    } else {
-      graph_file_full_path = graph_file_path;
-    }
-  };
-} Louvain_Usecase;
-
-class Tests_Louvain : public ::testing::TestWithParam {
- public:
-  Tests_Louvain() {}
-  static void SetupTestCase() {}
-  static void TearDownTestCase() {}
-
-  virtual void SetUp() {}
-  virtual void TearDown() {}
-
-  template 
-  void run_current_test(Louvain_Usecase const& configuration)
-  {
-    raft::handle_t handle{};
-
-    std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl;
-
-    cugraph::experimental::graph_t graph(handle);
-    std::tie(graph, std::ignore) =
-      cugraph::test::read_graph_from_matrix_market_file(
-        handle, configuration.graph_file_full_path, configuration.test_weighted, false);
-
-    auto graph_view = graph.view();
-
-    // "FIXME": remove this check once we drop support for Pascal
-    //
-    // Calling louvain on Pascal will throw an exception, we'll check that
-    // this is the behavior while we still support Pascal (device_prop.major < 7)
-    //
-    cudaDeviceProp device_prop;
-    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0));
-
-    if (device_prop.major < 7) {
-      EXPECT_THROW(louvain(graph_view), cugraph::logic_error);
-    } else {
-      louvain(graph_view);
-    }
-  }
-
-  template 
-  void louvain(graph_t const& graph_view)
-  {
-    using vertex_t = typename graph_t::vertex_type;
-    using weight_t = typename graph_t::weight_type;
-
-    raft::handle_t handle{};
-
-    rmm::device_vector clustering_v(graph_view.get_number_of_local_vertices());
-    size_t level;
-    weight_t modularity;
-
-    std::tie(level, modularity) =
-      cugraph::louvain(handle, graph_view, clustering_v.data().get(), size_t{100}, weight_t{1});
-
-    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
-
-    std::cout << "level = " << level << std::endl;
-    std::cout << "modularity = " << modularity << std::endl;
-  }
-};
-
-// FIXME: add tests for type combinations
-TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat)
-{
-  run_current_test(GetParam());
-}
-
-INSTANTIATE_TEST_CASE_P(simple_test,
-                        Tests_Louvain,
-                        ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true)
-#if 0
-                        ,
-                        Louvain_Usecase("test/datasets/web-Google.mtx", true),
-                        Louvain_Usecase("test/datasets/ljournal-2008.mtx", true),
-                        Louvain_Usecase("test/datasets/webbase-1M.mtx", true)
-#endif
-                        ));
-
-CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/mg_bfs_test.cpp b/cpp/tests/experimental/mg_bfs_test.cpp
new file mode 100644
index 00000000000..64ffedd2492
--- /dev/null
+++ b/cpp/tests/experimental/mg_bfs_test.cpp
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+typedef struct BFS_Usecase_t {
+  cugraph::test::input_graph_specifier_t input_graph_specifier{};
+
+  size_t source{0};
+  bool check_correctness{false};
+
+  BFS_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
+  {
+    std::string graph_file_full_path{};
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+    input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH;
+    input_graph_specifier.graph_file_full_path = graph_file_full_path;
+  };
+
+  BFS_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+                size_t source,
+                bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
+  {
+    input_graph_specifier.tag         = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
+    input_graph_specifier.rmat_params = rmat_params;
+  }
+} BFS_Usecase;
+
+template 
+std::tuple, rmm::device_uvector>
+read_graph(raft::handle_t const& handle, BFS_Usecase const& configuration, bool renumber)
+{
+  auto& comm           = handle.get_comms();
+  auto const comm_size = comm.get_size();
+  auto const comm_rank = comm.get_rank();
+
+  std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size));
+  std::iota(partition_ids.begin(),
+            partition_ids.end(),
+            multi_gpu ? static_cast(comm_rank) : size_t{0});
+
+  return configuration.input_graph_specifier.tag ==
+             cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+           ? cugraph::test::read_graph_from_matrix_market_file(
+               handle, configuration.input_graph_specifier.graph_file_full_path, false, renumber)
+           : cugraph::test::generate_graph_from_rmat_params(
+               handle,
+               configuration.input_graph_specifier.rmat_params.scale,
+               configuration.input_graph_specifier.rmat_params.edge_factor,
+               configuration.input_graph_specifier.rmat_params.a,
+               configuration.input_graph_specifier.rmat_params.b,
+               configuration.input_graph_specifier.rmat_params.c,
+               configuration.input_graph_specifier.rmat_params.seed,
+               configuration.input_graph_specifier.rmat_params.undirected,
+               configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
+               false,
+               renumber,
+               partition_ids,
+               static_cast(comm_size));
+}
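// Two details of the helper above worth spelling out. First, partition_ids: when multi_gpu is
// false each process reads all comm_size partitions (so the single-GPU reference run sees the
// whole graph), while the MG path reads exactly the one partition owned by this rank. Second,
// the tests below build their 2D process grid by taking the largest divisor of comm_size that
// does not exceed sqrt(comm_size); a standalone restatement of that search (hypothetical
// helper name, for illustration):
#include <cmath>

int choose_row_comm_size(int comm_size)  // e.g. 12 ranks -> 3 x 4 grid, 16 ranks -> 4 x 4
{
  auto row_comm_size = static_cast<int>(std::sqrt(static_cast<double>(comm_size)));
  while (comm_size % row_comm_size != 0) { --row_comm_size; }  // always terminates at 1
  return row_comm_size;
}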
+
+class Tests_MGBFS : public ::testing::TestWithParam {
+ public:
+  Tests_MGBFS() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running BFS on multiple GPUs to that of a single-GPU run
+  template 
+  void run_current_test(BFS_Usecase const& configuration)
+  {
+    using weight_t = float;
+
+    // 1. initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
+    while (comm_size % row_comm_size != 0) { --row_comm_size; }
+    cugraph::partition_2d::subcomm_factory_t
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+    cugraph::experimental::graph_t mg_graph(handle);
+    rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream());
+    std::tie(mg_graph, d_mg_renumber_map_labels) = read_graph(handle, configuration, true);
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG read_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    ASSERT_TRUE(static_cast(configuration.source) >= 0 &&
+                static_cast(configuration.source) <
+                  mg_graph_view.get_number_of_vertices())
+      << "Invalid starting source.";
+
+    // 3. run MG BFS
+
+    rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(),
+                                       handle.get_stream());
+    rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(),
+                                          handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+
+    cugraph::experimental::bfs(handle,
+                               mg_graph_view,
+                               d_mg_distances.data(),
+                               d_mg_predecessors.data(),
+                               static_cast(configuration.source),
+                               false,
+                               std::numeric_limits::max());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG BFS took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    // 5. compare SG & MG results
+
+    if (configuration.check_correctness) {
+      // 5-1. create SG graph
+
+      cugraph::experimental::graph_t sg_graph(handle);
+      std::tie(sg_graph, std::ignore) = read_graph(handle, configuration, false);
+
+      auto sg_graph_view = sg_graph.view();
+
+      std::vector vertex_partition_lasts(comm_size);
+      for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) {
+        vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i);
+      }
+
+      rmm::device_scalar d_source(static_cast(configuration.source),
+                                  handle.get_stream());
+      cugraph::experimental::unrenumber_int_vertices(
+        handle,
+        d_source.data(),
+        size_t{1},
+        d_mg_renumber_map_labels.data(),
+        mg_graph_view.get_local_vertex_first(),
+        mg_graph_view.get_local_vertex_last(),
+        vertex_partition_lasts,
+        true);
+      auto unrenumbered_source = d_source.value(handle.get_stream());
+
+      // 5-2. run SG BFS
+
+      rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(),
+                                         handle.get_stream());
+      rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(),
+                                            handle.get_stream());
+
+      cugraph::experimental::bfs(handle,
+                                 sg_graph_view,
+                                 d_sg_distances.data(),
+                                 d_sg_predecessors.data(),
+                                 unrenumbered_source,
+                                 false,
+                                 std::numeric_limits::max());
+
+      // 5-3. compare
+
+      std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1);
+      std::vector h_sg_indices(sg_graph_view.get_number_of_edges());
+      raft::update_host(h_sg_offsets.data(),
+                        sg_graph_view.offsets(),
+                        sg_graph_view.get_number_of_vertices() + 1,
+                        handle.get_stream());
+      raft::update_host(h_sg_indices.data(),
+                        sg_graph_view.indices(),
+                        sg_graph_view.get_number_of_edges(),
+                        handle.get_stream());
+
+      std::vector h_sg_distances(sg_graph_view.get_number_of_vertices());
+      std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices());
+      raft::update_host(
+        h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream());
+      raft::update_host(h_sg_predecessors.data(),
+                        d_sg_predecessors.data(),
+                        d_sg_predecessors.size(),
+                        handle.get_stream());
+
+      std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices());
+      std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices());
+      raft::update_host(
+        h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream());
+      cugraph::experimental::unrenumber_int_vertices(
+        handle,
+        d_mg_predecessors.data(),
+        d_mg_predecessors.size(),
+        d_mg_renumber_map_labels.data(),
+        mg_graph_view.get_local_vertex_first(),
+        mg_graph_view.get_local_vertex_last(),
+        vertex_partition_lasts,
+        true);
+      raft::update_host(h_mg_predecessors.data(),
+                        d_mg_predecessors.data(),
+                        d_mg_predecessors.size(),
+                        handle.get_stream());
+
+      std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size());
+      raft::update_host(h_mg_renumber_map_labels.data(),
+                        d_mg_renumber_map_labels.data(),
+                        d_mg_renumber_map_labels.size(),
+                        handle.get_stream());
+
+      handle.get_stream_view().synchronize();
+
+      for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) {
+        auto mapped_vertex = h_mg_renumber_map_labels[i];
+        ASSERT_TRUE(h_mg_distances[i] == h_sg_distances[mapped_vertex])
+          << "MG BFS distance for vertex: " << mapped_vertex << " in rank: " << comm_rank
+          << " has value: " << h_mg_distances[i]
+          << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex];
+        if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) {
+          ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i])
+            << "vertex reachability does not match with the SG result.";
+        } else {
+          ASSERT_TRUE(h_sg_distances[h_mg_predecessors[i]] + 1 == h_sg_distances[mapped_vertex])
+            << "distances to this vertex != distances to the predecessor vertex + 1.";
+          bool found{false};
+          for (auto j = h_sg_offsets[h_mg_predecessors[i]];
+               j < h_sg_offsets[h_mg_predecessors[i] + 1];
+               ++j) {
+            if (h_sg_indices[j] == mapped_vertex) {
+              found = true;
+              break;
+            }
+          }
+          ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex.";
+        }
+      }
+    }
+  }
+};
+
+TEST_P(Tests_MGBFS, CheckInt32Int32) { run_current_test(GetParam()); }
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_MGBFS,
+  ::testing::Values(
+    // enable correctness checks
+    BFS_Usecase("test/datasets/karate.mtx", 0),
+    BFS_Usecase("test/datasets/web-Google.mtx", 0),
+    BFS_Usecase("test/datasets/ljournal-2008.mtx", 0),
+    BFS_Usecase("test/datasets/webbase-1M.mtx", 0),
+    BFS_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0),
+    // disable correctness checks for large graphs
+    BFS_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false},
+                0,
+                false)));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
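// The per-vertex checks in Tests_MGBFS reduce to three invariants: distances must match the
// SG run exactly, unreachable vertices must agree, and any reported predecessor must be one
// level closer with a real edge to the vertex. A compact host-side restatement (hypothetical
// helper; int32 vertices and int64 offsets assumed for concreteness):
#include <cstddef>
#include <cstdint>
#include <vector>

bool validate_bfs_tree(std::vector<int64_t> const& offsets,  // SG CSR offsets
                       std::vector<int32_t> const& indices,  // SG CSR indices
                       std::vector<int32_t> const& dist,
                       std::vector<int32_t> const& pred,
                       int32_t invalid_vertex)
{
  for (std::size_t v = 0; v < dist.size(); ++v) {
    if (pred[v] == invalid_vertex) { continue; }         // source or unreachable vertex
    if (dist[v] != dist[pred[v]] + 1) { return false; }  // predecessor one level closer
    bool edge_found = false;                             // edge pred[v] -> v must exist
    for (auto e = offsets[pred[v]]; e < offsets[pred[v] + 1]; ++e) {
      if (indices[e] == static_cast<int32_t>(v)) {
        edge_found = true;
        break;
      }
    }
    if (!edge_found) { return false; }
  }
  return true;
}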
diff --git a/cpp/tests/experimental/mg_katz_centrality_test.cpp b/cpp/tests/experimental/mg_katz_centrality_test.cpp
new file mode 100644
index 00000000000..937bd33472b
--- /dev/null
+++ b/cpp/tests/experimental/mg_katz_centrality_test.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+typedef struct KatzCentrality_Usecase_t {
+  cugraph::test::input_graph_specifier_t input_graph_specifier{};
+
+  bool test_weighted{false};
+  bool check_correctness{false};
+
+  KatzCentrality_Usecase_t(std::string const& graph_file_path,
+                           bool test_weighted,
+                           bool check_correctness = true)
+    : test_weighted(test_weighted), check_correctness(check_correctness)
+  {
+    std::string graph_file_full_path{};
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+    input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH;
+    input_graph_specifier.graph_file_full_path = graph_file_full_path;
+  };
+
+  KatzCentrality_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+                           bool test_weighted,
+                           bool check_correctness = true)
+    : test_weighted(test_weighted), check_correctness(check_correctness)
+  {
+    input_graph_specifier.tag         = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
+    input_graph_specifier.rmat_params = rmat_params;
+  }
+} KatzCentrality_Usecase;
+
+template 
+std::tuple, rmm::device_uvector>
+read_graph(raft::handle_t const& handle, KatzCentrality_Usecase const& configuration, bool renumber)
+{
+  auto& comm           = handle.get_comms();
+  auto const comm_size = comm.get_size();
+  auto const comm_rank = comm.get_rank();
+
+  std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size));
+  std::iota(partition_ids.begin(),
+            partition_ids.end(),
+            multi_gpu ? static_cast(comm_rank) : size_t{0});
+
+  return configuration.input_graph_specifier.tag ==
+             cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+           ? cugraph::test::read_graph_from_matrix_market_file(
+               handle,
+               configuration.input_graph_specifier.graph_file_full_path,
+               configuration.test_weighted,
+               renumber)
+           : cugraph::test::generate_graph_from_rmat_params(
+               handle,
+               configuration.input_graph_specifier.rmat_params.scale,
+               configuration.input_graph_specifier.rmat_params.edge_factor,
+               configuration.input_graph_specifier.rmat_params.a,
+               configuration.input_graph_specifier.rmat_params.b,
+               configuration.input_graph_specifier.rmat_params.c,
+               configuration.input_graph_specifier.rmat_params.seed,
+               configuration.input_graph_specifier.rmat_params.undirected,
+               configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
+               configuration.test_weighted,
+               renumber,
+               partition_ids,
+               static_cast(comm_size));
+}
+
+class Tests_MGKatzCentrality : public ::testing::TestWithParam {
+ public:
+  Tests_MGKatzCentrality() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running Katz Centrality on multiple GPUs to that of a single-GPU run
+  template 
+  void run_current_test(KatzCentrality_Usecase const& configuration)
+  {
+    // 1. initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
+    while (comm_size % row_comm_size != 0) { --row_comm_size; }
+    cugraph::partition_2d::subcomm_factory_t
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+    cugraph::experimental::graph_t mg_graph(handle);
+    rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream());
+    std::tie(mg_graph, d_mg_renumber_map_labels) = read_graph(handle, configuration, true);
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG read_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    // 3. compute max in-degree
+
+    auto max_in_degree = mg_graph_view.compute_max_in_degree(handle);
+
+    // 4. run MG Katz Centrality
+
+    result_t const alpha = result_t{1.0} / static_cast(max_in_degree + 1);
+    result_t constexpr beta{1.0};
+    result_t constexpr epsilon{1e-6};
+
+    rmm::device_uvector d_mg_katz_centralities(
+      mg_graph_view.get_number_of_local_vertices(), handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+
+    cugraph::experimental::katz_centrality(handle,
+                                           mg_graph_view,
+                                           static_cast(nullptr),
+                                           d_mg_katz_centralities.data(),
+                                           alpha,
+                                           beta,
+                                           epsilon,
+                                           std::numeric_limits::max(),
+                                           false);
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG Katz Centrality took " << elapsed_time * 1e-6 << " s.\n";
+    }
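    // Unlike the single-GPU test, which validates against a host reference implementation, the
    // comparison below treats a fresh single-GPU run of the same device code as ground truth;
    // agreement therefore exercises the 2D partitioning, renumbering, and communication paths
    // rather than the numerics of the algorithm itself.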
compare + + std::vector h_sg_katz_centralities(sg_graph_view.get_number_of_vertices()); + raft::update_host(h_sg_katz_centralities.data(), + d_sg_katz_centralities.data(), + d_sg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_katz_centralities(mg_graph_view.get_number_of_local_vertices()); + raft::update_host(h_mg_katz_centralities.data(), + d_mg_katz_centralities.data(), + d_mg_katz_centralities.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low KatzCentrality verties (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_katz_centralities[i], h_sg_katz_centralities[mapped_vertex])) + << "MG KatzCentrality value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_katz_centralities[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_katz_centralities[mapped_vertex]; + } + } + } +}; + +TEST_P(Tests_MGKatzCentrality, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGKatzCentrality, + ::testing::Values( + // enable correctness checks + KatzCentrality_Usecase("test/datasets/karate.mtx", false), + KatzCentrality_Usecase("test/datasets/karate.mtx", true), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, + true), + // disable correctness checks for large graphs + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + false, + false), + KatzCentrality_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + true, + false))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp new file mode 100644 index 00000000000..de39b8da128 --- /dev/null +++ b/cpp/tests/experimental/mg_sssp_test.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/experimental/mg_sssp_test.cpp b/cpp/tests/experimental/mg_sssp_test.cpp
new file mode 100644
index 00000000000..de39b8da128
--- /dev/null
+++ b/cpp/tests/experimental/mg_sssp_test.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+// do the perf measurements
+// enabled by command line parameter '--perf'
+//
+static int PERF = 0;
+
+typedef struct SSSP_Usecase_t {
+  cugraph::test::input_graph_specifier_t input_graph_specifier{};
+
+  size_t source{0};
+  bool check_correctness{false};
+
+  SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
+  {
+    std::string graph_file_full_path{};
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+    input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH;
+    input_graph_specifier.graph_file_full_path = graph_file_full_path;
+  };
+
+  SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params,
+                 size_t source,
+                 bool check_correctness = true)
+    : source(source), check_correctness(check_correctness)
+  {
+    input_graph_specifier.tag         = cugraph::test::input_graph_specifier_t::RMAT_PARAMS;
+    input_graph_specifier.rmat_params = rmat_params;
+  }
+} SSSP_Usecase;
+
+template 
+std::tuple, rmm::device_uvector>
+read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber)
+{
+  auto& comm           = handle.get_comms();
+  auto const comm_size = comm.get_size();
+  auto const comm_rank = comm.get_rank();
+
+  std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size));
+  std::iota(partition_ids.begin(),
+            partition_ids.end(),
+            multi_gpu ? static_cast(comm_rank) : size_t{0});
+
+  return configuration.input_graph_specifier.tag ==
+             cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH
+           ? cugraph::test::read_graph_from_matrix_market_file(
+               handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber)
+           : cugraph::test::generate_graph_from_rmat_params(
+               handle,
+               configuration.input_graph_specifier.rmat_params.scale,
+               configuration.input_graph_specifier.rmat_params.edge_factor,
+               configuration.input_graph_specifier.rmat_params.a,
+               configuration.input_graph_specifier.rmat_params.b,
+               configuration.input_graph_specifier.rmat_params.c,
+               configuration.input_graph_specifier.rmat_params.seed,
+               configuration.input_graph_specifier.rmat_params.undirected,
+               configuration.input_graph_specifier.rmat_params.scramble_vertex_ids,
+               true,
+               renumber,
+               partition_ids,
+               static_cast(comm_size));
+}
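// Note that both branches above hard-code the weighted flag to true: SSSP requires edge
// weights, whereas the BFS variant of read_graph passes false and ignores weights entirely.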
+
+class Tests_MGSSSP : public ::testing::TestWithParam {
+ public:
+  Tests_MGSSSP() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Compare the results of running SSSP on multiple GPUs to that of a single-GPU run
+  template 
+  void run_current_test(SSSP_Usecase const& configuration)
+  {
+    // 1. initialize handle
+
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
+
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+    auto const comm_rank = comm.get_rank();
+
+    auto row_comm_size = static_cast(sqrt(static_cast(comm_size)));
+    while (comm_size % row_comm_size != 0) { --row_comm_size; }
+    cugraph::partition_2d::subcomm_factory_t
+      subcomm_factory(handle, row_comm_size);
+
+    // 2. create MG graph
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+    cugraph::experimental::graph_t mg_graph(handle);
+    rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream());
+    std::tie(mg_graph, d_mg_renumber_map_labels) = read_graph(handle, configuration, true);
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG read_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    auto mg_graph_view = mg_graph.view();
+
+    ASSERT_TRUE(static_cast(configuration.source) >= 0 &&
+                static_cast(configuration.source) <
+                  mg_graph_view.get_number_of_vertices())
+      << "Invalid starting source.";
+
+    // 3. run MG SSSP
+
+    rmm::device_uvector d_mg_distances(mg_graph_view.get_number_of_local_vertices(),
+                                       handle.get_stream());
+    rmm::device_uvector d_mg_predecessors(mg_graph_view.get_number_of_local_vertices(),
+                                          handle.get_stream());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      hr_clock.start();
+    }
+
+    // FIXME: disable do_expensive_check
+    cugraph::experimental::sssp(handle,
+                                mg_graph_view,
+                                d_mg_distances.data(),
+                                d_mg_predecessors.data(),
+                                static_cast(configuration.source),
+                                std::numeric_limits::max());
+
+    if (PERF) {
+      CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG SSSP took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
+    // 5. compare SG & MG results
+
+    if (configuration.check_correctness) {
+      // 5-1. create SG graph
+
+      cugraph::experimental::graph_t sg_graph(handle);
+      std::tie(sg_graph, std::ignore) = read_graph(handle, configuration, false);
+
+      auto sg_graph_view = sg_graph.view();
+
+      std::vector vertex_partition_lasts(comm_size);
+      for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) {
+        vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i);
+      }
+
+      rmm::device_scalar d_source(static_cast(configuration.source),
+                                  handle.get_stream());
+      cugraph::experimental::unrenumber_int_vertices(
+        handle,
+        d_source.data(),
+        size_t{1},
+        d_mg_renumber_map_labels.data(),
+        mg_graph_view.get_local_vertex_first(),
+        mg_graph_view.get_local_vertex_last(),
+        vertex_partition_lasts,
+        true);
+      auto unrenumbered_source = d_source.value(handle.get_stream());
+
+      // 5-2. run SG SSSP
+
+      rmm::device_uvector d_sg_distances(sg_graph_view.get_number_of_local_vertices(),
+                                         handle.get_stream());
+      rmm::device_uvector d_sg_predecessors(sg_graph_view.get_number_of_local_vertices(),
+                                            handle.get_stream());
+
+      // FIXME: disable do_expensive_check
+      cugraph::experimental::sssp(handle,
+                                  sg_graph_view,
+                                  d_sg_distances.data(),
+                                  d_sg_predecessors.data(),
+                                  unrenumbered_source,
+                                  std::numeric_limits::max());
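      // The comparison below cannot demand that MG and SG pick the same predecessor, since
      // shortest-path trees are not unique. Instead it accepts a predecessor p for vertex v iff
      // some edge (p, v) exists whose weight satisfies dist(p) + w(p, v) ~= dist(v), within an
      // epsilon scaled to the largest edge weight, tolerating floating-point accumulation along
      // different but equally short paths.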
compare + + std::vector h_sg_offsets(sg_graph_view.get_number_of_vertices() + 1); + std::vector h_sg_indices(sg_graph_view.get_number_of_edges()); + std::vector h_sg_weights(sg_graph_view.get_number_of_edges()); + raft::update_host(h_sg_offsets.data(), + sg_graph_view.offsets(), + sg_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_sg_indices.data(), + sg_graph_view.indices(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_sg_weights.data(), + sg_graph_view.weights(), + sg_graph_view.get_number_of_edges(), + handle.get_stream()); + + std::vector h_sg_distances(sg_graph_view.get_number_of_vertices()); + std::vector h_sg_predecessors(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_distances.data(), d_sg_distances.data(), d_sg_distances.size(), handle.get_stream()); + raft::update_host(h_sg_predecessors.data(), + d_sg_predecessors.data(), + d_sg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_distances(mg_graph_view.get_number_of_local_vertices()); + std::vector h_mg_predecessors(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_distances.data(), d_mg_distances.data(), d_mg_distances.size(), handle.get_stream()); + cugraph::experimental::unrenumber_int_vertices( + handle, + d_mg_predecessors.data(), + d_mg_predecessors.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + true); + raft::update_host(h_mg_predecessors.data(), + d_mg_predecessors.data(), + d_mg_predecessors.size(), + handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto max_weight_element = std::max_element(h_sg_weights.begin(), h_sg_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_distances[i], h_sg_distances[mapped_vertex])) + << "MG SSSP distance for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_distances[i] + << " different from the corresponding SG value: " << h_sg_distances[mapped_vertex]; + if (h_mg_predecessors[i] == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_sg_predecessors[mapped_vertex] == h_mg_predecessors[i]) + << "vertex reachability does not match with the SG result."; + } else { + auto pred_distance = h_sg_distances[h_mg_predecessors[i]]; + bool found{false}; + for (auto j = h_sg_offsets[h_mg_predecessors[i]]; + j < h_sg_offsets[h_mg_predecessors[i] + 1]; + ++j) { + if (h_sg_indices[j] == mapped_vertex) { + if (nearly_equal(pred_distance + h_sg_weights[j], h_sg_distances[mapped_vertex])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } + } +}; + +TEST_P(Tests_MGSSSP, CheckInt32Int32Float) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MGSSSP, + ::testing::Values( + // enable correctness checks + SSSP_Usecase("test/datasets/karate.mtx", 0), + 
+    SSSP_Usecase("test/datasets/dblp.mtx", 0),
+    SSSP_Usecase("test/datasets/wiki2003.mtx", 1000),
+    SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0),
+    // disable correctness checks for large graphs
+    SSSP_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false},
+                 0,
+                 false)));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/ms_bfs_test.cpp b/cpp/tests/experimental/ms_bfs_test.cpp
new file mode 100644
index 00000000000..264382c22a3
--- /dev/null
+++ b/cpp/tests/experimental/ms_bfs_test.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+typedef struct MsBfs_Usecase_t {
+  std::string graph_file_full_path{};
+  std::vector sources{};
+  int32_t radius;
+  bool test_weighted{false};
+
+  MsBfs_Usecase_t(std::string const& graph_file_path,
+                  std::vector const& sources,
+                  int32_t radius,
+                  bool test_weighted)
+    : sources(sources), radius(radius), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} MsBfs_Usecase;
+
+class Tests_MsBfs : public ::testing::TestWithParam {
+ public:
+  Tests_MsBfs() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template 
+  void run_current_test(MsBfs_Usecase const& configuration)
+  {
+    auto n_seeds  = configuration.sources.size();
+    int n_streams = std::min(n_seeds, static_cast(128));
+    raft::handle_t handle(n_streams);
+
+    cugraph::experimental::graph_t graph(handle);
+    std::tie(graph, std::ignore) =
+      cugraph::test::read_graph_from_matrix_market_file(
+        handle, configuration.graph_file_full_path, configuration.test_weighted, false);
+    auto graph_view = graph.view();
+
+    // Streams will allocate concurrently later
+    std::vector> d_distances{};
+    std::vector> d_predecessors{};
+
+    d_distances.reserve(n_seeds);
+    d_predecessors.reserve(n_seeds);
+    for (vertex_t i = 0; i < n_seeds; i++) {
+      // Allocations and operations are attached to the worker stream
+      rmm::device_uvector tmp_distances(graph_view.get_number_of_vertices(),
+                                        handle.get_internal_stream_view(i));
+      rmm::device_uvector tmp_predecessors(graph_view.get_number_of_vertices(),
+                                           handle.get_internal_stream_view(i));
+
+      d_distances.push_back(std::move(tmp_distances));
+      d_predecessors.push_back(std::move(tmp_predecessors));
+    }
+
+    std::vector radius(n_seeds);
+    std::generate(radius.begin(), radius.end(), [n = 0]() mutable { return (n++ % 12 + 1); });
+
+    // warm up
+    cugraph::experimental::bfs(handle,
+                               graph_view,
+                               d_distances[0].begin(),
d_predecessors[0].begin(), + static_cast(configuration.sources[0]), + false, + radius[0]); + + // one by one + HighResTimer hr_timer; + hr_timer.start("bfs"); + cudaProfilerStart(); + for (vertex_t i = 0; i < n_seeds; i++) { + cugraph::experimental::bfs(handle, + graph_view, + d_distances[i].begin(), + d_predecessors[i].begin(), + static_cast(configuration.sources[i]), + false, + radius[i]); + } + cudaProfilerStop(); + hr_timer.stop(); + hr_timer.display(std::cout); + + // concurrent + hr_timer.start("bfs"); + cudaProfilerStart(); +#pragma omp parallel for + for (vertex_t i = 0; i < n_seeds; i++) { + raft::handle_t light_handle(handle, i); + auto worker_stream_view = light_handle.get_stream_view(); + cugraph::experimental::bfs(light_handle, + graph_view, + d_distances[i].begin(), + d_predecessors[i].begin(), + static_cast(configuration.sources[i]), + false, + radius[i]); + } + + cudaProfilerStop(); + hr_timer.stop(); + hr_timer.display(std::cout); + } +}; + +TEST_P(Tests_MsBfs, DISABLED_CheckInt32Int32FloatUntransposed) +{ + run_current_test(GetParam()); +} +/* +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MsBfs, + ::testing::Values( + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{0}, 1, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{0}, 2, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{1}, 3, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{10, 0, 5}, 2, false), + MsBfs_Usecase("test/datasets/karate.mtx", std::vector{9, 3, 10}, 2, false), + MsBfs_Usecase( + "test/datasets/karate.mtx", std::vector{5, 9, 3, 10, 12, 13}, 2, true))); +*/ +// For perf analysis + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_MsBfs, + ::testing::Values( + MsBfs_Usecase("test/datasets/soc-LiveJournal1.mtx", std::vector{363617}, 2, false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755}, + 2, + false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627}, + 2, + false), + MsBfs_Usecase( + "test/datasets/soc-LiveJournal1.mtx", + std::vector{ + 363617, 722214, 2337449, 2510183, 2513389, 225853, 2035807, 3836330, 1865496, 28755, + 2536834, 3070144, 3888415, 3131712, 2382526, 1040771, 2631543, 4607218, 4465829, 3341686, + 2772973, 2611175, 4526129, 2624421, 1220593, 2593137, 3270705, 1503899, 1213033, 4840102, + 4529036, 3421116, 4264831, 4089751, 4272322, 3486998, 2830318, 320953, 2388331, 520808, + 3023094, 1600294, 3631119, 1716614, 4829213, 1175844, 960680, 847662, 3277365, 3957318, + 3455123, 2454259, 670953, 4465677, 1027332, 2560721, 89061, 
1163406, 3109528, 3221856, + 4714426, 2382774, 37828, 4433616, 3283229, 591911, 4200188, 442522, 872207, 2437601, + 741003, 266241, 914618, 3626195, 2021080, 4679624, 777476, 2527796, 1114017, 640142, + 49259, 4069879, 3869098, 1105040, 4707804, 3208582, 3325885, 1450601, 4072548, 2037062, + 2029646, 4575891, 1488598, 79105, 4827273, 3795434, 4647518, 4733397, 3980718, 1184627, + 984983, 3114832, 1967741, 1599818, 144593, 2698770, 2889449, 2495550, 1053813, 1193622, + 686026, 3989015, 2040719, 4693428, 3190376, 2926728, 3399030, 1664419, 662429, 4526841, + 2186957, 3752558, 2440046, 2930226, 3633006, 4058166, 3137060, 3499296, 2126343, 148971, + 2199672, 275811, 2813976, 2274536, 1189239, 1335942, 2465624, 2596042, 829684, 193400, + 2682845, 3691697, 4022437, 4051170, 4195175, 2876420, 3984220, 2174475, 326134, 2606530, + 2493046, 4706121, 1498980, 4576225, 1271339, 44832, 1875673, 4664940, 134931, 736397, + 4333554, 2751031, 2163610, 2879676, 3174153, 3317403, 2052464, 1881883, 4757859, 3596257, + 2358088, 2578758, 447504, 590720, 1717038, 1869795, 1133885, 3027521, 840312, 2818881, + 3654321, 2730947, 353585, 1134903, 2223378, 1508824, 3662521, 1363776, 2712071, 288441, + 1204581, 3502242, 4645567, 2767267, 1514366, 3956099, 1422145, 1216608, 2253360, 189132, + 4238225, 1345783, 451571, 1599442, 3237284, 4711405, 929446, 1857675, 150759, 1277633, + 761210, 138628, 1026833, 2599544, 2464737, 989203, 3399615, 2144292, 216142, 637312, + 2044964, 716256, 1660632, 1762919, 4784357, 2213415, 2764769, 291806, 609772, 3264819, + 1870953, 1516385, 235647, 1045474, 2664957, 819095, 1824119, 4045271, 4448109, 1676788, + 4285177, 1580502, 3546548, 2771971, 3927086, 1339779, 3156204, 1730998, 1172522, 2433024, + 4533449, 479930, 2010695, 672994, 3542039, 3176455, 26352, 2137735, 866910, 4410835, + 2623982, 3603159, 2555625, 2765653, 267865, 2015523, 1009052, 4713994, 1600667, 2176195, + 3179631, 4570390, 2018424, 3356384, 1784287, 894861, 3622099, 1647273, 3044136, 950354, + 1491760, 3416929, 3757300, 2244912, 4129215, 1600848, 3867343, 72329, 919189, 992521, + 3445975, 4712557, 4680974, 188419, 2612093, 1991268, 3566207, 2281468, 3859078, 2492806, + 3398628, 763441, 2679107, 2554420, 2130132, 4664374, 1182901, 3890770, 4714667, 4209303, + 4013060, 3617653, 2040022, 3296519, 4190671, 1693353, 2678411, 3788834, 2781815, 191965, + 1083926, 503974, 3529226, 1650522, 1900976, 542080, 3423929, 3418905, 878165, 4701703, + 3022790, 4316365, 76365, 4053672, 1358185, 3830478, 4445661, 3210024, 1895915, 4541133, + 2938808, 562788, 3920065, 1458776, 4052046, 2967475, 1092809, 3203538, 159626, 3399464, + 214467, 3343982, 1811854, 3189045, 4272117, 4701563, 424807, 4341116, 760545, 4674683, + 1538018, 386762, 194237, 2162719, 1694433, 943728, 2389036, 2196653, 3085571, 1513424, + 3689413, 3278747, 4197291, 3324063, 3651090, 1737936, 2768803, 2768889, 3108096, 4311775, + 3569480, 886705, 733256, 2477493, 1735412, 2960895, 1983781, 1861797, 3566460, 4537673, + 1164093, 3499764, 4553071, 3518985, 847658, 918948, 2922351, 1056144, 652895, 1013195, + 780505, 1702928, 3562838, 1432719, 2405207, 1054920, 641647, 2240939, 3617702, 383165, + 652641, 879593, 1810739, 2096385, 4497865, 4768530, 1743968, 3582014, 1025009, 3002122, + 2422190, 527647, 1251821, 2571153, 4095874, 3705333, 3637407, 1385567, 4043855, 4041930, + 2433139, 1710383, 1127734, 4362316, 711588, 817839, 3214775, 910077, 1313768, 2382229, + 16864, 2081770, 3095420, 3195272, 548711, 2259860, 1167323, 2435974, 425238, 2085179, + 2630042, 2632881, 2867923, 
3703565, 1037695, 226617, 4379130, 1541468, 3581937, 605965, + 1137674, 4655221, 4769963, 1394370, 4425315, 2990132, 2364485, 1561137, 2713384, 481509, + 2900382, 934766, 2986774, 1767669, 298593, 2502539, 139296, 3794229, 4002180, 4718138, + 2909238, 423691, 3023810, 2784924, 2760160, 1971980, 316683, 3828090, 3253691, 4839313, + 1203624, 584938, 3901482, 1747543, 1572737, 3533226, 774708, 1691195, 1037110, 1557763, + 225120, 4424243, 3524086, 1717663, 4332507, 3513592, 4274932, 1232118, 873498, 1416042, + 2488925, 111391, 4704545, 4492545, 445317, 1584812, 2187737, 2471948, 3731678, 219255, + 2282627, 2589971, 2372185, 4609096, 3673961, 2524410, 12823, 2437155, 3015974, 4188352, + 3184084, 3690756, 1222341, 1278376, 3652030, 4162647, 326548, 3930062, 3926100, 1551222, + 2722165, 4526695, 3997534, 4815513, 3139056, 2547644, 3028915, 4149092, 3656554, 2691582, + 2676699, 1878842, 260174, 3129900, 4379993, 182347, 2189338, 3783616, 2616666, 2596952, + 243007, 4179282, 2730, 1939894, 2332032, 3335636, 182332, 3112260, 2174584, 587481, + 4527368, 3154106, 3403059, 673206, 2150292, 446521, 1600204, 4819428, 2591357, 48490, + 2917012, 2285923, 1072926, 2824281, 4364250, 956033, 311938, 37251, 3729300, 2726300, + 644966, 1623020, 1419070, 4646747, 2417222, 2680238, 2561083, 1793801, 2349366, 339747, + 611366, 4684147, 4356907, 1277161, 4510381, 3218352, 4161658, 3200733, 1172372, 3997786, + 3169266, 3353418, 2248955, 2875885, 2365369, 498208, 2968066, 2681505, 2059048, 2097106, + 3607540, 1121504, 2016789, 1762605, 3138431, 866081, 3705757, 3833066, 2599788, 760816, + 4046672, 1544367, 2983906, 4842911, 209599, 1250954, 3333704, 561212, 4674336, 2831841, + 3690724, 2929360, 4830834, 1177524, 2487687, 3525137, 875283, 651241, 2110742, 1296646, + 1543739, 4349417, 2384725, 1931751, 1519208, 1520034, 3385008, 3219962, 734912, 170230, + 1741419, 729913, 2860117, 2362381, 1199807, 2424230, 177824, 125948, 2722701, 4687548, + 1140771, 3232742, 4522020, 4376360, 1125603, 590312, 2481884, 138951, 4086775, 615155, + 3395781, 4587272, 283209, 568470, 4296185, 4344150, 2454321, 2672602, 838828, 4051647, + 1709120, 3074610, 693235, 4356087, 3018806, 239410, 2431497, 691186, 766276, 4462126, + 859155, 2370304, 1571808, 1938673, 1694955, 3871296, 4245059, 3987376, 301524, 2512461, + 3410437, 3300380, 684922, 4581995, 3599557, 683515, 1850634, 3704678, 1937490, 2035591, + 3718533, 2065879, 3160765, 1467884, 1912241, 2501509, 3668572, 3390469, 2501150, 612319, + 713633, 1976262, 135946, 3641535, 632083, 13414, 4217765, 4137712, 2550250, 3281035, + 4179598, 961045, 2020694, 4380006, 1345936, 289162, 1359035, 770872, 4509911, 3947317, + 4719693, 248568, 2625660, 1237232, 2153208, 4814282, 1259954, 3677369, 861222, 2883506, + 3339149, 3998335, 491017, 1609022, 2648112, 742132, 649609, 4206953, 3131106, 3504814, + 3344486, 611721, 3215620, 2856233, 4447505, 1949222, 1868345, 712710, 6966, 4730666, + 3181872, 2972889, 3038521, 3525444, 4385208, 1845613, 1124187, 2030476, 4468651, 2478792, + 3473580, 3783357, 1852991, 1648485, 871319, 1670723, 4458328, 3218600, 1811100, 3443356, + 2233873, 3035207, 2548692, 3337891, 3773674, 1552957, 4782811, 3144712, 3523466, 1491315, + 3955852, 1838410, 3164028, 1092543, 776459, 2959379, 2541744, 4064418, 3908320, 2854145, + 3960709, 1348188, 977678, 853619, 1304291, 2848702, 1657913, 1319826, 3322665, 788037, + 2913686, 4471279, 1766285, 348304, 56570, 1892118, 4017244, 401006, 3524539, 4310134, + 1624693, 4081113, 957511, 849400, 129975, 2616130, 378537, 1556787, 3916162, 
1039980, + 4407778, 2027690, 4213675, 839863, 683134, 75805, 2493150, 4215796, 81587, 751845, + 1255588, 1947964, 1950470, 859401, 3077088, 3931110, 2316256, 1523761, 4527477, 4237511, + 1123513, 4209796, 3584772, 4250563, 2091754, 1618766, 2139944, 4525352, 382159, 2955887, + 41760, 2313998, 496912, 3791570, 3904792, 3613654, 873959, 127076, 2537797, 2458107, + 4543265, 3661909, 26828, 271816, 17854, 2461269, 1776042, 1573899, 3409957, 4335712, + 4534313, 3392751, 1230124, 2159031, 4444015, 3373087, 3848014, 2026600, 1382747, 3537242, + 4536743, 4714155, 3788371, 3570849, 173741, 211962, 4377778, 119369, 2856973, 2945854, + 1508054, 4503932, 3141566, 1842177, 3448683, 3384614, 2886508, 1573965, 990618, 3053734, + 2918742, 4508753, 1032149, 60943, 4291620, 722607, 2883224, 169359, 4356585, 3725543, + 3678729, 341673, 3592828, 4077251, 3382936, 3885685, 4630994, 1286698, 4449616, 1138430, + 3113385, 4660578, 2539973, 4562286, 4085089, 494737, 3967610, 2130702, 1823755, 1369324, + 3796951, 956299, 141730, 935144, 4381893, 4412545, 1382250, 3024476, 2364546, 3396164, + 3573511, 314081, 577688, 4154135, 1567018, 4047761, 2446220, 1148833, 4842497, 3967186, + 1175290, 3749667, 1209593, 3295627, 3169065, 2460328, 1838486, 1436923, 2843887, 3676426, + 2079145, 2975635, 535071, 4287509, 3281107, 39606, 3115500, 3204573, 722131, 3124073}, + 2, + false))); +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp index ff3b073cbc7..0340140d14b 100644 --- a/cpp/tests/experimental/pagerank_test.cpp +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -14,11 +14,14 @@ * limitations under the License. */ +#include #include #include +#include #include #include +#include #include #include @@ -35,6 +38,11 @@ #include #include +// do the perf measurements +// enabled by command line parameter s'--perf' +// +static int PERF = 0; + template void pagerank_reference(edge_t const* offsets, vertex_t const* indices, @@ -128,11 +136,15 @@ typedef struct PageRank_Usecase_t { double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -146,14 +158,47 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? 
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_PageRank : public ::testing::TestWithParam { public: Tests_PageRank() {} @@ -166,51 +211,26 @@ class Tests_PageRank : public ::testing::TestWithParam { template void run_current_test(PageRank_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; + HighResClock hr_clock{}; + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - false); - auto graph_view = graph.view(); - - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights{}; - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - if (graph_view.is_weighted()) { - h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "read_graph took " << elapsed_time * 1e-6 << " s.\n"; } - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + auto graph_view = graph.view(); std::vector h_personalization_vertices{}; std::vector h_personalization_values{}; @@ -260,25 +280,13 @@ class Tests_PageRank : public ::testing::TestWithParam { result_t constexpr alpha{0.85}; result_t constexpr epsilon{1e-6}; - std::vector 
h_reference_pageranks(graph_view.get_number_of_vertices()); - - pagerank_reference(h_offsets.data(), - h_indices.data(), - h_weights.size() > 0 ? h_weights.data() : static_cast(nullptr), - h_personalization_vertices.data(), - h_personalization_values.data(), - h_reference_pageranks.data(), - graph_view.get_number_of_vertices(), - static_cast(h_personalization_vertices.size()), - alpha, - epsilon, - std::numeric_limits::max(), - false); - rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::pagerank(handle, graph_view, @@ -286,35 +294,143 @@ class Tests_PageRank : public ::testing::TestWithParam { d_personalization_vertices.data(), d_personalization_values.data(), static_cast(d_personalization_vertices.size()), - d_pageranks.begin(), + d_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), false, false); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "PageRank took " << elapsed_time * 1e-6 << " s.\n"; + } - std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + if (unrenumbered_graph_view.is_weighted()) { + h_weights.assign(unrenumbered_graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + } - raft::update_host( - h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + std::vector h_unrenumbered_personalization_vertices( + d_personalization_vertices.size()); + std::vector h_unrenumbered_personalization_values( + h_unrenumbered_personalization_vertices.size()); + if (renumber) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + cugraph::experimental::unrenumber_local_int_vertices( + handle, + 
d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices()); + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); + + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_values.size(), + handle.get_stream()); + } else { + raft::update_host(h_unrenumbered_personalization_vertices.data(), + d_personalization_vertices.data(), + d_personalization_vertices.size(), + handle.get_stream()); + raft::update_host(h_unrenumbered_personalization_values.data(), + d_personalization_values.data(), + d_personalization_values.size(), + handle.get_stream()); + } - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for lowly ranked vertices (low PageRank values) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; + handle.get_stream_view().synchronize(); + + std::vector h_reference_pageranks(unrenumbered_graph_view.get_number_of_vertices()); + + pagerank_reference(h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? h_weights.data() : static_cast(nullptr), + h_unrenumbered_personalization_vertices.data(), + h_unrenumbered_personalization_values.data(), + h_reference_pageranks.data(), + unrenumbered_graph_view.get_number_of_vertices(), + static_cast(h_personalization_vertices.size()), + alpha, + epsilon, + std::numeric_limits::max(), + false); + + std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + if (renumber) { + auto d_unrenumbered_pageranks = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_pageranks.data(), d_renumber_map_labels.size()); + raft::update_host(h_cugraph_pageranks.data(), + d_unrenumbered_pageranks.data(), + d_unrenumbered_pageranks.size(), + handle.get_stream()); + } else { + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + } - ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), - h_reference_pageranks.end(), - h_cugraph_pageranks.begin(), - nearly_equal)) - << "PageRank values do not match with the reference values."; + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for lowly ranked vertices (low PageRank values) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), + h_reference_pageranks.end(), + h_cugraph_pageranks.begin(), + nearly_equal)) + << "PageRank values do not match with the reference values."; + } } }; @@ -328,6 +444,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_PageRank, ::testing::Values( + // enable correctness checks
PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -355,6 +472,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/random_walks_test.cu b/cpp/tests/experimental/random_walks_test.cu new file mode 100644 index 00000000000..9fb1716f62b --- /dev/null +++ b/cpp/tests/experimental/random_walks_test.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuda_profiler_api.h" +#include "gtest/gtest.h" + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include "random_walks_utils.cuh" + +#include +#include +#include +#include +#include +#include + +namespace { // anonym. 
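// fill_start() below seeds one walk per path slot, assigning start vertices
// round-robin over the vertex range. A host-side sketch of the equivalent fill
// (hypothetical h_start buffer; assumes num_vertices > 0):
//
//   std::vector<int32_t> h_start(num_paths);
//   for (int32_t i = 0; i < num_paths; ++i) {
//     h_start[i] = i % num_vertices;  // wraps around when num_paths > num_vertices
//   }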
+template +void fill_start(raft::handle_t const& handle, + rmm::device_uvector& d_start, + index_t num_vertices) +{ + index_t num_paths = d_start.size(); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths), + + d_start.begin(), + [num_vertices] __device__(auto indx) { return indx % num_vertices; }); +} +} // namespace + +struct RandomWalks_Usecase { + std::string graph_file_full_path{}; + bool test_weighted{false}; + + RandomWalks_Usecase(std::string const& graph_file_path, bool test_weighted) + : test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +class Tests_RandomWalks : public ::testing::TestWithParam { + public: + Tests_RandomWalks() {} + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(RandomWalks_Usecase const& configuration) + { + raft::handle_t handle{}; + + // debug info: + // + // std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl; + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted, false); + + auto graph_view = graph.view(); + + // call random_walks: + start_random_walks(graph_view); + } + + template + void start_random_walks(graph_vt const& graph_view) + { + using vertex_t = typename graph_vt::vertex_type; + using edge_t = typename graph_vt::edge_type; + using weight_t = typename graph_vt::weight_type; + + raft::handle_t handle{}; + edge_t num_paths = 10; + rmm::device_uvector d_start(num_paths, handle.get_stream()); + + vertex_t num_vertices = graph_view.get_number_of_vertices(); + fill_start(handle, d_start, num_vertices); + + // 0-copy const device view: + // + cugraph::experimental::detail::device_const_vector_view d_start_view{ + d_start.data(), num_paths}; + + edge_t max_depth{10}; + + auto ret_tuple = + cugraph::experimental::detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + // check results: + // + bool test_all_paths = cugraph::test::host_check_rw_paths( + handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple)); + + if (!test_all_paths) + std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n'; + + ASSERT_TRUE(test_all_paths); + } +}; + +TEST_P(Tests_RandomWalks, Initialize_i32_i32_f) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_RandomWalks, + ::testing::Values(RandomWalks_Usecase("test/datasets/karate.mtx", true), + RandomWalks_Usecase("test/datasets/web-Google.mtx", true), + RandomWalks_Usecase("test/datasets/ljournal-2008.mtx", true), + RandomWalks_Usecase("test/datasets/webbase-1M.mtx", true))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/random_walks_utils.cuh b/cpp/tests/experimental/random_walks_utils.cuh new file mode 100644 index 00000000000..863094dc310 --- /dev/null +++ b/cpp/tests/experimental/random_walks_utils.cuh @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +// utilities for testing / verification of Random Walks functionality: +// +namespace cugraph { +namespace test { + +template +using vector_test_t = cugraph::experimental::detail::device_vec_t; // for debug purposes + +// host-side utility to check if a sequence of vertices is connected: +// +template +bool host_check_path(std::vector const& row_offsets, + std::vector const& col_inds, + std::vector const& values, + typename std::vector::const_iterator v_path_begin, + typename std::vector::const_iterator v_path_end, + typename std::vector::const_iterator w_path_begin) +{ + bool assert1 = (row_offsets.size() > 0); + bool assert2 = (col_inds.size() == values.size()); + + vertex_t num_rows = row_offsets.size() - 1; + edge_t nnz = row_offsets.back(); + + bool assert3 = (nnz == static_cast(col_inds.size())); + if (assert1 == false || assert2 == false || assert3 == false) { + std::cout << "CSR inconsistency\n"; + return false; + } + + auto it_w = w_path_begin; + for (auto it_v = v_path_begin; it_v != v_path_end - 1; ++it_v, ++it_w) { + auto crt_vertex = *it_v; + auto next_vertex = *(it_v + 1); + + auto begin = col_inds.begin() + row_offsets[crt_vertex]; + auto end = col_inds.begin() + row_offsets[crt_vertex + 1]; + auto found_next = std::find_if( + begin, end, [next_vertex](auto dst_vertex) { return dst_vertex == next_vertex; }); + if (found_next == end) { + std::cout << "vertex not found: " << next_vertex << " as neighbor of " << crt_vertex << '\n'; + return false; + } + + auto delta = row_offsets[crt_vertex] + std::distance(begin, found_next); + + // std::cout << "delta in ci: " << delta << '\n'; + auto found_edge = values.begin() + delta; + if (*found_edge != *it_w) { + std::cout << "weight not found: " << *found_edge << " between " << crt_vertex << " and " + << next_vertex << '\n'; + return false; + } + } + return true; +} + +template +bool host_check_rw_paths( + raft::handle_t const& handle, + cugraph::experimental::graph_view_t const& graph_view, + vector_test_t const& d_coalesced_v, + vector_test_t const& d_coalesced_w, + vector_test_t const& d_sizes) +{ + edge_t num_edges = graph_view.get_number_of_edges(); + vertex_t num_vertices = graph_view.get_number_of_vertices(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_coalesced(d_coalesced_v.size()); + std::vector w_coalesced(d_coalesced_w.size()); + std::vector v_sizes(d_sizes.size()); +
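// worked example for host_check_path() above, using the toy CSR that the
// low-level tests below construct (row_offsets {0, 1, 3, 6, 7, 8, 8},
// col_inds {1, 3, 4, 0, 1, 3, 5, 5},
// values {0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}): a claimed step 1 -> 4
// scans col_inds[row_offsets[1]..row_offsets[2]) = {3, 4}; 4 is found one slot
// past begin, so delta = row_offsets[1] + 1 = 2 and the recorded step weight
// must equal values[2] = 2.1.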
raft::update_host(v_coalesced.data(), + cugraph::experimental::detail::raw_const_ptr(d_coalesced_v), + d_coalesced_v.size(), + handle.get_stream()); + raft::update_host(w_coalesced.data(), + cugraph::experimental::detail::raw_const_ptr(d_coalesced_w), + d_coalesced_w.size(), + handle.get_stream()); + raft::update_host(v_sizes.data(), + cugraph::experimental::detail::raw_const_ptr(d_sizes), + d_sizes.size(), + handle.get_stream()); + + auto it_v_begin = v_coalesced.begin(); + auto it_w_begin = w_coalesced.begin(); + for (auto&& crt_sz : v_sizes) { + auto it_v_end = it_v_begin + crt_sz; + + bool test_path = host_check_path(v_ro, v_ci, v_vals, it_v_begin, it_v_end, it_w_begin); + + it_v_begin = it_v_end; + it_w_begin += crt_sz - 1; + + if (!test_path) { // something went wrong; print to debug (since it's random) + raft::print_host_vector("sizes", v_sizes.data(), v_sizes.size(), std::cout); + + raft::print_host_vector("coalesced v", v_coalesced.data(), v_coalesced.size(), std::cout); + + raft::print_host_vector("coalesced w", w_coalesced.data(), w_coalesced.size(), std::cout); + + return false; + } + } + return true; +} + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/experimental/rw_low_level_test.cu new file mode 100644 index 00000000000..8b562bc41f6 --- /dev/null +++ b/cpp/tests/experimental/rw_low_level_test.cu @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "cuda_profiler_api.h" + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include "random_walks_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include + +using namespace cugraph::experimental; + +template +using vector_test_t = detail::device_vec_t; // for debug purposes + +namespace { // anonym. 
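// make_graph() below assembles a graph_t from host COO input: it copies the
// src/dst/weight vectors to device, wraps the device pointers in an
// edgelist_t, and invokes the graph_t constructor. A hypothetical call for a
// minimal two-vertex cycle (a sketch, not part of the patch; template
// arguments follow the <vertex_t, edge_t, weight_t> order used in the tests):
//
//   auto g = make_graph<int32_t, int32_t, float>(
//     handle, {0, 1}, {1, 0}, {0.5f, 0.5f}, 2, 2, true);  // 2 vertices, 2 weighted edges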
+ +template +graph_t make_graph(raft::handle_t const& handle, + std::vector const& v_src, + std::vector const& v_dst, + std::vector const& v_w, + vertex_t num_vertices, + edge_t num_edges, + bool is_weighted) +{ + vector_test_t d_src(num_edges, handle.get_stream()); + vector_test_t d_dst(num_edges, handle.get_stream()); + vector_test_t d_weights(num_edges, handle.get_stream()); + + raft::update_device(d_src.data(), v_src.data(), d_src.size(), handle.get_stream()); + raft::update_device(d_dst.data(), v_dst.data(), d_dst.size(), handle.get_stream()); + raft::update_device(d_weights.data(), v_w.data(), d_weights.size(), handle.get_stream()); + + edgelist_t edgelist{ + d_src.data(), d_dst.data(), d_weights.data(), num_edges}; + + graph_t graph( + handle, edgelist, num_vertices, graph_properties_t{false, false, is_weighted}, false); + + return graph; +} + +template +bool check_col_indices(raft::handle_t const& handle, + vector_test_t const& d_crt_out_degs, + vector_test_t const& d_col_indx, + index_t num_paths) +{ + bool all_indices_within_degs = thrust::all_of( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_paths), + [p_d_col_indx = detail::raw_const_ptr(d_col_indx), + p_d_crt_out_degs = detail::raw_const_ptr(d_crt_out_degs)] __device__(auto indx) { + if (p_d_crt_out_degs[indx] > 0) + return ((p_d_col_indx[indx] >= 0) && (p_d_col_indx[indx] < p_d_crt_out_degs[indx])); + else + return true; + }); + return all_indices_within_degs; +} + +} // namespace + +// FIXME (per rlratzel request): +// This test may be considered an e2e test +// which could be moved to a different test suite: +// +struct RandomWalksPrimsTest : public ::testing::Test { +}; + +TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vs(num_edges); + + raft::update_host(v_ro.data(), offsets, num_vertices + 1, handle.get_stream()); + raft::update_host(v_ci.data(), indices, num_edges, handle.get_stream()); + raft::update_host(v_vs.data(), values, num_edges, handle.get_stream()); + + std::vector v_ro_expected{0, 1, 3, 6, 7, 8, 8}; + std::vector v_ci_expected{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_vs_expected{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + EXPECT_EQ(v_ro, v_ro_expected); + EXPECT_EQ(v_ci, v_ci_expected); + EXPECT_EQ(v_vs, v_vs_expected); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + 
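// layout note: the coalesced buffers are row-major num_paths x max_depth, so
// path p owns v_coalesced[p * max_depth .. (p + 1) * max_depth) and unused
// slots keep the -1 sentinel; the weight buffer holds max_depth - 1 slots per
// path (one per traversed edge). With num_paths = 4 and max_depth = 3, the
// start state checked below, {1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1},
// places the four seeds at offsets 0, 3, 6 and 9.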
raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + std::vector v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1}; + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + EXPECT_EQ(v_coalesced, v_coalesced_exp); + + std::vector v_sizes{1, 1, 1, 1}; + std::vector v_sz_exp(num_paths); + raft::update_host(v_sz_exp.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + + EXPECT_EQ(v_sizes, v_sz_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + EXPECT_EQ(static_cast(num_vertices), d_out_degs.size()); + + std::vector v_out_degs(num_vertices); + raft::update_host( + v_out_degs.data(), raw_const_ptr(d_out_degs), num_vertices, handle.get_stream()); + + std::vector v_out_degs_exp{1, 2, 3, 1, 1, 0}; + EXPECT_EQ(v_out_degs, v_out_degs_exp); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + std::vector v_crt_out_degs(num_paths); + raft::update_host( + v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream()); + + std::vector v_crt_out_degs_exp{2, 1, 1, 3}; + EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t 
= vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + col_indx_extract_t col_extractor{handle, + graph_view, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_sizes), + num_paths, + max_depth}; + + // typically given by random engine: + // + std::vector v_col_indx{1, 0, 0, 2}; + vector_test_t d_col_indx(num_paths, handle.get_stream()); + + raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream()); + + vector_test_t d_next_v(num_paths, handle.get_stream()); + vector_test_t d_next_w(num_paths, handle.get_stream()); + + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + std::vector v_next_v(num_paths); + std::vector v_next_w(num_paths); + + raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream()); + raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream()); + + std::vector v_next_v_exp{4, 1, 5, 3}; + std::vector v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f}; + + EXPECT_EQ(v_next_v, v_next_v_exp); + EXPECT_EQ(v_next_w, v_next_w_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + using real_t = float; + using seed_t = long; + + using random_engine_t = rrandom_gen_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); 
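// recap of the extraction rule exercised by SimpleGraphColExtraction above:
// for a path currently at vertex v with sampled column k, the next hop is
// indices[offsets[v] + k] with weight values[offsets[v] + k]; e.g. the seed at
// vertex 1 with column 1 steps to indices[1 + 1] = 4 carrying values[2] = 2.1f,
// which is how v_next_v_exp{4, 1, 5, 3} and v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f}
// are obtained.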
+ weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + // random engine generated: + // + vector_test_t d_col_indx(num_paths, handle.get_stream()); + vector_test_t d_random(num_paths, handle.get_stream()); + + seed_t seed = static_cast(std::time(nullptr)); + random_engine_t rgen(handle, num_paths, d_random, d_crt_out_degs, seed); + rgen.generate_col_indices(d_col_indx); + + bool all_indices_within_degs = check_col_indices(handle, d_crt_out_degs, d_col_indx, num_paths); + + ASSERT_TRUE(all_indices_within_degs); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + using real_t = float; + using seed_t = long; + + using random_engine_t = rrandom_gen_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // Fixed set of 
out-degs, as opposed to having them generated by the algorithm. + // That's because I want to test a certain functionality in isolation + // + std::vector v_crt_out_degs{2, 0, 1, 0}; + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + raft::update_device( + d_crt_out_degs.data(), v_crt_out_degs.data(), d_crt_out_degs.size(), handle.get_stream()); + + rand_walker.update_path_sizes(d_crt_out_degs, d_sizes); + + std::vector v_sizes(num_paths); + raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + std::vector v_sizes_exp{2, 1, 2, 1}; + // i.e., entries with 0 in crt-out-degs don't get updated; + + EXPECT_EQ(v_sizes, v_sizes_exp); +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_coalesced(total_sz, -1); + std::vector w_coalesced(total_sz - num_paths, -1); + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_start(num_paths, handle.get_stream()); + + raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream()); + + vector_test_t d_sizes(num_paths, handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + auto const& d_out_degs = rand_walker.get_out_degs(); + + rand_walker.start(d_start, d_coalesced_v, d_sizes); + + // update crt_out_degs: + // + vector_test_t d_crt_out_degs(num_paths, handle.get_stream()); + rand_walker.gather_from_coalesced( + d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths); + + col_indx_extract_t col_extractor{handle, + graph_view, + raw_const_ptr(d_crt_out_degs), + raw_const_ptr(d_sizes), + num_paths, + max_depth}; + + // typically given by random engine: + // + std::vector v_col_indx{1, 0, 0, 2}; + vector_test_t d_col_indx(num_paths, handle.get_stream()); + + raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream()); + + vector_test_t d_next_v(num_paths, handle.get_stream()); + vector_test_t d_next_w(num_paths, handle.get_stream()); + + col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w); + + rand_walker.update_path_sizes(d_crt_out_degs, d_sizes); + + // check start(): + // + { + std::vector v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1}; + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + EXPECT_EQ(v_coalesced, v_coalesced_exp); + } + + // check crt_out_degs: + //
+ { + std::vector v_crt_out_degs(num_paths); + raft::update_host( + v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream()); + std::vector v_crt_out_degs_exp{2, 1, 1, 3}; + EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp); + } + + // check paths sizes update: + // + { + std::vector v_sizes(num_paths); + raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream()); + std::vector v_sizes_exp{2, 2, 2, 2}; + // i.e., corresponding 0-entries in crt-out-degs, don't get updated; + EXPECT_EQ(v_sizes, v_sizes_exp); + } + + // check next step: + // + { + std::vector v_next_v(num_paths); + std::vector v_next_w(num_paths); + + raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream()); + raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream()); + + std::vector v_next_v_exp{4, 1, 5, 3}; + std::vector v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f}; + + EXPECT_EQ(v_next_v, v_next_v_exp); + EXPECT_EQ(v_next_w, v_next_w_exp); + } + + rand_walker.scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_sizes); + rand_walker.scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_sizes); + + // check vertex/weight scatter: + // + { + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream()); + raft::update_host( + w_coalesced.data(), raw_const_ptr(d_coalesced_w), total_sz - num_paths, handle.get_stream()); + + std::vector v_coalesced_exp{1, 4, -1, 0, 1, -1, 4, 5, -1, 2, 3, -1}; + std::vector w_coalesced_exp{2.1, -1, 0.1, -1, 7.1, -1, 5.1, -1}; + + EXPECT_EQ(v_coalesced, v_coalesced_exp); + EXPECT_EQ(w_coalesced, w_coalesced_exp); + } +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment) +{ + using namespace cugraph::experimental::detail; + + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + index_t num_paths = 4; + index_t max_depth = 3; + index_t total_sz = num_paths * max_depth; + + std::vector v_sizes{1, 2, 2, 1}; + vector_test_t d_sizes(num_paths, handle.get_stream()); + raft::update_device(d_sizes.data(), v_sizes.data(), d_sizes.size(), handle.get_stream()); + + std::vector v_coalesced(total_sz, -1); + v_coalesced[0] = 3; + v_coalesced[max_depth] = 5; + v_coalesced[max_depth + 1] = 2; + v_coalesced[2 * max_depth] = 4; + v_coalesced[2 * max_depth + 1] = 0; + v_coalesced[3 * max_depth] = 1; + + std::vector w_coalesced(total_sz - num_paths, -1); + w_coalesced[max_depth - 1] = 10.1; + w_coalesced[2 * max_depth - 2] = 11.2; + + vector_test_t d_coalesced_v(total_sz, handle.get_stream()); + vector_test_t d_coalesced_w(total_sz - num_paths, handle.get_stream()); + + raft::update_device( + d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream()); + raft::update_device( + d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream()); + + random_walker_t rand_walker{handle, graph_view, num_paths, max_depth}; + + 
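// stop() defragments the coalesced buffers: each path is compacted to its
// actual size and the -1 padding is dropped. With sizes {1, 2, 2, 1}, the
// 12-slot vertex buffer {3, -1, -1, 5, 2, -1, 4, 0, -1, 1, -1, -1} should
// compact to {3, 5, 2, 4, 0, 1}, and the weight buffer keeps one entry per
// two-vertex path: {10.1, 11.2}, exactly the expectations asserted below.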
rand_walker.stop(d_coalesced_v, d_coalesced_w, d_sizes); + + // check vertex/weight defragment: + // + { + v_coalesced.resize(d_coalesced_v.size()); + w_coalesced.resize(d_coalesced_w.size()); + + raft::update_host( + v_coalesced.data(), raw_const_ptr(d_coalesced_v), d_coalesced_v.size(), handle.get_stream()); + raft::update_host( + w_coalesced.data(), raw_const_ptr(d_coalesced_w), d_coalesced_w.size(), handle.get_stream()); + + std::vector v_coalesced_exp{3, 5, 2, 4, 0, 1}; + std::vector w_coalesced_exp{10.1, 11.2}; + + EXPECT_EQ(v_coalesced, v_coalesced_exp); + EXPECT_EQ(w_coalesced, w_coalesced_exp); + } +} + +TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk) +{ + using vertex_t = int32_t; + using edge_t = vertex_t; + using weight_t = float; + using index_t = vertex_t; + + raft::handle_t handle{}; + + edge_t num_edges = 8; + vertex_t num_vertices = 6; + + std::vector v_src{0, 1, 1, 2, 2, 2, 3, 4}; + std::vector v_dst{1, 3, 4, 0, 1, 3, 5, 5}; + std::vector v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1}; + + auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true); + + auto graph_view = graph.view(); + + edge_t const* offsets = graph_view.offsets(); + vertex_t const* indices = graph_view.indices(); + weight_t const* values = graph_view.weights(); + + std::vector v_ro(num_vertices + 1); + std::vector v_ci(num_edges); + std::vector v_vals(num_edges); + + raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream()); + raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream()); + raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream()); + + std::vector v_start{1, 0, 4, 2}; + vector_test_t d_v_start(v_start.size(), handle.get_stream()); + raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream()); + + index_t num_paths = v_start.size(); + index_t max_depth = 5; + + // 0-copy const device view: + // + detail::device_const_vector_view d_start_view{d_v_start.data(), num_paths}; + auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth); + + auto& d_coalesced_v = std::get<0>(quad); + auto& d_coalesced_w = std::get<1>(quad); + auto& d_sizes = std::get<2>(quad); + auto seed0 = std::get<3>(quad); + + bool test_all_paths = + cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes); + + if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n'; + + ASSERT_TRUE(test_all_paths); +} diff --git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp index 611abcb0d75..e8ab3ec5426 100644 --- a/cpp/tests/experimental/sssp_test.cpp +++ b/cpp/tests/experimental/sssp_test.cpp @@ -14,11 +14,14 @@ * limitations under the License. 
*/ +#include #include #include +#include #include #include +#include #include #include @@ -28,12 +31,18 @@ #include +#include #include #include #include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + // Dijkstra's algorithm template void sssp_reference(edge_t const* offsets, @@ -80,9 +89,12 @@ void sssp_reference(edge_t const* offsets, typedef struct SSSP_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; - size_t source{false}; - SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + size_t source{0}; + bool check_correctness{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source, bool check_correctness = true) + : source(source), check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -94,13 +106,43 @@ typedef struct SSSP_Usecase_t { input_graph_specifier.graph_file_full_path = graph_file_full_path; }; - SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, size_t source) : source(source) + SSSP_Usecase_t(cugraph::test::rmat_params_t rmat_params, + size_t source, + bool check_correctness = true) + : source(source), check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } SSSP_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, SSSP_Usecase const& configuration, bool renumber) +{ + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ? cugraph::test:: + read_graph_from_matrix_market_file( + handle, configuration.input_graph_specifier.graph_file_full_path, true, renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + true, + renumber, + std::vector{0}, + size_t{1}); +} + class Tests_SSSP : public ::testing::TestWithParam { public: Tests_SSSP() {} @@ -113,118 +155,176 @@ class Tests_SSSP : public ::testing::TestWithParam { template void run_current_test(SSSP_Usecase const& configuration) { + constexpr bool renumber = true; + raft::handle_t handle{}; + HighResClock hr_clock{}; + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ?
cugraph::test:: - read_graph_from_matrix_market_file( - handle, configuration.input_graph_specifier.graph_file_full_path, true, false) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - true, - false); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + std::tie(graph, d_renumber_map_labels) = + read_graph(handle, configuration, renumber); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "read_graph took " << elapsed_time * 1e-6 << " s.\n"; + } auto graph_view = graph.view(); - std::vector h_offsets(graph_view.get_number_of_vertices() + 1); - std::vector h_indices(graph_view.get_number_of_edges()); - std::vector h_weights(graph_view.get_number_of_edges()); - raft::update_host(h_offsets.data(), - graph_view.offsets(), - graph_view.get_number_of_vertices() + 1, - handle.get_stream()); - raft::update_host(h_indices.data(), - graph_view.indices(), - graph_view.get_number_of_edges(), - handle.get_stream()); - raft::update_host(h_weights.data(), - graph_view.weights(), - graph_view.get_number_of_edges(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - ASSERT_TRUE(configuration.source >= 0 && - configuration.source <= graph_view.get_number_of_vertices()) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph."; - - std::vector h_reference_distances(graph_view.get_number_of_vertices()); - std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); - - sssp_reference(h_offsets.data(), - h_indices.data(), - h_weights.data(), - h_reference_distances.data(), - h_reference_predecessors.data(), - graph_view.get_number_of_vertices(), - static_cast(configuration.source)); + ASSERT_TRUE(static_cast(configuration.source) >= 0 && + static_cast(configuration.source) < graph_view.get_number_of_vertices()); rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), handle.get_stream()); rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::sssp(handle, graph_view, - d_distances.begin(), - d_predecessors.begin(), + d_distances.data(), + d_predecessors.data(), static_cast(configuration.source), std::numeric_limits::max(), false); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); - std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); - - raft::update_host( - h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); - raft::update_host(h_cugraph_predecessors.data(), - d_predecessors.data(), - d_predecessors.size(), - handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - auto 
max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); - auto epsilon = *max_weight_element * weight_t{1e-6}; - auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; - - ASSERT_TRUE(std::equal(h_reference_distances.begin(), - h_reference_distances.end(), - h_cugraph_distances.begin(), - nearly_equal)) - << "distances do not match with the reference values."; - - for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { - auto i = std::distance(h_cugraph_predecessors.begin(), it); - if (*it == cugraph::invalid_vertex_id::value) { - ASSERT_TRUE(h_reference_predecessors[i] == *it) - << "vertex reachability do not match with the reference."; + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "SSSP took " << elapsed_time * 1e-6 << " s.\n"; + } + + if (configuration.check_correctness) { + cugraph::experimental::graph_t unrenumbered_graph( + handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore) = + read_graph(handle, configuration, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets(unrenumbered_graph_view.get_number_of_vertices() + 1); + std::vector h_indices(unrenumbered_graph_view.get_number_of_edges()); + std::vector h_weights(unrenumbered_graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + unrenumbered_graph_view.offsets(), + unrenumbered_graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + unrenumbered_graph_view.indices(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + unrenumbered_graph_view.weights(), + unrenumbered_graph_view.get_number_of_edges(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto unrenumbered_source = static_cast(configuration.source); + if (renumber) { + std::vector h_renumber_map_labels(d_renumber_map_labels.size()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + unrenumbered_source = h_renumber_map_labels[configuration.source]; + } + + std::vector h_reference_distances(unrenumbered_graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors( + unrenumbered_graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + unrenumbered_graph_view.get_number_of_vertices(), + unrenumbered_source, + std::numeric_limits::max()); + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + if (renumber) { + cugraph::experimental::unrenumber_local_int_vertices(handle, + d_predecessors.data(), + d_predecessors.size(), + d_renumber_map_labels.data(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + true); + + auto d_unrenumbered_distances = cugraph::test::sort_by_key( + handle, d_renumber_map_labels.data(), d_distances.data(), d_renumber_map_labels.size()); + auto d_unrenumbered_predecessors = cugraph::test::sort_by_key(handle, + d_renumber_map_labels.data(), + d_predecessors.data(), + d_renumber_map_labels.size()); + + 
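// the unrenumber + sort_by_key pattern used here: d_renumber_map_labels[i]
// holds the original (unrenumbered) id of internal vertex i, so
// unrenumber_local_int_vertices() rewrites predecessor ids back to the
// original space, and sorting the (label, value) pairs by label reorders the
// distance/predecessor arrays into original-vertex order so they can be
// compared element-wise against the reference computed on the unrenumbered
// graph.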
raft::update_host(h_cugraph_distances.data(), + d_unrenumbered_distances.data(), + d_unrenumbered_distances.size(), + handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_unrenumbered_predecessors.data(), + d_unrenumbered_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); } else { - auto pred_distance = h_reference_distances[*it]; - bool found{false}; - for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { - if (h_indices[j] == i) { - if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { - found = true; - break; + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + } + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability does not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } } } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; } - ASSERT_TRUE(found) - << "no edge from the predecessor vertex to this vertex with the matching weight."; } } } @@ -237,9 +337,14 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_SSSP, ::testing::Values( + // enable correctness checks SSSP_Usecase("test/datasets/karate.mtx", 0), SSSP_Usecase("test/datasets/dblp.mtx", 0), SSSP_Usecase("test/datasets/wiki2003.mtx", 1000), - SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0))); + SSSP_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0), + // disable correctness checks for large graphs + SSSP_Usecase(cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, + 0, + false))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/streams.cu b/cpp/tests/experimental/streams.cu new file mode 100644 index 00000000000..c89ffe1e532 --- /dev/null +++ b/cpp/tests/experimental/streams.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +struct StreamTest : public ::testing::Test { +}; +TEST_F(StreamTest, basic_test) +{ + int n_streams = 4; + raft::handle_t handle(n_streams); + + const size_t input_size = 4096; + +#pragma omp parallel for + for (int i = 0; i < n_streams; i++) { + rmm::device_uvector u(input_size, handle.get_internal_stream_view(i)), + v(input_size, handle.get_internal_stream_view(i)); + thrust::transform(rmm::exec_policy(handle.get_internal_stream_view(i)), + u.begin(), + u.end(), + v.begin(), + v.begin(), + 2 * thrust::placeholders::_1 + thrust::placeholders::_2); + } +} \ No newline at end of file diff --git a/cpp/tests/pagerank/mg_pagerank_test.cpp b/cpp/tests/pagerank/mg_pagerank_test.cpp index 85ee9a4243e..bbc80a60a3d 100644 --- a/cpp/tests/pagerank/mg_pagerank_test.cpp +++ b/cpp/tests/pagerank/mg_pagerank_test.cpp @@ -14,30 +14,46 @@ * limitations under the License. */ +#include #include #include +#include #include +#include +#include +#include #include #include #include #include +#include +#include #include #include +// do the perf measurements +// enabled by command line parameter '--perf' +// +static int PERF = 0; + typedef struct PageRank_Usecase_t { cugraph::test::input_graph_specifier_t input_graph_specifier{}; double personalization_ratio{0.0}; bool test_weighted{false}; + bool check_correctness{false}; PageRank_Usecase_t(std::string const& graph_file_path, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { std::string graph_file_full_path{}; if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -51,14 +67,56 @@ typedef struct PageRank_Usecase_t { PageRank_Usecase_t(cugraph::test::rmat_params_t rmat_params, double personalization_ratio, - bool test_weighted) - : personalization_ratio(personalization_ratio), test_weighted(test_weighted) + bool test_weighted, + bool check_correctness = true) + : personalization_ratio(personalization_ratio), + test_weighted(test_weighted), + check_correctness(check_correctness) { input_graph_specifier.tag = cugraph::test::input_graph_specifier_t::RMAT_PARAMS; input_graph_specifier.rmat_params = rmat_params; } } PageRank_Usecase; +template +std::tuple, + rmm::device_uvector> +read_graph(raft::handle_t const& handle, PageRank_Usecase const& configuration, bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector partition_ids(multi_gpu ? size_t{1} : static_cast(comm_size)); + std::iota(partition_ids.begin(), + partition_ids.end(), + multi_gpu ? static_cast(comm_rank) : size_t{0}); + + return configuration.input_graph_specifier.tag == + cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH + ?
cugraph::test:: + read_graph_from_matrix_market_file( + handle, + configuration.input_graph_specifier.graph_file_full_path, + configuration.test_weighted, + renumber) + : cugraph::test:: + generate_graph_from_rmat_params( + handle, + configuration.input_graph_specifier.rmat_params.scale, + configuration.input_graph_specifier.rmat_params.edge_factor, + configuration.input_graph_specifier.rmat_params.a, + configuration.input_graph_specifier.rmat_params.b, + configuration.input_graph_specifier.rmat_params.c, + configuration.input_graph_specifier.rmat_params.seed, + configuration.input_graph_specifier.rmat_params.undirected, + configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, + configuration.test_weighted, + renumber, + partition_ids, + static_cast(comm_size)); +} + class Tests_MGPageRank : public ::testing::TestWithParam { public: Tests_MGPageRank() {} @@ -68,13 +126,14 @@ class Tests_MGPageRank : public ::testing::TestWithParam { virtual void SetUp() {} virtual void TearDown() {} - // Compare the results of running pagerank on multiple GPUs to that of a single-GPU run + // Compare the results of running PageRank on multiple GPUs to that of a single-GPU run template void run_current_test(PageRank_Usecase const& configuration) { // 1. initialize handle raft::handle_t handle{}; + HighResClock hr_clock{}; raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); auto& comm = handle.get_comms(); @@ -86,168 +145,50 @@ class Tests_MGPageRank : public ::testing::TestWithParam { cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - // 2. create SG & MG graphs - - cugraph::experimental::graph_t sg_graph(handle); - rmm::device_uvector d_sg_renumber_map_labels(0, handle.get_stream()); - std::tie(sg_graph, d_sg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? cugraph::test:: - read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); - - auto sg_graph_view = sg_graph.view(); + // 2. create MG graph + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::graph_t mg_graph(handle); rmm::device_uvector d_mg_renumber_map_labels(0, handle.get_stream()); std::tie(mg_graph, d_mg_renumber_map_labels) = - configuration.input_graph_specifier.tag == - cugraph::test::input_graph_specifier_t::MATRIX_MARKET_FILE_PATH - ? 
cugraph::test::read_graph_from_matrix_market_file( - handle, - configuration.input_graph_specifier.graph_file_full_path, - configuration.test_weighted, - true) - : cugraph::test::generate_graph_from_rmat_params( - handle, - configuration.input_graph_specifier.rmat_params.scale, - configuration.input_graph_specifier.rmat_params.edge_factor, - configuration.input_graph_specifier.rmat_params.a, - configuration.input_graph_specifier.rmat_params.b, - configuration.input_graph_specifier.rmat_params.c, - configuration.input_graph_specifier.rmat_params.seed, - configuration.input_graph_specifier.rmat_params.undirected, - configuration.input_graph_specifier.rmat_params.scramble_vertex_ids, - configuration.test_weighted, - true); + read_graph(handle, configuration, true); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG read_graph took " << elapsed_time * 1e-6 << " s.\n"; + } auto mg_graph_view = mg_graph.view(); - std::vector h_sg_renumber_map_labels(d_sg_renumber_map_labels.size()); - raft::update_host(h_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.data(), - d_sg_renumber_map_labels.size(), - handle.get_stream()); - - std::vector h_mg_renumber_map_labels(mg_graph_view.get_number_of_local_vertices()); - raft::update_host(h_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.data(), - d_mg_renumber_map_labels.size(), - handle.get_stream()); + // 3. generate personalization vertex/value pairs - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 2. generate personalization vertex/value pairs - - std::vector h_personalization_vertices{}; - std::vector h_personalization_values{}; + std::vector h_mg_personalization_vertices{}; + std::vector h_mg_personalization_values{}; if (configuration.personalization_ratio > 0.0) { - std::default_random_engine generator{}; + std::default_random_engine generator{ + static_cast(comm.get_rank()) /* seed */}; std::uniform_real_distribution distribution{0.0, 1.0}; - h_personalization_vertices.resize(sg_graph_view.get_number_of_vertices()); - std::iota(h_personalization_vertices.begin(), h_personalization_vertices.end(), vertex_t{0}); - h_personalization_vertices.erase( - std::remove_if(h_personalization_vertices.begin(), - h_personalization_vertices.end(), + h_mg_personalization_vertices.resize(mg_graph_view.get_number_of_local_vertices()); + std::iota(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), + mg_graph_view.get_local_vertex_first()); + h_mg_personalization_vertices.erase( + std::remove_if(h_mg_personalization_vertices.begin(), + h_mg_personalization_vertices.end(), [&generator, &distribution, configuration](auto v) { return distribution(generator) >= configuration.personalization_ratio; }), - h_personalization_vertices.end()); - h_personalization_values.resize(h_personalization_vertices.size()); - std::for_each(h_personalization_values.begin(), - h_personalization_values.end(), + h_mg_personalization_vertices.end()); + h_mg_personalization_values.resize(h_mg_personalization_vertices.size()); + std::for_each(h_mg_personalization_values.begin(), + h_mg_personalization_values.end(), [&distribution, &generator](auto& val) { val = distribution(generator); }); } - result_t constexpr alpha{0.85}; - result_t constexpr epsilon{1e-6}; - - // 3. 
run SG pagerank - - std::vector h_sg_personalization_vertices{}; - std::vector h_sg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < sg_graph_view.get_number_of_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_sg_renumber_map_labels[i]); - if (*it == h_sg_renumber_map_labels[i]) { - h_sg_personalization_vertices.push_back(i); - h_sg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - - rmm::device_uvector d_sg_personalization_vertices( - h_sg_personalization_vertices.size(), handle.get_stream()); - rmm::device_uvector d_sg_personalization_values(d_sg_personalization_vertices.size(), - handle.get_stream()); - if (d_sg_personalization_vertices.size() > 0) { - raft::update_device(d_sg_personalization_vertices.data(), - h_sg_personalization_vertices.data(), - h_sg_personalization_vertices.size(), - handle.get_stream()); - raft::update_device(d_sg_personalization_values.data(), - h_sg_personalization_values.data(), - h_sg_personalization_values.size(), - handle.get_stream()); - } - - rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), - handle.get_stream()); - - cugraph::experimental::pagerank(handle, - sg_graph_view, - static_cast(nullptr), - d_sg_personalization_vertices.data(), - d_sg_personalization_values.data(), - static_cast(d_sg_personalization_vertices.size()), - d_sg_pageranks.begin(), - alpha, - epsilon, - std::numeric_limits::max(), // max_iterations - false, - false); - - std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); - raft::update_host( - h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - - // 4. run MG pagerank - - std::vector h_mg_personalization_vertices{}; - std::vector h_mg_personalization_values{}; - if (h_personalization_vertices.size() > 0) { - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto it = std::lower_bound(h_personalization_vertices.begin(), - h_personalization_vertices.end(), - h_mg_renumber_map_labels[i]); - if (*it == h_mg_renumber_map_labels[i]) { - h_mg_personalization_vertices.push_back(mg_graph_view.get_local_vertex_first() + i); - h_mg_personalization_values.push_back( - h_personalization_values[std::distance(h_personalization_vertices.begin(), it)]); - } - } - } - rmm::device_uvector d_mg_personalization_vertices( h_mg_personalization_vertices.size(), handle.get_stream()); rmm::device_uvector d_mg_personalization_values(d_mg_personalization_vertices.size(), @@ -263,10 +204,18 @@ class Tests_MGPageRank : public ::testing::TestWithParam { handle.get_stream()); } + // 4. 
run MG PageRank + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + rmm::device_uvector d_mg_pageranks(mg_graph_view.get_number_of_local_vertices(), handle.get_stream()); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_clock.start(); + } cugraph::experimental::pagerank(handle, mg_graph_view, @@ -274,44 +223,150 @@ class Tests_MGPageRank : public ::testing::TestWithParam { d_mg_personalization_vertices.data(), d_mg_personalization_values.data(), static_cast(d_mg_personalization_vertices.size()), - d_mg_pageranks.begin(), + d_mg_pageranks.data(), alpha, epsilon, std::numeric_limits::max(), - false, false); - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); - raft::update_host( - h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + if (PERF) { + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + double elapsed_time{0.0}; + hr_clock.stop(&elapsed_time); + std::cout << "MG PageRank took " << elapsed_time * 1e-6 << " s.\n"; + } // 5. compare SG & MG results - std::vector h_sg_shuffled_pageranks(sg_graph_view.get_number_of_vertices(), - result_t{0.0}); - for (size_t i = 0; i < h_sg_pageranks.size(); ++i) { - h_sg_shuffled_pageranks[h_sg_renumber_map_labels[i]] = h_sg_pageranks[i]; - } + if (configuration.check_correctness) { + // 5-1. create SG graph + + cugraph::experimental::graph_t sg_graph(handle); + std::tie(sg_graph, std::ignore) = + read_graph(handle, configuration, false); + + auto sg_graph_view = sg_graph.view(); + + // 5-2.
collect personalization vertex/value pairs + + rmm::device_uvector d_sg_personalization_vertices(0, handle.get_stream()); + rmm::device_uvector d_sg_personalization_values(0, handle.get_stream()); + if (configuration.personalization_ratio > 0.0) { + rmm::device_uvector d_unrenumbered_personalization_vertices( + d_mg_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_personalization_values( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_vertices.data(), + d_mg_personalization_vertices.data(), + d_mg_personalization_vertices.size(), + handle.get_stream()); + raft::copy_async(d_unrenumbered_personalization_values.data(), + d_mg_personalization_values.data(), + d_mg_personalization_values.size(), + handle.get_stream()); + + std::vector vertex_partition_lasts(comm_size); + for (size_t i = 0; i < vertex_partition_lasts.size(); ++i) { + vertex_partition_lasts[i] = mg_graph_view.get_vertex_partition_last(i); + } + cugraph::experimental::unrenumber_int_vertices( + handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_vertices.size(), + d_mg_renumber_map_labels.data(), + mg_graph_view.get_local_vertex_first(), + mg_graph_view.get_local_vertex_last(), + vertex_partition_lasts, + handle.get_stream()); + + rmm::device_scalar d_local_personalization_vector_size( + d_unrenumbered_personalization_vertices.size(), handle.get_stream()); + rmm::device_uvector d_recvcounts(comm_size, handle.get_stream()); + comm.allgather( + d_local_personalization_vector_size.data(), d_recvcounts.data(), 1, handle.get_stream()); + std::vector recvcounts(d_recvcounts.size()); + raft::update_host( + recvcounts.data(), d_recvcounts.data(), d_recvcounts.size(), handle.get_stream()); + auto status = comm.sync_stream(handle.get_stream()); + ASSERT_EQ(status, raft::comms::status_t::SUCCESS); + + std::vector displacements(recvcounts.size(), size_t{0}); + std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + + d_sg_personalization_vertices.resize(displacements.back() + recvcounts.back(), + handle.get_stream()); + d_sg_personalization_values.resize(d_sg_personalization_vertices.size(), + handle.get_stream()); + + comm.allgatherv(d_unrenumbered_personalization_vertices.data(), + d_sg_personalization_vertices.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + comm.allgatherv(d_unrenumbered_personalization_values.data(), + d_sg_personalization_values.data(), + recvcounts.data(), + displacements.data(), + handle.get_stream()); + + cugraph::test::sort_by_key(handle, + d_unrenumbered_personalization_vertices.data(), + d_unrenumbered_personalization_values.data(), + d_unrenumbered_personalization_vertices.size()); + } + + // 5-3. run SG PageRank + + rmm::device_uvector d_sg_pageranks(sg_graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::pagerank(handle, + sg_graph_view, + static_cast(nullptr), + d_sg_personalization_vertices.data(), + d_sg_personalization_values.data(), + static_cast(d_sg_personalization_vertices.size()), + d_sg_pageranks.data(), + alpha, + epsilon, + std::numeric_limits::max(), // max_iterations + false); + + // 5-4. 
compare + + std::vector h_sg_pageranks(sg_graph_view.get_number_of_vertices()); + raft::update_host( + h_sg_pageranks.data(), d_sg_pageranks.data(), d_sg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_pageranks(mg_graph_view.get_number_of_local_vertices()); + raft::update_host( + h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), handle.get_stream()); + + std::vector h_mg_renumber_map_labels(d_mg_renumber_map_labels.size()); + raft::update_host(h_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.data(), + d_mg_renumber_map_labels.size(), + handle.get_stream()); + + handle.get_stream_view().synchronize(); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low PageRank vertices (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { - auto mapped_vertex = h_mg_renumber_map_labels[i]; - ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_shuffled_pageranks[mapped_vertex])) - << "MG PageRank value for vertex: " << i << " in rank: " << comm_rank - << " has value: " << h_mg_pageranks[i] - << " which exceeds the error margin for comparing to SG value: " - << h_sg_shuffled_pageranks[mapped_vertex]; + for (vertex_t i = 0; i < mg_graph_view.get_number_of_local_vertices(); ++i) { + auto mapped_vertex = h_mg_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) + << "MG PageRank value for vertex: " << mapped_vertex << " in rank: " << comm_rank + << " has value: " << h_mg_pageranks[i] + << " which exceeds the error margin for comparing to SG value: " + << h_sg_pageranks[mapped_vertex]; + } } } }; @@ -325,6 +380,7 @@ INSTANTIATE_TEST_CASE_P( simple_test, Tests_MGPageRank, ::testing::Values( + // enable correctness checks PageRank_Usecase("test/datasets/karate.mtx", 0.0, false), PageRank_Usecase("test/datasets/karate.mtx", 0.5, false), PageRank_Usecase("test/datasets/karate.mtx", 0.0, true), @@ -352,6 +408,15 @@ INSTANTIATE_TEST_CASE_P( true), PageRank_Usecase(cugraph::test::rmat_params_t{10, 16, 0.57, 0.19, 0.19, 0, false, false}, 0.5, - true))); + true), + // disable correctness checks for large graphs + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, false, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.0, true, false), + PageRank_Usecase( + cugraph::test::rmat_params_t{20, 32, 0.57, 0.19, 0.19, 0, false, false}, 0.5, true, false))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/tsp_test.cu b/cpp/tests/traversal/tsp_test.cu index 383427a56cf..d4e9ff90f35 100644 --- a/cpp/tests/traversal/tsp_test.cu +++ b/cpp/tests/traversal/tsp_test.cu @@
-132,11 +132,12 @@ class Tests_Tsp : public ::testing::TestWithParam { int nodes = load_tsp(param.tsp_file.c_str(), &input); // Device alloc - raft::handle_t handle; - rmm::device_uvector vertices(static_cast(nodes), nullptr); - rmm::device_uvector route(static_cast(nodes), nullptr); - rmm::device_uvector x_pos(static_cast(nodes), nullptr); - rmm::device_uvector y_pos(static_cast(nodes), nullptr); + raft::handle_t const handle; + auto stream = handle.get_stream(); + rmm::device_uvector vertices(static_cast(nodes), stream); + rmm::device_uvector route(static_cast(nodes), stream); + rmm::device_uvector x_pos(static_cast(nodes), stream); + rmm::device_uvector y_pos(static_cast(nodes), stream); int* vtx_ptr = vertices.data(); int* d_route = route.data(); diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index e8f11acfbf4..79a86e1fc95 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -160,6 +160,11 @@ inline auto parse_test_options(int argc, char **argv) auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + \ + if (comm_rank != 0) { \ + auto &listeners = ::testing::UnitTest::GetInstance()->listeners(); \ + delete listeners.Release(listeners.default_result_printer()); \ + } \ rmm::mr::set_current_device_resource(resource.get()); \ auto ret = RUN_ALL_TESTS(); \ MPI_TRY(MPI_Finalize()); \ diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..5f41e0e5ce0 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -28,7 +29,7 @@ namespace cugraph { namespace test { -namespace detail { +namespace { template , rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); @@ -59,96 +60,86 @@ generate_graph_from_edgelist(raft::handle_t const& handle, auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); auto const col_comm_size = col_comm.get_size(); - vertex_t number_of_vertices = static_cast(vertices.size()); - - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; - vertices.resize(thrust::distance(vertices.begin(), - thrust::remove_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertices.begin(), - vertices.end(), - [comm_rank, key_func = vertex_key_func] __device__(auto val) { - return key_func(val) != comm_rank; - })), - handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); - - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; - size_t number_of_local_edges{}; - if (test_weighted) { - auto edge_first = 
thrust::make_zip_iterator( - thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin(), edgelist_weights.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); - number_of_local_edges = thrust::distance( - edge_first, - thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - edge_first, - edge_first + edgelist_rows.size(), - [comm_rank, key_func = edge_key_func] __device__(auto e) { - auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); - auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); - return key_func(major, minor) != comm_rank; - })); - } - - edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); - } + auto local_partition_id_op = + [comm_size, + key_func = cugraph::experimental::detail::compute_partition_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto pair) { + return key_func(thrust::get<0>(pair), thrust::get<1>(pair)) / + comm_size; // global partition id to local partition id + }; + auto pair_first = + store_transposed + ? thrust::make_zip_iterator(thrust::make_tuple(edgelist_cols.begin(), edgelist_rows.begin())) + : thrust::make_zip_iterator(thrust::make_tuple(edgelist_rows.begin(), edgelist_cols.begin())); + auto edge_counts = test_weighted + ? cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + edgelist_weights.begin(), + local_partition_id_op, + col_comm_size, + handle.get_stream()) + : cugraph::experimental::groupby_and_count(pair_first, + pair_first + edgelist_rows.size(), + local_partition_id_op, + col_comm_size, + handle.get_stream()); + + std::vector h_edge_counts(edge_counts.size()); + raft::update_host( + h_edge_counts.data(), edge_counts.data(), edge_counts.size(), handle.get_stream()); + handle.get_stream_view().synchronize(); + + std::vector h_displacements(h_edge_counts.size(), size_t{0}); + std::partial_sum(h_edge_counts.begin(), h_edge_counts.end() - 1, h_displacements.begin() + 1); // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; - vertex_t aggregate_number_of_vertices{}; + vertex_t number_of_vertices{}; edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? 
edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); + { + std::vector major_ptrs(h_edge_counts.size()); + std::vector minor_ptrs(major_ptrs.size()); + std::vector counts(major_ptrs.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + major_ptrs[i] = + (store_transposed ? edgelist_cols.begin() : edgelist_rows.begin()) + h_displacements[i]; + minor_ptrs[i] = + (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + h_displacements[i]; + counts[i] = static_cast(h_edge_counts[i]); + } + std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + major_ptrs, + minor_ptrs, + counts); + } // 4. create a graph + std::vector> edgelists( + h_edge_counts.size()); + for (size_t i = 0; i < h_edge_counts.size(); ++i) { + edgelists[i] = cugraph::experimental::edgelist_t{ + edgelist_rows.data() + h_displacements[i], + edgelist_cols.data() + h_displacements[i], + test_weighted ? edgelist_weights.data() + h_displacements[i] + : static_cast(nullptr), + static_cast(h_edge_counts[i])}; + } + return std::make_tuple( cugraph::experimental::graph_t( handle, - std::vector>{ - cugraph::experimental::edgelist_t{ - edgelist_rows.data(), - edgelist_cols.data(), - test_weighted ? edgelist_weights.data() : nullptr, - static_cast(edgelist_rows.size())}}, + edgelists, partition, number_of_vertices, number_of_edges, - cugraph::experimental::graph_properties_t{is_symmetric, false}, - true, + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, true), std::move(renumber_map_labels)); } @@ -163,18 +154,17 @@ std::enable_if_t< std::tuple< cugraph::experimental::graph_t, rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) +generate_graph_from_edgelist_impl(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { vertex_t number_of_vertices = static_cast(vertices.size()); - // FIXME: set do_expensive_check to false once validated auto renumber_map_labels = renumber ? cugraph::experimental::renumber_edgelist( handle, @@ -182,11 +172,9 @@ generate_graph_from_edgelist(raft::handle_t const& handle, static_cast(vertices.size()), store_transposed ? edgelist_cols.data() : edgelist_rows.data(), store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - static_cast(edgelist_rows.size()), - true) + static_cast(edgelist_rows.size())) : rmm::device_uvector(0, handle.get_stream()); - // FIXME: set do_expensive_check to false once validated return std::make_tuple( cugraph::experimental::graph_t( handle, @@ -196,13 +184,12 @@ generate_graph_from_edgelist(raft::handle_t const& handle, test_weighted ? edgelist_weights.data() : nullptr, static_cast(edgelist_rows.size())}, number_of_vertices, - cugraph::experimental::graph_properties_t{is_symmetric, false}, - renumber ? true : false, - true), + cugraph::experimental::graph_properties_t{is_symmetric, false, test_weighted}, + renumber ? 
true : false), std::move(renumber_map_labels)); } -} // namespace detail +} // namespace template ( - handle, - std::move(vertices), - std::move(edgelist_rows), - std::move(edgelist_cols), - std::move(edgelist_weights), - is_symmetric, - test_weighted, - renumber); + return generate_graph_from_edgelist_impl( + handle, + std::move(vertices), + std::move(edgelist_rows), + std::move(edgelist_cols), + std::move(edgelist_weights), + is_symmetric, + test_weighted, + renumber); } // explicit instantiations diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu index ddbbac603ee..bf7539864be 100644 --- a/cpp/tests/utilities/matrix_market_file_utilities.cu +++ b/cpp/tests/utilities/matrix_market_file_utilities.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include +#include #include +#include #include #include @@ -339,7 +342,73 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, d_vertices.begin(), d_vertices.end(), vertex_t{0}); + handle.get_stream_view().synchronize(); + + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; + d_vertices.resize( + thrust::distance( + d_vertices.begin(), + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin(), + d_vertices.end(), + [comm_rank, key_func = vertex_key_func] __device__(auto val) { + return key_func(val) != comm_rank; + })), + handle.get_stream()); + d_vertices.shrink_to_fit(handle.get_stream()); + + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}; + size_t number_of_local_edges{}; + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator(thrust::make_tuple( + d_edgelist_rows.begin(), d_edgelist_cols.begin(), d_edgelist_weights.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(d_edgelist_rows.begin(), d_edgelist_cols.begin())); + number_of_local_edges = thrust::distance( + edge_first, + thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edge_first, + edge_first + d_edgelist_rows.size(), + [comm_rank, key_func = edge_key_func] __device__(auto e) { + auto major = store_transposed ? thrust::get<1>(e) : thrust::get<0>(e); + auto minor = store_transposed ? 
thrust::get<0>(e) : thrust::get<1>(e); + return key_func(major, minor) != comm_rank; + })); + } + + d_edgelist_rows.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_rows.shrink_to_fit(handle.get_stream()); + d_edgelist_cols.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + d_edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + d_edgelist_weights.shrink_to_fit(handle.get_stream()); + } + } + handle.get_stream_view().synchronize(); return generate_graph_from_edgelist( handle, std::move(d_vertices), diff --git a/cpp/tests/utilities/rmat_utilities.cu b/cpp/tests/utilities/rmat_utilities.cu index 16ea7a486fc..3f0bb0b4a1f 100644 --- a/cpp/tests/utilities/rmat_utilities.cu +++ b/cpp/tests/utilities/rmat_utilities.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include +#include #include +#include #include +#include #include #include @@ -41,39 +45,191 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, double a, double b, double c, - uint64_t seed, + uint64_t base_seed, bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber) + bool renumber, + std::vector const& partition_ids, + size_t num_partitions) { + CUGRAPH_EXPECTS(!multi_gpu || renumber, "renumber should be true if multi_gpu is true."); + CUGRAPH_EXPECTS(size_t{1} << scale <= static_cast(std::numeric_limits::max()), + "vertex_t overflow."); + CUGRAPH_EXPECTS( + (size_t{1} << scale) * edge_factor <= static_cast(std::numeric_limits::max()), + " edge_t overflow."); + + vertex_t number_of_vertices = static_cast(size_t{1} << scale); + edge_t number_of_edges = + static_cast(static_cast(number_of_vertices) * edge_factor); + + std::vector partition_edge_counts(partition_ids.size()); + std::vector partition_vertex_firsts(partition_ids.size()); + std::vector partition_vertex_lasts(partition_ids.size()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + partition_edge_counts[i] = number_of_edges / num_partitions + + (id < number_of_edges % num_partitions ? edge_t{1} : edge_t{0}); + + partition_vertex_firsts[i] = (number_of_vertices / num_partitions) * id; + partition_vertex_lasts[i] = (number_of_vertices / num_partitions) * (id + 1); + if (id < number_of_vertices % num_partitions) { + partition_vertex_firsts[i] += id; + partition_vertex_lasts[i] += id + 1; + } else { + partition_vertex_firsts[i] += number_of_vertices % num_partitions; + partition_vertex_lasts[i] += number_of_vertices % num_partitions; + } + } + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); - std::tie(d_edgelist_rows, d_edgelist_cols) = - cugraph::experimental::generate_rmat_edgelist( - handle, scale, edge_factor, a, b, c, seed, undirected ? true : false, scramble_vertex_ids); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + rmm::device_uvector d_tmp_rows(0, handle.get_stream()); + rmm::device_uvector d_tmp_cols(0, handle.get_stream()); + std::tie(i == 0 ? d_edgelist_rows : d_tmp_rows, i == 0 ? d_edgelist_cols : d_tmp_cols) = + cugraph::experimental::generate_rmat_edgelist(handle, + scale, + partition_edge_counts[i], + a, + b, + c, + base_seed + id, + undirected ? 
true : false, + scramble_vertex_ids); + + rmm::device_uvector d_tmp_weights(0, handle.get_stream()); + if (test_weighted) { + if (i == 0) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + } else { + d_tmp_weights.resize(d_tmp_rows.size(), handle.get_stream()); + } + + raft::random::Rng rng(base_seed + num_partitions + id); + rng.uniform(i == 0 ? d_edgelist_weights.data() : d_tmp_weights.data(), + i == 0 ? d_edgelist_weights.size() : d_tmp_weights.size(), + weight_t{0.0}, + weight_t{1.0}, + handle.get_stream()); + } + + if (i > 0) { + auto start_offset = d_edgelist_rows.size(); + d_edgelist_rows.resize(start_offset + d_tmp_rows.size(), handle.get_stream()); + d_edgelist_cols.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_rows.begin(), + d_tmp_rows.end(), + d_edgelist_rows.begin() + start_offset); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_cols.begin(), + d_tmp_cols.end(), + d_edgelist_cols.begin() + start_offset); + if (test_weighted) { + d_edgelist_weights.resize(d_edgelist_rows.size(), handle.get_stream()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_tmp_weights.begin(), + d_tmp_weights.end(), + d_edgelist_weights.begin() + start_offset); + } + } + } + if (undirected) { // FIXME: need to symmetrize CUGRAPH_FAIL("unimplemented."); } - rmm::device_uvector d_edgelist_weights(test_weighted ? d_edgelist_rows.size() : 0, - handle.get_stream()); - if (test_weighted) { - raft::random::Rng rng(seed + 1); - rng.uniform(d_edgelist_weights.data(), - d_edgelist_weights.size(), - weight_t{0.0}, - weight_t{1.0}, - handle.get_stream()); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + rmm::device_uvector d_rx_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_rx_edgelist_weights(0, handle.get_stream()); + if (test_weighted) { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin(), + d_edgelist_weights.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? d_rx_edgelist_rows : d_rx_edgelist_cols, + d_rx_edgelist_weights), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } else { + auto edge_first = thrust::make_zip_iterator( + thrust::make_tuple(store_transposed ? d_edgelist_cols.begin() : d_edgelist_rows.begin(), + store_transposed ? d_edgelist_rows.begin() : d_edgelist_cols.begin())); + + std::forward_as_tuple(std::tie(store_transposed ? d_rx_edgelist_cols : d_rx_edgelist_rows, + store_transposed ? 
d_rx_edgelist_rows : d_rx_edgelist_cols), + std::ignore) = + cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + edge_first, + edge_first + d_edgelist_rows.size(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + comm_size, row_comm_size, col_comm_size}] __device__(auto val) { + return key_func(thrust::get<0>(val), thrust::get<1>(val)); + }, + handle.get_stream()); + } + + d_edgelist_rows = std::move(d_rx_edgelist_rows); + d_edgelist_cols = std::move(d_rx_edgelist_cols); + d_edgelist_weights = std::move(d_rx_edgelist_weights); + } + + rmm::device_uvector d_vertices(0, handle.get_stream()); + for (size_t i = 0; i < partition_ids.size(); ++i) { + auto id = partition_ids[i]; + + auto start_offset = d_vertices.size(); + d_vertices.resize(start_offset + (partition_vertex_lasts[i] - partition_vertex_firsts[i]), + handle.get_stream()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + d_vertices.begin() + start_offset, + d_vertices.end(), + partition_vertex_firsts[i]); } - rmm::device_uvector d_vertices(static_cast(size_t{1} << scale), - handle.get_stream()); - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertices.begin(), - d_vertices.end(), - vertex_t{0}); + if (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector d_rx_vertices(0, handle.get_stream()); + std::tie(d_rx_vertices, std::ignore) = cugraph::experimental::groupby_gpuid_and_shuffle_values( + comm, // handle.get_comms(), + d_vertices.begin(), + d_vertices.end(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(val); }, + handle.get_stream()); + d_vertices = std::move(d_rx_vertices); + } return generate_graph_from_edgelist( handle, @@ -90,59 +246,71 @@ generate_graph_from_rmat_params(raft::handle_t const& handle, template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + 
double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -157,105 +325,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - 
double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -270,105 +461,128 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool 
renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> @@ -383,49 +597,60 @@ generate_graph_from_rmat_params( bool undirected, bool scramble_vertex_ids, bool test_weighted, - bool renumber); + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); 
+generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); template std::tuple, rmm::device_uvector> -generate_graph_from_rmat_params(raft::handle_t const& handle, - size_t scale, - size_t edge_factor, - double a, - double b, - double c, - uint64_t seed, - bool undirected, - bool scramble_vertex_ids, - bool test_weighted, - bool renumber); +generate_graph_from_rmat_params( + raft::handle_t const& handle, + size_t scale, + size_t edge_factor, + double a, + double b, + double c, + uint64_t seed, + bool undirected, + bool scramble_vertex_ids, + bool test_weighted, + bool renumber, + std::vector const& partition_ids, + size_t num_partitions); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 37e87c62247..e81a76b4163 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -106,6 +106,22 @@ static const std::string& get_rapids_dataset_root_dir() return rdrd; } +template +std::tuple, + rmm::device_uvector> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber); + // returns a tuple of (rows, columns, weights, number_of_vertices, is_symmetric) template std::tuple, @@ -130,22 +146,6 @@ read_graph_from_matrix_market_file(raft::handle_t const& handle, bool test_weighted, bool renumber); -template -std::tuple, - rmm::device_uvector> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber); - template const& partition_ids, + size_t num_partitions); struct rmat_params_t { size_t scale{}; @@ -182,19 +184,5 @@ struct input_graph_specifier_t { rmat_params_t rmat_params{}; }; -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return (v >= 0) && (v < num_vertices); -} - -template -std::enable_if_t::value, bool> is_valid_vertex(vertex_t num_vertices, - vertex_t v) -{ - return v < num_vertices; -} - } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu new file mode 100644 index 00000000000..5d32fb8a5d1 --- /dev/null +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <utilities/thrust_wrapper.hpp>
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/sort.h>
+
+namespace cugraph {
+namespace test {
+
+template <typename vertex_t, typename value_t>
+rmm::device_uvector<value_t> sort_by_key(raft::handle_t const& handle,
+                                         vertex_t const* keys,
+                                         value_t const* values,
+                                         size_t num_pairs)
+{
+  rmm::device_uvector<vertex_t> sorted_keys(num_pairs, handle.get_stream_view());
+  rmm::device_uvector<value_t> sorted_values(num_pairs, handle.get_stream_view());
+
+  thrust::copy(
+    rmm::exec_policy(handle.get_stream_view()), keys, keys + num_pairs, sorted_keys.begin());
+  thrust::copy(
+    rmm::exec_policy(handle.get_stream_view()), values, values + num_pairs, sorted_values.begin());
+
+  thrust::sort_by_key(rmm::exec_policy(handle.get_stream_view()),
+                      sorted_keys.begin(),
+                      sorted_keys.end(),
+                      sorted_values.begin());
+
+  return sorted_values;
+}
+
+template rmm::device_uvector<float> sort_by_key(raft::handle_t const& handle,
+                                                int32_t const* keys,
+                                                float const* values,
+                                                size_t num_pairs);
+
+template rmm::device_uvector<double> sort_by_key(raft::handle_t const& handle,
+                                                 int32_t const* keys,
+                                                 double const* values,
+                                                 size_t num_pairs);
+
+template rmm::device_uvector<int32_t> sort_by_key(raft::handle_t const& handle,
+                                                  int32_t const* keys,
+                                                  int32_t const* values,
+                                                  size_t num_pairs);
+
+template rmm::device_uvector<float> sort_by_key(raft::handle_t const& handle,
+                                                int64_t const* keys,
+                                                float const* values,
+                                                size_t num_pairs);
+
+template rmm::device_uvector<double> sort_by_key(raft::handle_t const& handle,
+                                                 int64_t const* keys,
+                                                 double const* values,
+                                                 size_t num_pairs);
+
+template rmm::device_uvector<int64_t> sort_by_key(raft::handle_t const& handle,
+                                                  int64_t const* keys,
+                                                  int64_t const* values,
+                                                  size_t num_pairs);
+
+} // namespace test
+} // namespace cugraph
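The new helper above copies the key and value arrays into scratch buffers, sorts the pair on the GPU with thrust::sort_by_key, and returns the values in key order, so gtests can compare per-vertex results without assuming any particular vertex ordering. A rough host-side analogue in Python using cudf (illustrative only; the C++ utility operates on raw device pointers and a raft handle):

    import cudf

    def sort_values_by_key(keys, values):
        # Pair keys with values, sort on the key column, and return the
        # values in key order (mirrors what the device-side helper returns).
        df = cudf.DataFrame({"key": keys, "value": values})
        return df.sort_values("key")["value"].reset_index(drop=True)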
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
new file mode 100644
index 00000000000..579dc3c550f
--- /dev/null
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace cugraph {
+namespace test {
+
+template <typename vertex_t, typename value_t>
+rmm::device_uvector<value_t> sort_by_key(raft::handle_t const& handle,
+                                         vertex_t const* keys,
+                                         value_t const* values,
+                                         size_t num_pairs);
+
+} // namespace test
+} // namespace cugraph
diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py
index 8a847d1f1d4..11ba2d6ef96 100644
--- a/python/cugraph/__init__.py
+++ b/python/cugraph/__init__.py
@@ -82,7 +82,9 @@
     shortest_path,
     filter_unreachable,
     shortest_path_length,
-    traveling_salesperson
+    traveling_salesperson,
+    concurrent_bfs,
+    multi_source_bfs,
 )
 
 from cugraph.tree import minimum_spanning_tree, maximum_spanning_tree
diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx
index ff9f2b8b3de..23aa159314f 100644
--- a/python/cugraph/community/egonet_wrapper.pyx
+++ b/python/cugraph/community/egonet_wrapper.pyx
@@ -33,7 +33,7 @@ def egonet(input_graph, vertices, radius=1):
                      np.dtype("float32") : numberTypeEnum.floatType,
                      np.dtype("double") : numberTypeEnum.doubleType}
 
-    [src, dst] = [input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']]
+    [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32])
     vertex_t = src.dtype
     edge_t = np.dtype("int32")
     weights = None
@@ -42,7 +42,7 @@ def egonet(input_graph, vertices, radius=1):
     num_verts = input_graph.number_of_vertices()
     num_edges = input_graph.number_of_edges(directed_edges=True)
-    num_partition_edges = num_edges
+    num_local_edges = num_edges
 
     cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
     cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
@@ -50,10 +50,13 @@ def egonet(input_graph, vertices, radius=1):
     if weights is not None:
         c_edge_weights = weights.__cuda_array_interface__['data'][0]
         weight_t = weights.dtype
+        is_weighted = True
     else:
         weight_t = np.dtype("float32")
+        is_weighted = False
 
     # Pointers for egonet
+    vertices = vertices.astype('int32')
     cdef uintptr_t c_source_vertex_ptr = vertices.__cuda_array_interface__['data'][0]
     n_subgraphs = vertices.size
     n_streams = 1
@@ -71,10 +74,11 @@ def egonet(input_graph, vertices, radius=1):
                              <numberTypeEnum>(<int>(numberTypeMap[vertex_t])),
                              <numberTypeEnum>(<int>(numberTypeMap[edge_t])),
                              <numberTypeEnum>(<int>(numberTypeMap[weight_t])),
-                             num_partition_edges,
+                             num_local_edges,
                              num_verts, num_edges,
                              False,
+                             is_weighted,
                              False, False)
     if(weight_t==np.dtype("float32")):
diff --git a/python/cugraph/community/ktruss_subgraph_wrapper.pyx b/python/cugraph/community/ktruss_subgraph_wrapper.pyx
index 9f38b33d774..d3b7a38ba41 100644
--- a/python/cugraph/community/ktruss_subgraph_wrapper.pyx
+++ b/python/cugraph/community/ktruss_subgraph_wrapper.pyx
@@ -33,6 +33,10 @@ def ktruss_subgraph_double(input_graph, k, use_weights):
 
 
 def ktruss_subgraph(input_graph, k, use_weights):
+    [input_graph.edgelist.edgelist_df['src'],
+     input_graph.edgelist.edgelist_df['dst']] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'],
+                                                                                       input_graph.edgelist.edgelist_df['dst']],
+                                                                                      [np.int32])
     if graph_primtypes_wrapper.weight_type(input_graph) == np.float64 and use_weights:
         return ktruss_subgraph_double(input_graph, k, use_weights)
     else:
diff --git a/python/cugraph/community/subgraph_extraction_wrapper.pyx b/python/cugraph/community/subgraph_extraction_wrapper.pyx
index 31c5d2372f0..46dc5c07eaf 100644
--- a/python/cugraph/community/subgraph_extraction_wrapper.pyx
+++ b/python/cugraph/community/subgraph_extraction_wrapper.pyx
@@
-59,6 +59,7 @@ def subgraph(input_graph, vertices): if weights is not None: c_weights = weights.__cuda_array_interface__['data'][0] + [vertices] = graph_primtypes_wrapper.datatype_cast([vertices], [np.int32]) cdef uintptr_t c_vertices = vertices.__cuda_array_interface__['data'][0] if use_float: diff --git a/python/cugraph/cores/k_core_wrapper.pyx b/python/cugraph/cores/k_core_wrapper.pyx index a0ef99a8e8b..28bb191f4f4 100644 --- a/python/cugraph/cores/k_core_wrapper.pyx +++ b/python/cugraph/cores/k_core_wrapper.pyx @@ -49,6 +49,10 @@ def k_core(input_graph, k, core_number): """ Call k_core """ + [input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']], + [np.int32]) if graph_primtypes_wrapper.weight_type(input_graph) == np.float64: return k_core_double(input_graph, k, core_number) else: diff --git a/python/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/dask/centrality/katz_centrality.py index e690e291928..a2f83a0b2a8 100644 --- a/python/cugraph/dask/centrality/katz_centrality.py +++ b/python/cugraph/dask/centrality/katz_centrality.py @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.centrality import\ mg_katz_centrality_wrapper as mg_katz_centrality import cugraph.comms.comms as Comms @@ -133,11 +133,9 @@ def katz_centrality(input_graph, client = default_client() input_graph.compute_renumber_edge_list(transposed=True) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=True) + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) diff --git a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx index ccae26fe7e6..5fb9de788cf 100644 --- a/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx +++ b/python/cugraph/dask/centrality/mg_katz_centrality_wrapper.pyx @@ -52,8 +52,12 @@ def mg_katz_centrality(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False if alpha is None: alpha = 0.1 @@ -67,11 +71,13 @@ def mg_katz_centrality(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -85,9 +91,10 @@ def mg_katz_centrality(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), 
((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/common/input_utils.py b/python/cugraph/dask/common/input_utils.py index bbc914da502..0248f429a09 100644 --- a/python/cugraph/dask/common/input_utils.py +++ b/python/cugraph/dask/common/input_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -217,3 +217,15 @@ def get_distributed_data(input_ddf): if data.worker_info is None and comms is not None: data.calculate_worker_and_rank_info(comms) return data + + +def get_vertex_partition_offsets(input_graph): + import cudf + renumber_vertex_count = input_graph.renumber_map.implementation.ddf.\ + map_partitions(len).compute() + renumber_vertex_cumsum = renumber_vertex_count.cumsum() + vertex_dtype = input_graph.edgelist.edgelist_df['src'].dtype + vertex_partition_offsets = cudf.Series([0], dtype=vertex_dtype) + vertex_partition_offsets = vertex_partition_offsets.append(cudf.Series( + renumber_vertex_cumsum, dtype=vertex_dtype)) + return vertex_partition_offsets diff --git a/python/cugraph/dask/community/louvain.py b/python/cugraph/dask/community/louvain.py index 495061c0f81..c9af0f526c9 100644 --- a/python/cugraph/dask/community/louvain.py +++ b/python/cugraph/dask/community/louvain.py @@ -16,8 +16,8 @@ from dask.distributed import wait, default_client import cugraph.comms.comms as Comms -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.community import louvain_wrapper as c_mg_louvain from cugraph.utilities.utils import is_cuda_version_less_than @@ -86,12 +86,9 @@ def louvain(input_graph, max_iter=100, resolution=1.0): input_graph.compute_renumber_edge_list(transposed=False) sorted_by_degree = True - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=False) - + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx index f58630d07aa..a3cebeac272 100644 --- a/python/cugraph/dask/community/louvain_wrapper.pyx +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -56,12 +56,12 @@ def louvain(input_df, src = input_df['src'] dst = input_df['dst'] - num_partition_edges = len(src) + num_local_edges = len(src) if "value" in input_df.columns: weights = input_df['value'] else: - weights = cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + weights = cudf.Series(np.full(num_local_edges, 1.0, dtype=np.float32)) vertex_t = src.dtype if num_global_edges > (2**31 - 1): @@ -94,9 +94,10 @@ def louvain(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, sorted_by_degree, + True, False, True) # store_transposed, multi_gpu # Create the output dataframe, column lengths must be equal to the number of diff --git 
a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index 12f2342559b..c2f92f0f33b 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -51,8 +51,12 @@ def mg_pagerank(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True + raise NotImplementedError # FIXME: c_edge_weights is always set to NULL else: + weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -62,11 +66,13 @@ def mg_pagerank(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] # FIXME: data is on device, move to host (to_pandas()), convert to np array and access pointer to pass to C vertex_partition_offsets_host = vertex_partition_offsets.values_host @@ -81,9 +87,10 @@ def mg_pagerank(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, True, True) df = cudf.DataFrame() diff --git a/python/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/dask/link_analysis/pagerank.py index d8a76f1231e..bfaada85a6f 100644 --- a/python/cugraph/dask/link_analysis/pagerank.py +++ b/python/cugraph/dask/link_analysis/pagerank.py @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.link_analysis import mg_pagerank_wrapper as mg_pagerank import cugraph.comms.comms as Comms import dask_cudf @@ -124,11 +124,10 @@ def pagerank(input_graph, client = default_client() input_graph.compute_renumber_edge_list(transposed=True) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(input_graph, transposed=True) + + ddf = input_graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(input_graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) diff --git a/python/cugraph/dask/traversal/bfs.py b/python/cugraph/dask/traversal/bfs.py index 51e0dc0de5d..d108730f665 100644 --- a/python/cugraph/dask/traversal/bfs.py +++ b/python/cugraph/dask/traversal/bfs.py @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.traversal import mg_bfs_wrapper as mg_bfs import cugraph.comms.comms as Comms import cudf @@ -91,11 +91,10 @@ def bfs(graph, client = default_client() graph.compute_renumber_edge_list(transposed=False) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - 
vertex_partition_offsets) = shuffle(graph, transposed=False) + ddf = graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(graph) + num_verts = vertex_partition_offsets.iloc[-1] + num_edges = len(ddf) data = get_distributed_data(ddf) diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index 527cb2bcf0a..44630ba5fb3 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -58,7 +58,7 @@ def mg_bfs(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -77,9 +77,10 @@ def mg_bfs(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + False, # BFS runs on unweighted graphs False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx index 15d956836b4..82a4ebe04d6 100644 --- a/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_sssp_wrapper.pyx @@ -46,9 +46,11 @@ def mg_sssp(input_df, if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype + is_weighted = True else: weights = None weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -58,7 +60,7 @@ def mg_sssp(input_df, np.dtype("double") : numberTypeEnum.doubleType} # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] @@ -79,9 +81,10 @@ def mg_sssp(input_df, ((numberTypeMap[vertex_t])), ((numberTypeMap[edge_t])), ((numberTypeMap[weight_t])), - num_partition_edges, + num_local_edges, num_global_verts, num_global_edges, True, + is_weighted, False, True) # Generate the cudf.DataFrame result diff --git a/python/cugraph/dask/traversal/sssp.py b/python/cugraph/dask/traversal/sssp.py index 52f2b9b256c..32e7401023a 100644 --- a/python/cugraph/dask/traversal/sssp.py +++ b/python/cugraph/dask/traversal/sssp.py @@ -14,8 +14,8 @@ # from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure.shuffle import shuffle +from cugraph.dask.common.input_utils import (get_distributed_data, + get_vertex_partition_offsets) from cugraph.dask.traversal import mg_sssp_wrapper as mg_sssp import cugraph.comms.comms as Comms import cudf @@ -91,11 +91,9 @@ def sssp(graph, client = default_client() graph.compute_renumber_edge_list(transposed=False) - (ddf, - num_verts, - partition_row_size, - partition_col_size, - vertex_partition_offsets) = shuffle(graph, transposed=False) + ddf = graph.edgelist.edgelist_df + vertex_partition_offsets = get_vertex_partition_offsets(graph) + num_verts = vertex_partition_offsets.iloc[-1] num_edges = len(ddf) data = get_distributed_data(ddf) diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx 
b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 81a68d42360..2c619a052ec 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -42,7 +42,7 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(src) + cdef int num_local_edges = len(src) df = cudf.DataFrame() df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32)) @@ -71,8 +71,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. if weights is not None: c_edge_weights = weights.__cuda_array_interface__['data'][0] weight_t = weights.dtype + is_weighted = True else: weight_t = np.dtype("float32") + is_weighted = False # FIXME: Offsets and indices are currently hardcoded to int, but this may # not be acceptable in the future. @@ -96,10 +98,10 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. ((numberTypeEnum.int32Type)), ((numberTypeEnum.int32Type)), ((numberTypeMap[weight_t])), - #num_verts, num_edges, - num_partition_edges, + num_local_edges, num_verts, num_edges, False, + is_weighted, True, False) diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index 10c90f44cb8..b169e42ccf8 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ b/python/cugraph/structure/graph_utilities.pxd @@ -46,10 +46,11 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": numberTypeEnum vertexType, numberTypeEnum edgeType, numberTypeEnum weightType, - size_t num_partition_edges, + size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, bool sorted_by_degree, + bool is_weighted, bool transposed, bool multi_gpu) except + @@ -106,18 +107,21 @@ cdef extern from "experimental/graph_view.hpp" namespace "cugraph::experimental" # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef cppclass major_minor_weights_t[vertex_t, weight_t]: + cdef cppclass major_minor_weights_t[vertex_t, edge_t, weight_t]: major_minor_weights_t(const handle_t &handle) pair[unique_ptr[device_buffer], size_t] get_major_wrap() pair[unique_ptr[device_buffer], size_t] get_minor_wrap() pair[unique_ptr[device_buffer], size_t] get_weights_wrap() + unique_ptr[vector[edge_t]] get_edge_counts_wrap() ctypedef fused shuffled_vertices_t: - major_minor_weights_t[int, float] - major_minor_weights_t[int, double] - major_minor_weights_t[long, float] - major_minor_weights_t[long, double] + major_minor_weights_t[int, int, float] + major_minor_weights_t[int, int, double] + major_minor_weights_t[int, long, float] + major_minor_weights_t[int, long, double] + major_minor_weights_t[long, long, float] + major_minor_weights_t[long, long, double] # 3. 
return type for renumber: # @@ -151,13 +155,12 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": # cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": - cdef unique_ptr[major_minor_weights_t[vertex_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( + cdef unique_ptr[major_minor_weights_t[vertex_t, edge_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t]( const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, weight_t* edgelist_weights, - edge_t num_edges, - bool is_hyper_partitioned) except + + edge_t num_edges) except + # 5. `renumber_edgelist()` wrapper # @@ -167,7 +170,6 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": const handle_t &handle, vertex_t *edgelist_major_vertices, vertex_t *edgelist_minor_vertices, - edge_t num_edges, - bool is_hyper_partitioned, + const vector[edge_t]& edge_counts, bool do_check, bool multi_gpu) except + diff --git a/python/cugraph/structure/new_number_map.py b/python/cugraph/structure/new_number_map.py deleted file mode 100644 index f8a2164d2c4..00000000000 --- a/python/cugraph/structure/new_number_map.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from dask.distributed import wait, default_client -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.structure import renumber_wrapper as c_renumber -import cugraph.comms as Comms -import dask_cudf -import numpy as np -import cudf -import cugraph.structure.number_map as legacy_number_map - - -def call_renumber(sID, - data, - num_edges, - is_mnmg, - store_transposed): - wid = Comms.get_worker_id(sID) - handle = Comms.get_handle(sID) - return c_renumber.renumber(data[0], - num_edges, - wid, - handle, - is_mnmg, - store_transposed) - - -class NumberMap: - - class SingleGPU: - def __init__(self, df, src_col_names, dst_col_names, id_type, - store_transposed): - self.col_names = NumberMap.compute_vals(src_col_names) - self.df = cudf.DataFrame() - self.id_type = id_type - self.store_transposed = store_transposed - self.numbered = False - - def to_internal_vertex_id(self, df, col_names): - tmp_df = df[col_names].rename( - columns=dict(zip(col_names, self.col_names)), copy=False - ) - index_name = NumberMap.generate_unused_column_name(df.columns) - tmp_df[index_name] = tmp_df.index - return ( - self.df.merge(tmp_df, on=self.col_names, how="right") - .sort_values(index_name) - .drop(columns=[index_name]) - .reset_index()["id"] - ) - - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.df.merge( - df, - right_on=internal_column_name, - left_on="id", - how="right", - ) - if internal_column_name != "id": - tmp_df = tmp_df.drop(columns=["id"]) - if external_column_names is None: - return tmp_df - else: - return tmp_df.rename( - columns=dict(zip(self.col_names, external_column_names)), - copy=False, - ) - - class MultiGPU: - def __init__( - self, ddf, src_col_names, dst_col_names, id_type, store_transposed - ): - self.col_names = NumberMap.compute_vals(src_col_names) - self.val_types = NumberMap.compute_vals_types(ddf, src_col_names) - self.val_types["count"] = np.int32 - self.id_type = id_type - self.store_transposed = store_transposed - self.numbered = False - - def to_internal_vertex_id(self, ddf, col_names): - return self.ddf.merge( - ddf, - right_on=col_names, - left_on=self.col_names, - how="right", - )["global_id"] - - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.ddf.merge( - df, - right_on=internal_column_name, - left_on="global_id", - how="right" - ).map_partitions(lambda df: df.drop(columns="global_id")) - - if external_column_names is None: - return tmp_df - else: - return tmp_df.map_partitions( - lambda df: - df.rename( - columns=dict( - zip(self.col_names, external_column_names) - ), - copy=False - ) - ) - - def __init__(self, id_type=np.int32): - self.implementation = None - self.id_type = id_type - - def compute_vals_types(df, column_names): - """ - Helper function to compute internal column names and types - """ - return { - str(i): df[column_names[i]].dtype for i in range(len(column_names)) - } - - def generate_unused_column_name(column_names): - """ - Helper function to generate an unused column name - """ - name = 'x' - while name in column_names: - name = name + "x" - - return name - - def compute_vals(column_names): - """ - Helper function to compute internal column names based on external - column names - """ - return [str(i) for i in range(len(column_names))] - - def renumber(df, src_col_names, dst_col_names, preserve_order=False, - store_transposed=False): - - if isinstance(src_col_names, list): - renumber_type = 'legacy' - # elif 
isinstance(df[src_col_names].dtype, string): - # renumber_type = 'legacy' - else: - renumber_type = 'experimental' - - if renumber_type == 'legacy': - renumber_map, renumbered_df = legacy_number_map.renumber( - df, - src_col_names, - dst_col_names, - preserve_order, - store_transposed) - # Add shuffling once algorithms are switched to new renumber - # (ddf, - # num_verts, - # partition_row_size, - # partition_col_size, - # vertex_partition_offsets) = shuffle(input_graph, transposed=True) - return renumber_map, renumbered_df - - renumber_map = NumberMap() - if not isinstance(src_col_names, list): - src_col_names = [src_col_names] - dst_col_names = [dst_col_names] - if type(df) is cudf.DataFrame: - renumber_map.implementation = NumberMap.SingleGPU( - df, src_col_names, dst_col_names, renumber_map.id_type, - store_transposed - ) - elif type(df) is dask_cudf.DataFrame: - renumber_map.implementation = NumberMap.MultiGPU( - df, src_col_names, dst_col_names, renumber_map.id_type, - store_transposed - ) - else: - raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame") - - num_edges = len(df) - - if isinstance(df, dask_cudf.DataFrame): - is_mnmg = True - else: - is_mnmg = False - - if is_mnmg: - client = default_client() - data = get_distributed_data(df) - result = [(client.submit(call_renumber, - Comms.get_session_id(), - wf[1], - num_edges, - is_mnmg, - store_transposed, - workers=[wf[0]]), wf[0]) - for idx, wf in enumerate(data.worker_to_parts.items())] - wait(result) - - def get_renumber_map(data): - return data[0] - - def get_renumbered_df(data): - return data[1] - - renumbering_map = dask_cudf.from_delayed( - [client.submit(get_renumber_map, - data, - workers=[wf]) - for (data, wf) in result]) - renumbered_df = dask_cudf.from_delayed( - [client.submit(get_renumbered_df, - data, - workers=[wf]) - for (data, wf) in result]) - - renumber_map.implementation.ddf = renumbering_map - renumber_map.implementation.numbered = True - - return renumbered_df, renumber_map - else: - renumbering_map, renumbered_df = c_renumber.renumber( - df, - num_edges, - 0, - Comms.get_default_handle(), - is_mnmg, - store_transposed) - renumber_map.implementation.df = renumbering_map - renumber_map.implementation.numbered = True - return renumbered_df, renumber_map - - def unrenumber(self, df, column_name, preserve_order=False): - """ - Given a DataFrame containing internal vertex ids in the identified - column, replace this with external vertex ids. If the renumbering - is from a single column, the output dataframe will use the same - name for the external vertex identifiers. If the renumbering is from - a multi-column input, the output columns will be labeled 0 through - n-1 with a suffix of _column_name. - Note that this function does not guarantee order or partitioning in - multi-GPU mode. - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - A DataFrame containing internal vertex identifiers that will be - converted into external vertex identifiers. - column_name: string - Name of the column containing the internal vertex id. - preserve_order: (optional) bool - If True, preserve the ourder of the rows in the output - DataFrame to match the input DataFrame - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - The original DataFrame columns exist unmodified. The external - vertex identifiers are added to the DataFrame, the internal - vertex identifier column is removed from the dataframe. 
- Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> - >>> df, number_map = NumberMap.renumber(df, '0', '1') - >>> - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(df, 'src', 'dst') - >>> - >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, - >>> tol = 1.0e-05) - >>> - >>> pr = number_map.unrenumber(pr, 'vertex') - >>> - """ - if len(self.col_names) == 1: - # Output will be renamed to match input - mapping = {"0": column_name} - else: - # Output will be renamed to ${i}_${column_name} - mapping = {} - for nm in self.col_names: - mapping[nm] = nm + "_" + column_name - - if preserve_order: - index_name = NumberMap.generate_unused_column_name(df) - df[index_name] = df.index - - df = self.from_internal_vertex_id(df, column_name, drop=True) - - if preserve_order: - df = df.sort_values( - index_name - ).drop(columns=index_name).reset_index(drop=True) - - if type(df) is dask_cudf.DataFrame: - return df.map_partitions( - lambda df: df.rename(columns=mapping, copy=False) - ) - else: - return df.rename(columns=mapping, copy=False) diff --git a/python/cugraph/structure/number_map.py b/python/cugraph/structure/number_map.py index deb2b9f4114..e45a50d6dbe 100644 --- a/python/cugraph/structure/number_map.py +++ b/python/cugraph/structure/number_map.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,100 +11,46 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# -import cudf +from dask.distributed import wait, default_client +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.structure import renumber_wrapper as c_renumber +from cugraph.utilities.utils import is_device_version_less_than +import cugraph.comms.comms as Comms import dask_cudf import numpy as np -import bisect +import cudf + + +def call_renumber(sID, + data, + num_edges, + is_mnmg, + store_transposed): + wid = Comms.get_worker_id(sID) + handle = Comms.get_handle(sID) + return c_renumber.renumber(data[0], + num_edges, + wid, + handle, + is_mnmg, + store_transposed) class NumberMap: - """ - Class used to translate external vertex ids to internal vertex ids - in the cuGraph framework. - - Internal vertex ids are assigned by hashing the external vertex ids - into a structure to eliminate duplicates, and the resulting list - of unique vertices are assigned integers from [0, V) where V is - the number of unique vertices. - - In Single GPU mode, internal vertex ids are constructed using - cudf functions, with a cudf.DataFrame containing the mapping - from external vertex identifiers and internal vertex identifiers - allowing for mapping vertex identifiers in either direction. In - this mode, the order of the output from the mapping functions is - non-deterministic. cudf makes no guarantees about order. If - matching the input order is required set the preserve_order - to True. - - In Multi GPU mode, internal vertex ids are constucted using - dask_cudf functions, with a dask_cudf.DataFrame containing - the mapping from external vertex identifiers and internal - vertex identifiers allowing for mapping vertex identifiers - in either direction. 
In this mode, the partitioning of - the number_map and the output from any of the mapping functions - are non-deterministic. dask_cudf makes no guarantees about the - partitioning or order of the output. As of this release, - there is no mechanism for controlling that, this will be - addressed at some point. - """ class SingleGPU: def __init__(self, df, src_col_names, dst_col_names, id_type, store_transposed): self.col_names = NumberMap.compute_vals(src_col_names) - self.df = cudf.DataFrame() + self.src_col_names = src_col_names + self.dst_col_names = dst_col_names + self.df = df self.id_type = id_type self.store_transposed = store_transposed - - source_count = 0 - dest_count = 0 - - if store_transposed: - dest_count = 1 - else: - source_count = 1 - - tmp = ( - df[src_col_names] - .assign(count=source_count) - .groupby(src_col_names) - .sum() - .reset_index() - .rename( - columns=dict(zip(src_col_names, self.col_names)), - copy=False, - ) - ) - - if dst_col_names is not None: - tmp_dst = ( - df[dst_col_names] - .assign(count=dest_count) - .groupby(dst_col_names) - .sum() - .reset_index() - ) - for newname, oldname in zip(self.col_names, dst_col_names): - self.df[newname] = tmp[newname].append(tmp_dst[oldname]) - self.df['count'] = tmp['count'].append(tmp_dst['count']) - else: - for newname in self.col_names: - self.df[newname] = tmp[newname] - self.df['count'] = tmp['count'] - self.numbered = False - def compute(self): - if not self.numbered: - tmp = self.df.groupby(self.col_names).sum().sort_values( - 'count', ascending=False - ).reset_index().drop(columns='count') - - tmp["id"] = tmp.index.astype(self.id_type) - self.df = tmp - self.numbered = True - def to_internal_vertex_id(self, df, col_names): tmp_df = df[col_names].rename( columns=dict(zip(col_names, self.col_names)), copy=False @@ -117,6 +64,25 @@ def to_internal_vertex_id(self, df, col_names): .reset_index()["id"] ) + def from_internal_vertex_id( + self, df, internal_column_name, external_column_names + ): + tmp_df = self.df.merge( + df, + right_on=internal_column_name, + left_on="id", + how="right", + ) + if internal_column_name != "id": + tmp_df = tmp_df.drop(columns=["id"]) + if external_column_names is None: + return tmp_df + else: + return tmp_df.rename( + columns=dict(zip(self.col_names, external_column_names)), + copy=False, + ) + def add_internal_vertex_id(self, df, id_column_name, col_names, drop, preserve_order): ret = None @@ -162,76 +128,39 @@ def add_internal_vertex_id(self, df, id_column_name, col_names, return ret - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.df.merge( - df, - right_on=internal_column_name, - left_on="id", - how="right", - ) - if internal_column_name != "id": - tmp_df = tmp_df.drop(columns=["id"]) - if external_column_names is None: - return tmp_df - else: - return tmp_df.rename( - columns=dict(zip(self.col_names, external_column_names)), - copy=False, - ) - - class MultiGPU: - def extract_vertices( - df, src_col_names, dst_col_names, - internal_col_names, store_transposed - ): - source_count = 0 - dest_count = 0 - - if store_transposed: - dest_count = 1 - else: - source_count = 1 + def indirection_map(self, df, src_col_names, dst_col_names): + tmp_df = cudf.DataFrame() - s = ( + tmp = ( df[src_col_names] - .assign(count=source_count) .groupby(src_col_names) - .sum() + .count() .reset_index() .rename( - columns=dict(zip(src_col_names, internal_col_names)), + columns=dict(zip(src_col_names, self.col_names)), copy=False, ) ) - d = None if 
dst_col_names is not None: - d = ( + tmp_dst = ( df[dst_col_names] - .assign(count=dest_count) .groupby(dst_col_names) - .sum() + .count() .reset_index() - .rename( - columns=dict(zip(dst_col_names, internal_col_names)), - copy=False, - ) ) + for newname, oldname in zip(self.col_names, dst_col_names): + tmp_df[newname] = tmp[newname].append(tmp_dst[oldname]) + else: + for newname in self.col_names: + tmp_df[newname] = tmp[newname] - reply = cudf.DataFrame() - - for i in internal_col_names: - if d is None: - reply[i] = s[i] - else: - reply[i] = s[i].append(d[i]) - - reply['count'] = s['count'].append(d['count']) - - return reply + tmp_df = tmp_df.groupby(self.col_names).count().reset_index() + tmp_df["id"] = tmp_df.index.astype(self.id_type) + self.df = tmp_df + return tmp_df + class MultiGPU: def __init__( self, ddf, src_col_names, dst_col_names, id_type, store_transposed ): @@ -239,110 +168,10 @@ def __init__( self.val_types = NumberMap.compute_vals_types(ddf, src_col_names) self.val_types["count"] = np.int32 self.id_type = id_type + self.ddf = ddf self.store_transposed = store_transposed - self.ddf = ddf.map_partitions( - NumberMap.MultiGPU.extract_vertices, - src_col_names, - dst_col_names, - self.col_names, - store_transposed, - meta=self.val_types, - ) self.numbered = False - # Function to compute partitions based on known divisions of the - # hash value - def compute_partition(df, divisions): - sample = df.index[0] - partition_id = bisect.bisect_right(divisions, sample) - 1 - return df.assign(partition=partition_id) - - def assign_internal_identifiers_kernel( - local_id, partition, global_id, base_addresses - ): - for i in range(len(local_id)): - global_id[i] = local_id[i] + base_addresses[partition[i]] - - def assign_internal_identifiers(df, base_addresses, id_type): - df = df.assign(local_id=df.index.astype(np.int64)) - df = df.apply_rows( - NumberMap.MultiGPU.assign_internal_identifiers_kernel, - incols=["local_id", "partition"], - outcols={"global_id": id_type}, - kwargs={"base_addresses": base_addresses}, - ) - - return df.drop(columns=["local_id", "hash", "partition"]) - - def assign_global_id(self, ddf, base_addresses, val_types): - val_types["global_id"] = self.id_type - del val_types["hash"] - del val_types["partition"] - - ddf = ddf.map_partitions( - lambda df: NumberMap.MultiGPU.assign_internal_identifiers( - df, base_addresses, self.id_type - ), - meta=val_types, - ) - return ddf - - def compute(self): - if not self.numbered: - val_types = self.val_types - val_types["hash"] = np.int32 - - vertices = self.ddf.map_partitions( - lambda df: df.assign(hash=df.hash_columns(self.col_names)), - meta=val_types, - ) - - # Redistribute the ddf based on the hash values - rehashed = vertices.set_index("hash", drop=False) - - # Compute the local partition id (obsolete once - # https://github.com/dask/dask/issues/3707 is completed) - val_types["partition"] = np.int32 - - rehashed_with_partition_id = rehashed.map_partitions( - NumberMap.MultiGPU.compute_partition, - rehashed.divisions, - meta=val_types, - ) - - val_types.pop('count') - - numbering_map = rehashed_with_partition_id.map_partitions( - lambda df: df.groupby( - self.col_names + ["hash", "partition"] - ).sum() - .sort_values('count', ascending=False) - .reset_index() - .drop(columns='count'), - meta=val_types - ) - - # - # Compute base address for each partition - # - counts = numbering_map.map_partitions( - lambda df: df.groupby("partition").count() - ).compute()["hash"].to_pandas() - base_addresses = np.zeros(len(counts) + 1, 
self.id_type) - - for i in range(len(counts)): - base_addresses[i + 1] = base_addresses[i] + counts[i] - - # - # Update each partition with the base address - # - numbering_map = self.assign_global_id( - numbering_map, cudf.Series(base_addresses), val_types - ) - - self.ddf = numbering_map - self.numbered = True - def to_internal_vertex_id(self, ddf, col_names): return self.ddf.merge( ddf, @@ -351,6 +180,29 @@ def to_internal_vertex_id(self, ddf, col_names): how="right", )["global_id"] + def from_internal_vertex_id( + self, df, internal_column_name, external_column_names + ): + tmp_df = self.ddf.merge( + df, + right_on=internal_column_name, + left_on="global_id", + how="right" + ).map_partitions(lambda df: df.drop(columns="global_id")) + + if external_column_names is None: + return tmp_df + else: + return tmp_df.map_partitions( + lambda df: + df.rename( + columns=dict( + zip(self.col_names, external_column_names) + ), + copy=False + ) + ) + def add_internal_vertex_id(self, ddf, id_column_name, col_names, drop, preserve_order): # At the moment, preserve_order cannot be done on @@ -385,39 +237,50 @@ def add_internal_vertex_id(self, ddf, id_column_name, col_names, drop, return ret - def from_internal_vertex_id( - self, df, internal_column_name, external_column_names - ): - tmp_df = self.ddf.merge( - df, - right_on=internal_column_name, - left_on="global_id", - how="right" - ).map_partitions(lambda df: df.drop(columns="global_id")) + def indirection_map(self, ddf, src_col_names, dst_col_names): - if external_column_names is None: - return tmp_df - else: - return tmp_df.map_partitions( - lambda df: - df.rename( - columns=dict( - zip(self.col_names, external_column_names) - ), - copy=False - ) + tmp = ( + ddf[src_col_names] + .groupby(src_col_names) + .count() + .reset_index() + .rename( + columns=dict(zip(src_col_names, self.col_names)), ) + ) + + if dst_col_names is not None: + tmp_dst = ( + ddf[dst_col_names] + .groupby(dst_col_names) + .count() + .reset_index() + ) + for i, (newname, oldname) in enumerate(zip(self.col_names, + dst_col_names)): + if i == 0: + tmp_df = tmp[newname].append(tmp_dst[oldname]).\ + to_frame(name=newname) + else: + tmp_df[newname] = tmp[newname].append(tmp_dst[oldname]) + print(tmp_df.columns) + else: + for newname in self.col_names: + tmp_df[newname] = tmp[newname] + tmp_ddf = tmp_df.groupby(self.col_names).count().reset_index() + + # Set global index + tmp_ddf = tmp_ddf.assign(idx=1) + tmp_ddf['global_id'] = tmp_ddf.idx.cumsum() - 1 + tmp_ddf = tmp_ddf.drop(columns='idx') + + self.ddf = tmp_ddf + return tmp_ddf def __init__(self, id_type=np.int32): self.implementation = None self.id_type = id_type - def aggregate_count_and_partition(df): - d = {} - d['count'] = df['count'].sum() - d['partition'] = df['partition'].min() - return cudf.Series(d, index=['count', 'partition']) - def compute_vals_types(df, column_names): """ Helper function to compute internal column names and types @@ -443,125 +306,19 @@ def compute_vals(column_names): """ return [str(i) for i in range(len(column_names))] - def from_dataframe( - self, df, src_col_names, dst_col_names=None, store_transposed=False - ): - """ - Populate the numbering map with vertices from the specified - columns of the provided DataFrame. - - Parameters - ---------- - df : cudf.DataFrame or dask_cudf.DataFrame - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. 
- src_col_names: list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for source vertices - dst_col_names: list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for destination vertices - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination. - If False, the graph will be organized by source - - """ - if self.implementation is not None: - raise Exception("NumberMap is already populated") - - if dst_col_names is not None and len(src_col_names) != len( - dst_col_names - ): - raise Exception( - "src_col_names must have same length as dst_col_names" - ) - - if type(df) is cudf.DataFrame: - self.implementation = NumberMap.SingleGPU( - df, src_col_names, dst_col_names, self.id_type, - store_transposed - ) - elif type(df) is dask_cudf.DataFrame: - self.implementation = NumberMap.MultiGPU( - df, src_col_names, dst_col_names, self.id_type, - store_transposed - ) - else: - raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame") - - self.implementation.compute() - - def from_series(self, src_series, dst_series=None, store_transposed=False): - """ - Populate the numbering map with vertices from the specified - pair of series objects, one for the source and one for - the destination - - Parameters - ---------- - src_series: cudf.Series or dask_cudf.Series - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - dst_series: cudf.Series or dask_cudf.Series - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination. 
- If False, the graph will be organized by source - """ - if self.implementation is not None: - raise Exception("NumberMap is already populated") - - if dst_series is not None and type(src_series) != type(dst_series): - raise Exception("src_series and dst_series must have same type") - - if type(src_series) is cudf.Series: - dst_series_list = None - df = cudf.DataFrame() - df["s"] = src_series - if dst_series is not None: - df["d"] = dst_series - dst_series_list = ["d"] - self.implementation = NumberMap.SingleGPU( - df, ["s"], dst_series_list, self.id_type, store_transposed - ) - elif type(src_series) is dask_cudf.Series: - dst_series_list = None - df = dask_cudf.DataFrame() - df["s"] = src_series - if dst_series is not None: - df["d"] = dst_series - dst_series_list = ["d"] - self.implementation = NumberMap.MultiGPU( - df, ["s"], dst_series_list, self.id_type, store_transposed - ) - else: - raise Exception( - "src_series must be cudf.Series or " "dask_cudf.Series" - ) - - self.implementation.compute() - def to_internal_vertex_id(self, df, col_names=None): """ Given a collection of external vertex ids, return the internal vertex ids - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series Contains a list of external vertex identifiers that will be converted into internal vertex identifiers - col_names: (optional) list of strings This list of 1 or more strings contain the names of the columns that uniquely identify an external vertex identifier - Returns --------- vertex_ids : cudf.Series or dask_cudf.Series @@ -569,7 +326,6 @@ def to_internal_vertex_id(self, df, col_names=None): does not guarantee order or partitioning (in the case of dask_cudf) of vertex ids. If order matters use add_internal_vertex_id - """ tmp_df = None tmp_col_names = None @@ -600,34 +356,27 @@ def add_internal_vertex_id( """ Given a collection of external vertex ids, return the internal vertex ids combined with the input data. - If a series-type input is provided then the series will be in a column named '0'. Otherwise the input column names in the DataFrame will be preserved. - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series Contains a list of external vertex identifiers that will be converted into internal vertex identifiers - id_column_name: (optional) string The name to be applied to the column containing the id (defaults to 'id') - col_names: (optional) list of strings This list of 1 or more strings contain the names of the columns that uniquely identify an external vertex identifier - drop: (optional) boolean If True, drop the column names specified in col_names from the returned DataFrame. Defaults to False. - preserve_order: (optional) boolean If True, do extra sorting work to preserve the order of the input DataFrame. Defaults to False. - Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame @@ -635,7 +384,6 @@ def add_internal_vertex_id( with an additional column containing the internal vertex id. Note that there is no guarantee of the order or partitioning of elements in the returned DataFrame. - """ tmp_df = None tmp_col_names = None @@ -671,7 +419,6 @@ def from_internal_vertex_id( """ Given a collection of internal vertex ids, return a DataFrame of the external vertex ids - Parameters ---------- df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series @@ -681,20 +428,16 @@ def from_internal_vertex_id( in a column labeled 'id'. 
If df is a dataframe type object then internal_column_name should identify which column corresponds the the internal vertex id that should be converted - internal_column_name: (optional) string Name of the column containing the internal vertex id. If df is a series then this parameter is ignored. If df is a DataFrame this parameter is required. - external_column_names: (optional) string or list of strings Name of the columns that define an external vertex id. If not specified, columns will be labeled '0', '1,', ..., 'n-1' - drop: (optional) boolean If True the internal column name will be dropped from the DataFrame. Defaults to False. - Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame @@ -727,107 +470,126 @@ def from_internal_vertex_id( return output_df - def column_names(self): - """ - Return the list of internal column names - - Returns - ---------- - List of column names ('0', '1', ..., 'n-1') - """ - return self.implementation.col_names - def renumber(df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False): - """ - Given a single GPU or distributed DataFrame, use src_col_names and - dst_col_names to identify the source vertex identifiers and destination - vertex identifiers, respectively. - - Internal vertex identifiers will be created, numbering vertices as - integers starting from 0. - - The function will return a DataFrame containing the original dataframe - contents with a new column labeled 'src' containing the renumbered - source vertices and a new column labeled 'dst' containing the - renumbered dest vertices, along with a NumberMap object that contains - the number map for the numbering that was used. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. If you - wish to preserve ordering, add an index column to df and sort the - return by that index column. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - Contains a list of external vertex identifiers that will be - numbered by the NumberMap class. - src_col_names: string or list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for source vertices - dst_col_names: string or list of strings - This list of 1 or more strings contain the names - of the columns that uniquely identify an external - vertex identifier for destination vertices - store_transposed : bool - Identify how the graph adjacency will be used. - If True, the graph will be organized by destination. - If False, the graph will be organized by source - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - The original DataFrame columns exist unmodified. Columns - are added to the DataFrame to identify the external vertex - identifiers. If external_columns is specified, these names - are used as the names of the output columns. If external_columns - is not specifed the columns are labeled '0', ... 'n-1' based on - the number of columns identifying the external vertex identifiers. - - number_map : NumberMap - The number map object object that retains the mapping between - internal vertex identifiers and external vertex identifiers. 
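The renumber docstring removed above still describes the call contract that the rewritten function honors: the edge list comes back with renumbered src/dst columns, along with a NumberMap for translating internal ids back to the original identifiers. A minimal usage sketch under that contract (requires a GPU with cudf and cugraph installed; the experimental path added below additionally assumes integer vertex ids and a Volta-or-newer device, otherwise the legacy path is taken):

    import cudf
    from cugraph.structure.number_map import NumberMap

    df = cudf.DataFrame({"s": [10, 20, 20], "d": [20, 30, 10]})
    # Returns the renumbered edge list plus the NumberMap whose
    # unrenumber()/from_internal_vertex_id() recover the original ids.
    renumbered_df, number_map = NumberMap.renumber(df, "s", "d")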
+ if isinstance(src_col_names, list): + renumber_type = 'legacy' + elif not (df[src_col_names].dtype == np.int32 or + df[src_col_names].dtype == np.int64): + renumber_type = 'legacy' + elif is_device_version_less_than((7, 0)): + renumber_type = 'legacy' + else: + renumber_type = 'experimental' + df = df.rename(columns={src_col_names: "src", + dst_col_names: "dst"}) - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> - >>> df, number_map = NumberMap.renumber(df, '0', '1') - >>> - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(df, 'src', 'dst') - """ renumber_map = NumberMap() - - if isinstance(src_col_names, list): - renumber_map.from_dataframe(df, src_col_names, dst_col_names) - df = renumber_map.add_internal_vertex_id( - df, "src", src_col_names, drop=True, - preserve_order=preserve_order + if not isinstance(src_col_names, list): + src_col_names = [src_col_names] + dst_col_names = [dst_col_names] + if type(df) is cudf.DataFrame: + renumber_map.implementation = NumberMap.SingleGPU( + df, src_col_names, dst_col_names, renumber_map.id_type, + store_transposed ) - df = renumber_map.add_internal_vertex_id( - df, "dst", dst_col_names, drop=True, - preserve_order=preserve_order + elif type(df) is dask_cudf.DataFrame: + renumber_map.implementation = NumberMap.MultiGPU( + df, src_col_names, dst_col_names, renumber_map.id_type, + store_transposed ) else: - renumber_map.from_dataframe(df, [src_col_names], [dst_col_names]) + raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame") + + if renumber_type == 'legacy': + indirection_map = renumber_map.implementation.\ + indirection_map(df, + src_col_names, + dst_col_names) df = renumber_map.add_internal_vertex_id( df, "src", src_col_names, drop=True, preserve_order=preserve_order ) - df = renumber_map.add_internal_vertex_id( df, "dst", dst_col_names, drop=True, preserve_order=preserve_order ) - if type(df) is dask_cudf.DataFrame: - df = df.persist() + num_edges = len(df) + + if isinstance(df, dask_cudf.DataFrame): + is_mnmg = True + else: + is_mnmg = False + + if is_mnmg: + client = default_client() + data = get_distributed_data(df) + result = [(client.submit(call_renumber, + Comms.get_session_id(), + wf[1], + num_edges, + is_mnmg, + store_transposed, + workers=[wf[0]]), wf[0]) + for idx, wf in enumerate(data.worker_to_parts.items())] + wait(result) + + def get_renumber_map(data): + return data[0] + + def get_renumbered_df(data): + return data[1] + + renumbering_map = dask_cudf.from_delayed( + [client.submit(get_renumber_map, + data, + workers=[wf]) + for (data, wf) in result]) + renumbered_df = dask_cudf.from_delayed( + [client.submit(get_renumbered_df, + data, + workers=[wf]) + for (data, wf) in result]) + if renumber_type == 'legacy': + renumber_map.implementation.ddf = indirection_map.merge( + renumbering_map, + right_on='original_ids', left_on='global_id', + how='right').\ + drop(columns=['global_id', 'original_ids'])\ + .rename(columns={'new_ids': 'global_id'}) + else: + renumber_map.implementation.ddf = renumbering_map.rename( + columns={'original_ids': '0', 'new_ids': 'global_id'}) + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map - return df, renumber_map + else: + if is_device_version_less_than((7, 0)): + renumbered_df = df + renumber_map.implementation.df = indirection_map + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map + + renumbering_map, renumbered_df = 
c_renumber.renumber( + df, + num_edges, + 0, + Comms.get_default_handle(), + is_mnmg, + store_transposed) + if renumber_type == 'legacy': + renumber_map.implementation.df = indirection_map.\ + merge(renumbering_map, + right_on='original_ids', left_on='id').\ + drop(columns=['id', 'original_ids'])\ + .rename(columns={'new_ids': 'id'}, copy=False) + else: + renumber_map.implementation.df = renumbering_map.rename( + columns={'original_ids': '0', 'new_ids': 'id'}, copy=False) + + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map def unrenumber(self, df, column_name, preserve_order=False): """ @@ -837,30 +599,24 @@ def unrenumber(self, df, column_name, preserve_order=False): name for the external vertex identifiers. If the renumbering is from a multi-column input, the output columns will be labeled 0 through n-1 with a suffix of _column_name. - Note that this function does not guarantee order or partitioning in multi-GPU mode. - Parameters ---------- df: cudf.DataFrame or dask_cudf.DataFrame A DataFrame containing internal vertex identifiers that will be converted into external vertex identifiers. - column_name: string Name of the column containing the internal vertex id. - preserve_order: (optional) bool If True, preserve the order of the rows in the output DataFrame to match the input DataFrame - Returns --------- df : cudf.DataFrame or dask_cudf.DataFrame The original DataFrame columns exist unmodified. The external vertex identifiers are added to the DataFrame, the internal vertex identifier column is removed from the dataframe. - Examples -------- >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', diff --git a/python/cugraph/structure/renumber_wrapper.pyx b/python/cugraph/structure/renumber_wrapper.pyx index 302fcfe583b..99626cdee08 100644 --- a/python/cugraph/structure/renumber_wrapper.pyx +++ b/python/cugraph/structure/renumber_wrapper.pyx @@ -22,6 +22,7 @@ from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref import numpy as np +from libcpp.memory cimport make_unique from libcpp.utility cimport move from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer @@ -43,8 +44,8 @@ cdef renumber_helper(shuffled_vertices_t* ptr_maj_min_w, vertex_t, weights): shuffled_minor_series = cudf.Series(data=shuffled_minor_buffer, dtype=vertex_t) shuffled_df = cudf.DataFrame() - shuffled_df['src']=shuffled_major_series - shuffled_df['dst']=shuffled_minor_series + shuffled_df['major_vertices']=shuffled_major_series + shuffled_df['minor_vertices']=shuffled_minor_series if weights is not None: weight_t = weights.dtype @@ -53,7 +54,7 @@ cdef renumber_helper(shuffled_vertices_t* ptr_maj_min_w, vertex_t, weights): shuffled_weights_series = cudf.Series(data=shuffled_weights_buffer, dtype=weight_t) - shuffled_df['weights']= shuffled_weights_series + shuffled_df['value']= shuffled_weights_series return shuffled_df @@ -84,7 +85,7 @@ def renumber(input_df, # maybe use cpdef ? if num_global_edges > (2**31 - 1): edge_t = np.dtype("int64") else: - edge_t = np.dtype("int32") + edge_t = vertex_t if "value" in input_df.columns: weights = input_df['value'] weight_t = weights.dtype @@ -103,13 +104,11 @@ def renumber(input_df, # maybe use cpdef ?
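Stepping back before the per-dtype hunks below: the multi-GPU branch added to NumberMap.renumber above follows a common dask-cudf pattern, submitting one task per partition pinned to the worker that owns it and then rebuilding distributed DataFrames from the futures. A minimal sketch of that pattern, with run_per_partition and func as illustrative names (not cugraph API) and the get_distributed_data import path assumed from cugraph's dask utilities:

```python
import dask_cudf
from dask.distributed import default_client, wait
from cugraph.dask.common.input_utils import get_distributed_data

def run_per_partition(ddf, func, *args):
    # One task per worker, pinned to the worker that holds that partition
    client = default_client()
    data = get_distributed_data(ddf)
    futures = [(client.submit(func, parts, *args, workers=[worker]), worker)
               for worker, parts in data.worker_to_parts.items()]
    wait([fut for fut, _ in futures])
    # Stitch the per-worker results back into a distributed DataFrame
    return dask_cudf.from_delayed([fut for fut, _ in futures])
```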
raise Exception("Incompatible vertex_t and edge_t types.") # FIXME: needs to be edge_t type not int - cdef int num_partition_edges = len(major_vertices) + cdef int num_local_edges = len(major_vertices) cdef uintptr_t c_major_vertices = major_vertices.__cuda_array_interface__['data'][0] cdef uintptr_t c_minor_vertices = minor_vertices.__cuda_array_interface__['data'][0] - cdef bool is_hyper_partitioned = False # for now - cdef uintptr_t shuffled_major = NULL cdef uintptr_t shuffled_minor = NULL @@ -119,12 +118,14 @@ def renumber(input_df, # maybe use cpdef ? cdef pair[unique_ptr[device_buffer], size_t] pair_original cdef pair[unique_ptr[device_buffer], size_t] pair_partition - # tparams: vertex_t, weight_t: + # tparams: vertex_t, edge_t, weight_t: # - cdef unique_ptr[major_minor_weights_t[int, float]] ptr_shuffled_32_32 - cdef unique_ptr[major_minor_weights_t[int, double]] ptr_shuffled_32_64 - cdef unique_ptr[major_minor_weights_t[long, float]] ptr_shuffled_64_32 - cdef unique_ptr[major_minor_weights_t[long, double]] ptr_shuffled_64_64 + cdef unique_ptr[major_minor_weights_t[int, int, float]] ptr_shuffled_32_32_32 + cdef unique_ptr[major_minor_weights_t[int, int, double]] ptr_shuffled_32_32_64 + cdef unique_ptr[major_minor_weights_t[int, long, float]] ptr_shuffled_32_64_32 + cdef unique_ptr[major_minor_weights_t[int, long, double]] ptr_shuffled_32_64_64 + cdef unique_ptr[major_minor_weights_t[long, long, float]] ptr_shuffled_64_64_32 + cdef unique_ptr[major_minor_weights_t[long, long, double]] ptr_shuffled_64_64_64 # tparams: vertex_t, edge_t: # @@ -132,6 +133,11 @@ def renumber(input_df, # maybe use cpdef ? cdef unique_ptr[renum_quad_t[int, long]] ptr_renum_quad_32_64 cdef unique_ptr[renum_quad_t[long, long]] ptr_renum_quad_64_64 + # tparam: vertex_t: + # + cdef unique_ptr[vector[int]] edge_counts_32 + cdef unique_ptr[vector[long]] edge_counts_64 + # tparam: vertex_t: # cdef unique_ptr[vector[int]] uniq_partition_vector_32 @@ -143,27 +149,32 @@ def renumber(input_df, # maybe use cpdef ? 
if ( edge_t == np.dtype("int32")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), + ptr_shuffled_32_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + num_local_edges).release()) + shuffled_df = renumber_helper(ptr_shuffled_32_32_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_32.get().get_edge_counts_wrap()) else: - shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + shuffled_df = input_df + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] + ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), 1, mg_flag).release()) @@ -186,8 +197,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -201,29 +211,33 @@ def renumber(input_df, # maybe use cpdef ? 
elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), + ptr_shuffled_32_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_32_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_32 = move(ptr_shuffled_32_32_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + edge_counts_32 = make_unique[vector[int]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_32.get()), do_check, mg_flag).release()) @@ -246,8 +260,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_32.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -259,32 +272,37 @@ def renumber(input_df, # maybe use cpdef ? 
renumbered_map['new_ids'] = new_series return renumbered_map, shuffled_df + elif ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_32_32.reset(call_shuffle[int, long, float](deref(handle_ptr), + ptr_shuffled_32_64_32.reset(call_shuffle[int, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - - shuffled_df = renumber_helper(ptr_shuffled_32_32.get(), vertex_t, weights) + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_64_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -307,8 +325,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -322,29 +339,33 @@ def renumber(input_df, # maybe use cpdef ? 
return renumbered_map, shuffled_df elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_32_64.reset(call_shuffle[int, long, double](deref(handle_ptr), + ptr_shuffled_32_64_64.reset(call_shuffle[int, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - - shuffled_df = renumber_helper(ptr_shuffled_32_64.get(), vertex_t, weights) + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_32_64_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_32_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -367,8 +388,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_32.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_32_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df # @@ -379,33 +399,38 @@ def renumber(input_df, # maybe use cpdef ? 
renumbered_map['new_ids'] = new_series return renumbered_map, shuffled_df + elif (vertex_t == np.dtype("int64")): if ( edge_t == np.dtype("int64")): if( weight_t == np.dtype("float32")): if(is_multi_gpu): - ptr_shuffled_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), + ptr_shuffled_64_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - - shuffled_df = renumber_helper(ptr_shuffled_64_32.get(), vertex_t, weights) + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_64_64_32.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_32.get().get_edge_counts_wrap()) else: shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -428,8 +453,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df @@ -441,31 +465,36 @@ def renumber(input_df, # maybe use cpdef ? 
renumbered_map['new_ids'] = new_series return renumbered_map, shuffled_df + elif( weight_t == np.dtype("float64")): if(is_multi_gpu): - ptr_shuffled_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), + ptr_shuffled_64_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr), c_major_vertices, c_minor_vertices, c_edge_weights, - num_partition_edges, - is_hyper_partitioned).release()) - - shuffled_df = renumber_helper(ptr_shuffled_64_64.get(), vertex_t, weights) + num_local_edges).release()) + + shuffled_df = renumber_helper(ptr_shuffled_64_64_64.get(), vertex_t, weights) + major_vertices = shuffled_df['major_vertices'] + minor_vertices = shuffled_df['minor_vertices'] + num_local_edges = len(shuffled_df) + if not transposed: + major = 'src'; minor = 'dst' + else: + major = 'dst'; minor = 'src' + shuffled_df = shuffled_df.rename(columns={'major_vertices':major, 'minor_vertices':minor}, copy=False) + edge_counts_64 = move(ptr_shuffled_64_64_64.get().get_edge_counts_wrap()) else: shuffled_df = input_df - - shuffled_src = shuffled_df['src'] - shuffled_dst = shuffled_df['dst'] - num_partition_edges = len(shuffled_df) - - shuffled_major = shuffled_src.__cuda_array_interface__['data'][0] - shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0] + edge_counts_64 = make_unique[vector[long]](1, num_local_edges) + + shuffled_major = major_vertices.__cuda_array_interface__['data'][0] + shuffled_minor = minor_vertices.__cuda_array_interface__['data'][0] ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr), shuffled_major, shuffled_minor, - num_partition_edges, - is_hyper_partitioned, + deref(edge_counts_64.get()), do_check, mg_flag).release()) @@ -488,8 +517,7 @@ def renumber(input_df, # maybe use cpdef ? uniq_partition_vector_64.get()[0].at(rank_indx+1)), dtype=vertex_t) else: - new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(0), - uniq_partition_vector_32.get()[0].at(1)), + new_series = cudf.Series(np.arange(0, ptr_renum_quad_64_64.get().get_num_vertices()), dtype=vertex_t) # create new cudf df diff --git a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py index 4b0f6629bc3..6e1e5ea380a 100644 --- a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,6 +16,7 @@ from cugraph.tests.dask.mg_context import MGContext, skip_if_not_enough_devices from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.tests import utils # Get parameters from the standard betweenness_centrality test from cugraph.tests.test_betweenness_centrality import ( @@ -36,7 +37,7 @@ # ============================================================================= # Parameters # ============================================================================= -DATASETS = ["../datasets/karate.csv"] +DATASETS = [utils.DATASETS_UNDIRECTED[0]] MG_DEVICE_COUNT_OPTIONS = [pytest.param(1, marks=pytest.mark.preset_gpu_count), pytest.param(2, marks=pytest.mark.preset_gpu_count), pytest.param(3, marks=pytest.mark.preset_gpu_count), diff --git a/python/cugraph/tests/test_balanced_cut.py b/python/cugraph/tests/test_balanced_cut.py index f0fc7152e56..2492017511a 100644 --- a/python/cugraph/tests/test_balanced_cut.py +++ b/python/cugraph/tests/test_balanced_cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_betweenness_centrality.py b/python/cugraph/tests/test_betweenness_centrality.py index f338e5aa633..29c012e95a2 100755 --- a/python/cugraph/tests/test_betweenness_centrality.py +++ b/python/cugraph/tests/test_betweenness_centrality.py @@ -55,7 +55,7 @@ # Comparison functions # ============================================================================= def calc_betweenness_centrality( - graph_obj_tuple, + graph_file, directed=True, k=None, normalized=False, @@ -68,49 +68,36 @@ def calc_betweenness_centrality( edgevals=False, ): """ Generate both cugraph and networkx betweenness centrality - Parameters ---------- graph_file : string Path to COO Graph representation in .csv format - directed : bool, optional, default=True - k : int or None, optional, default=None int: Number of sources to sample from None: All sources are used to compute - normalized : bool True: Normalize Betweenness Centrality scores False: Scores are left unnormalized - weight : cudf.DataFrame: Not supported as of 06/2020 - endpoints : bool True: Endpoints are included when computing scores False: Endpoints are not considered - seed : int or None, optional, default=None Seed for random sampling of the starting point - result_dtype : numpy.dtype Expected type of the result, either np.float32 or np.float64 - use_k_full : bool When True, if k is None replaces k by the number of sources of the Graph - multi_gpu_batch : bool When True, enable mg batch after constructing the graph - edgevals: bool When True, run the test with a weighted graph; the weights should be ignored during computation. - Returns ------- - sorted_df : cudf.DataFrame Contains 'vertex', 'cu_bc' and 'ref_bc' columns, where 'cu_bc' and 'ref_bc' are the two betweenness centrality scores to compare.
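To make the helper's contract above concrete: it builds the same graph for cugraph and NetworkX and produces one score column per library. A minimal standalone sketch of that comparison (the dataset path is illustrative, and the cugraph output column is assumed to be 'betweenness_centrality'):

```python
import networkx as nx
import cugraph
from cugraph.tests import utils

# Build the same graph twice, once per library (helper from this repo)
G, Gnx = utils.build_cu_and_nx_graphs("../datasets/karate.csv", directed=True)

# cugraph: DataFrame with 'vertex' and 'betweenness_centrality' columns
cu_df = cugraph.betweenness_centrality(G, k=None, normalized=True)
cu_df = cu_df.rename(columns={"betweenness_centrality": "cu_bc"})

# NetworkX: plain dict keyed by vertex
nx_bc = nx.betweenness_centrality(Gnx, k=None, normalized=True)
cu_df["ref_bc"] = [nx_bc[v] for v in cu_df["vertex"].values_host]

# compare_scores then checks cu_bc against ref_bc within an epsilon
```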
@@ -120,7 +107,8 @@ def calc_betweenness_centrality( G = None Gnx = None - G, Gnx = graph_obj_tuple + G, Gnx = utils.build_cu_and_nx_graphs(graph_file, directed=directed, + edgevals=edgevals) assert G is not None and Gnx is not None if multi_gpu_batch: @@ -298,67 +286,46 @@ def prepare_test(): gc.collect() -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -DIRECTED = [pytest.param(d) for d in DIRECTED_GRAPH_OPTIONS] -DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] -DATASETS_UNRENUMBERED = [pytest.param(d) for d in utils.DATASETS_UNRENUMBERED] -WEIGHTED_GRAPH_OPTIONS = [pytest.param(w) for w in WEIGHTED_GRAPH_OPTIONS] - - -small_graph_fixture_params = utils.genFixtureParamsProduct( - (DATASETS_SMALL, "grph"), - (DIRECTED, "dirctd"), - (WEIGHTED_GRAPH_OPTIONS, "wgtd_gph_opts")) - -unrenumbered_graph_fixture_params = utils.genFixtureParamsProduct( - (DATASETS_UNRENUMBERED, "grph"), - (DIRECTED, "dirctd"), - (WEIGHTED_GRAPH_OPTIONS, "wgtd_gph_opts")) - - -@pytest.fixture(scope="module", params=small_graph_fixture_params) -def get_cu_nx_graph_datasets_small(request): - return utils.build_cu_and_nx_graphs(*request.param) - - -@pytest.fixture(scope="module", params=unrenumbered_graph_fixture_params) -def get_cu_nx_graph_datasets_unrenumbered(request): - return utils.build_cu_and_nx_graphs(*request.param) - - # ============================================================================= # Tests # ============================================================================= +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS) @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) +@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_betweenness_centrality( - get_cu_nx_graph_datasets_small, + graph_file, + directed, subset_size, normalized, weight, endpoints, subset_seed, result_dtype, + edgevals ): prepare_test() sorted_df = calc_betweenness_centrality( - get_cu_nx_graph_datasets_small, + graph_file, + directed=directed, normalized=normalized, k=subset_size, weight=weight, endpoints=endpoints, seed=subset_seed, result_dtype=result_dtype, + edgevals=edgevals, ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", [None]) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) @@ -366,8 +333,10 @@ def test_betweenness_centrality( @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) @pytest.mark.parametrize("use_k_full", [True]) +@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_betweenness_centrality_k_full( - get_cu_nx_graph_datasets_small, + graph_file, + directed, subset_size, normalized, weight, @@ -375,12 +344,14 @@ def test_betweenness_centrality_k_full( subset_seed, result_dtype, use_k_full, + edgevals ): """Tests full betweenness centrality by using k = 
G.number_of_vertices() instead of k=None, checks that k scales properly""" prepare_test() sorted_df = calc_betweenness_centrality( - get_cu_nx_graph_datasets_small, + graph_file, + directed=directed, normalized=normalized, k=subset_size, weight=weight, @@ -388,6 +359,7 @@ def test_betweenness_centrality_k_full( seed=subset_seed, result_dtype=result_dtype, use_k_full=use_k_full, + edgevals=edgevals ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") @@ -396,99 +368,118 @@ def test_betweenness_centrality_k_full( # the function performing the comparison inside first samples randomly # over the number of vertices (thus direct offsets) in the graph # structure instead of actual vertex identifiers +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [None]) @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS) @pytest.mark.parametrize("subset_seed", [None]) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) +@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_betweenness_centrality_fixed_sample( - get_cu_nx_graph_datasets_unrenumbered, + graph_file, + directed, subset_size, normalized, weight, endpoints, subset_seed, result_dtype, + edgevals ): """Test Betweenness Centrality using a subset - Only k sources are considered for an approximate Betweenness Centrality """ prepare_test() sorted_df = calc_betweenness_centrality( - get_cu_nx_graph_datasets_unrenumbered, + graph_file, + directed=directed, k=subset_size, normalized=normalized, weight=weight, endpoints=endpoints, seed=subset_seed, result_dtype=result_dtype, + edgevals=edgevals ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("weight", [[]]) @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS) @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) +@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_betweenness_centrality_weight_except( - get_cu_nx_graph_datasets_small, + graph_file, + directed, subset_size, normalized, weight, endpoints, subset_seed, result_dtype, + edgevals ): """Calls betweenness_centrality with weight - As of 05/28/2020, weight is not supported and should raise a NotImplementedError """ prepare_test() with pytest.raises(NotImplementedError): sorted_df = calc_betweenness_centrality( - get_cu_nx_graph_datasets_small, + graph_file, + directed=directed, k=subset_size, normalized=normalized, weight=weight, endpoints=endpoints, seed=subset_seed, result_dtype=result_dtype, + edgevals=edgevals ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("weight", [None]) @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS) @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS)
@pytest.mark.parametrize("result_dtype", [str]) +@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_betweenness_invalid_dtype( - get_cu_nx_graph_datasets_small, + graph_file, + directed, subset_size, normalized, weight, endpoints, subset_seed, result_dtype, + edgevals ): """Test calls edge_betwenness_centrality an invalid type""" prepare_test() with pytest.raises(TypeError): sorted_df = calc_betweenness_centrality( - get_cu_nx_graph_datasets_small, + graph_file, + directed=directed, k=subset_size, normalized=normalized, weight=weight, endpoints=endpoints, seed=subset_seed, result_dtype=result_dtype, + edgevals=edgevals ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") diff --git a/python/cugraph/tests/test_bfs.py b/python/cugraph/tests/test_bfs.py index 0070a34248c..d04ef957104 100644 --- a/python/cugraph/tests/test_bfs.py +++ b/python/cugraph/tests/test_bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_connectivity.py b/python/cugraph/tests/test_connectivity.py index f957c4b417b..194147ab620 100644 --- a/python/cugraph/tests/test_connectivity.py +++ b/python/cugraph/tests/test_connectivity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_convert_matrix.py b/python/cugraph/tests/test_convert_matrix.py index d418dd7ce2e..1dbf51910ea 100644 --- a/python/cugraph/tests/test_convert_matrix.py +++ b/python/cugraph/tests/test_convert_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_core_number.py b/python/cugraph/tests/test_core_number.py index edbc7b0597b..9cfc37ba1c5 100644 --- a/python/cugraph/tests/test_core_number.py +++ b/python/cugraph/tests/test_core_number.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_edge_betweenness_centrality.py b/python/cugraph/tests/test_edge_betweenness_centrality.py index 529b0b9de9c..8c5aad7dc61 100644 --- a/python/cugraph/tests/test_edge_betweenness_centrality.py +++ b/python/cugraph/tests/test_edge_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION.: +# Copyright (c) 2019-2021, NVIDIA CORPORATION.: # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_egonet.py b/python/cugraph/tests/test_egonet.py index 009fd1252f1..b259c2567dc 100644 --- a/python/cugraph/tests/test_egonet.py +++ b/python/cugraph/tests/test_egonet.py @@ -58,29 +58,6 @@ def test_ego_graph_nx(graph_file, seed, radius): @pytest.mark.parametrize("seeds", [[0, 5, 13]]) @pytest.mark.parametrize("radius", [1, 2, 3]) def test_batched_ego_graphs(graph_file, seeds, radius): - """ - Compute the induced subgraph of neighbors for each node in seeds - within a given radius. - Parameters - ---------- - G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix - Graph or matrix object, which should contain the connectivity - information. Edge weights, if present, should be single or double - precision floating point values. - seeds : cudf.Series - Specifies the seeds of the induced egonet subgraphs - radius: integer, optional - Include all neighbors of distance<=radius from n. - - Returns - ------- - ego_edge_lists : cudf.DataFrame - GPU data frame containing all induced sources identifiers, - destination identifiers, edge weights - seeds_offsets: cudf.Series - Series containing the starting offset in the returned edge list - for each seed. - """ gc.collect() # Nx @@ -93,9 +70,8 @@ def test_batched_ego_graphs(graph_file, seeds, radius): df, offsets = cugraph.batched_ego_graphs(Gnx, seeds, radius=radius) for i in range(len(seeds)): ego_nx = nx.ego_graph(Gnx, seeds[i], radius=radius) - ego_df = df[offsets[i]:offsets[i+1]] - ego_cugraph = nx.from_pandas_edgelist(ego_df, - source="src", - target="dst", - edge_attr="weight") + ego_df = df[offsets[i]:offsets[i + 1]] + ego_cugraph = nx.from_pandas_edgelist( + ego_df, source="src", target="dst", edge_attr="weight" + ) assert nx.is_isomorphic(ego_nx, ego_cugraph) diff --git a/python/cugraph/tests/test_filter_unreachable.py b/python/cugraph/tests/test_filter_unreachable.py index 29b862f0285..6c00461d234 100644 --- a/python/cugraph/tests/test_filter_unreachable.py +++ b/python/cugraph/tests/test_filter_unreachable.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index d8d5a504070..348f7e2e130 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_hits.py b/python/cugraph/tests/test_hits.py index 6b6f54937a6..9229f3734f8 100644 --- a/python/cugraph/tests/test_hits.py +++ b/python/cugraph/tests/test_hits.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_hypergraph.py b/python/cugraph/tests/test_hypergraph.py index dbce89905cd..43801be9fdc 100644 --- a/python/cugraph/tests/test_hypergraph.py +++ b/python/cugraph/tests/test_hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -34,12 +34,14 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import cudf -from cudf.tests.utils import assert_eq -import cugraph import datetime as dt + import pandas as pd import pytest +import cudf +from cudf.tests.utils import assert_eq + +import cugraph simple_df = cudf.DataFrame.from_pandas(pd.DataFrame({ diff --git a/python/cugraph/tests/test_k_core.py b/python/cugraph/tests/test_k_core.py index 5e3220dcfb1..33d403ee27b 100644 --- a/python/cugraph/tests/test_k_core.py +++ b/python/cugraph/tests/test_k_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_katz_centrality.py b/python/cugraph/tests/test_katz_centrality.py index a2a03c1518b..1fef6b05d59 100644 --- a/python/cugraph/tests/test_katz_centrality.py +++ b/python/cugraph/tests/test_katz_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_maximum_spanning_tree.py b/python/cugraph/tests/test_maximum_spanning_tree.py index e20e2f72267..311f28bd6f8 100644 --- a/python/cugraph/tests/test_maximum_spanning_tree.py +++ b/python/cugraph/tests/test_maximum_spanning_tree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,16 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest +import numpy as np +import rmm +import cudf import cugraph from cugraph.tests import utils -import rmm -import cudf -import time -import numpy as np + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from diff --git a/python/cugraph/tests/test_minimum_spanning_tree.py b/python/cugraph/tests/test_minimum_spanning_tree.py index 55ebdcfda08..d1588507bce 100644 --- a/python/cugraph/tests/test_minimum_spanning_tree.py +++ b/python/cugraph/tests/test_minimum_spanning_tree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -11,16 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest +import numpy as np +import rmm +import cudf import cugraph from cugraph.tests import utils -import rmm -import cudf -import time -import numpy as np + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from diff --git a/python/cugraph/tests/test_modularity.py b/python/cugraph/tests/test_modularity.py index 7a7d42d1592..c1ff95042ed 100644 --- a/python/cugraph/tests/test_modularity.py +++ b/python/cugraph/tests/test_modularity.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/tests/test_multigraph.py b/python/cugraph/tests/test_multigraph.py index cb659bc7e24..57be3eb34e8 100644 --- a/python/cugraph/tests/test_multigraph.py +++ b/python/cugraph/tests/test_multigraph.py @@ -1,10 +1,25 @@ -import cugraph -import networkx as nx -from cugraph.tests import utils -import pytest +# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import gc + +import pytest +import networkx as nx import numpy as np +import cugraph +from cugraph.tests import utils + # ============================================================================= # Pytest Setup / Teardown - called for each test function diff --git a/python/cugraph/tests/test_nx_convert.py b/python/cugraph/tests/test_nx_convert.py index 08a96a801e2..98cc8a11dc7 100644 --- a/python/cugraph/tests/test_nx_convert.py +++ b/python/cugraph/tests/test_nx_convert.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,11 +12,14 @@ # limitations under the License. import gc + import pytest import cudf + import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for @@ -77,7 +80,6 @@ def test_networkx_compatibility(graph_file): _compare_graphs(nxG, cuG) -# Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_nx_convert(graph_file): gc.collect() diff --git a/python/cugraph/tests/test_overlap.py b/python/cugraph/tests/test_overlap.py index 53d279478f7..a0c336c3f16 100644 --- a/python/cugraph/tests/test_overlap.py +++ b/python/cugraph/tests/test_overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,6 +17,7 @@ import pytest import numpy as np import scipy + import cugraph from cugraph.tests import utils diff --git a/python/cugraph/tests/test_pagerank.py b/python/cugraph/tests/test_pagerank.py index 3ce8dd4ffe9..163b2adb967 100644 --- a/python/cugraph/tests/test_pagerank.py +++ b/python/cugraph/tests/test_pagerank.py @@ -21,6 +21,7 @@ import cugraph from cugraph.tests import utils + # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from # 'collections.abc' is deprecated, and in 3.8 it will stop working) for diff --git a/python/cugraph/tests/test_paths.py b/python/cugraph/tests/test_paths.py index 7467d024051..56cc9b3cd50 100644 --- a/python/cugraph/tests/test_paths.py +++ b/python/cugraph/tests/test_paths.py @@ -1,11 +1,27 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +from tempfile import NamedTemporaryFile + import cudf -import cugraph from cupy.sparse import coo_matrix as cupy_coo_matrix import cupy import networkx as nx import pytest -import sys -from tempfile import NamedTemporaryFile + +import cugraph + CONNECTED_GRAPH = """1,5,3 1,4,1 diff --git a/python/cugraph/tests/test_renumber.py b/python/cugraph/tests/test_renumber.py index 6f88d5f85c4..129bd667621 100644 --- a/python/cugraph/tests/test_renumber.py +++ b/python/cugraph/tests/test_renumber.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -17,8 +17,8 @@ import pandas as pd import pytest - import cudf + from cugraph.structure.number_map import NumberMap from cugraph.tests import utils @@ -44,13 +44,14 @@ def test_renumber_ips(): gdf["source_as_int"] = gdf["source_list"].str.ip2int() gdf["dest_as_int"] = gdf["dest_list"].str.ip2int() - numbering = NumberMap() - numbering.from_series(gdf["source_as_int"], gdf["dest_as_int"]) - src = numbering.to_internal_vertex_id(gdf["source_as_int"]) - dst = numbering.to_internal_vertex_id(gdf["dest_as_int"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + "source_as_int", + "dest_as_int") - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_as_int"]) assert check_dst.equals(gdf["dest_as_int"]) @@ -78,13 +79,14 @@ def test_renumber_ips_cols(): gdf["source_as_int"] = gdf["source_list"].str.ip2int() gdf["dest_as_int"] = gdf["dest_list"].str.ip2int() - numbering = NumberMap() - numbering.from_dataframe(gdf, ["source_as_int"], ["dest_as_int"]) - src = numbering.to_internal_vertex_id(gdf["source_as_int"]) - dst = numbering.to_internal_vertex_id(gdf["dest_as_int"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + ["source_as_int"], + ["dest_as_int"]) - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_as_int"]) assert check_dst.equals(gdf["dest_as_int"]) @@ -110,13 +112,14 @@ def test_renumber_ips_str_cols(): gdf = cudf.from_pandas(pdf) - numbering = NumberMap() - numbering.from_dataframe(gdf, ["source_list"], ["dest_list"]) - src = numbering.to_internal_vertex_id(gdf["source_list"]) - dst = numbering.to_internal_vertex_id(gdf["dest_list"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + ["source_list"], + ["dest_list"]) - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_list"]) assert check_dst.equals(gdf["dest_list"]) @@ -130,13 +133,14 @@ def test_renumber_negative(): gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]]) - numbering = NumberMap() - numbering.from_dataframe(gdf, ["source_list"], ["dest_list"]) - src = numbering.to_internal_vertex_id(gdf["source_list"]) - dst = numbering.to_internal_vertex_id(gdf["dest_list"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + "source_list", + "dest_list") - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_list"]) assert check_dst.equals(gdf["dest_list"]) @@ -150,19 +154,20 @@ def test_renumber_negative_col(): gdf = cudf.DataFrame.from_pandas(df[["source_list", "dest_list"]]) - numbering = NumberMap() -
numbering.from_dataframe(gdf, ["source_list"], ["dest_list"]) - src = numbering.to_internal_vertex_id(gdf["source_list"]) - dst = numbering.to_internal_vertex_id(gdf["dest_list"]) + renumbered_gdf, renumber_map = NumberMap.renumber(gdf, + "source_list", + "dest_list") - check_src = numbering.from_internal_vertex_id(src)["0"] - check_dst = numbering.from_internal_vertex_id(dst)["0"] + check_src = renumber_map.from_internal_vertex_id(renumbered_gdf['src'] + )["0"] + check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] + )["0"] assert check_src.equals(gdf["source_list"]) assert check_dst.equals(gdf["dest_list"]) -# Test all combinations of default/managed and pooled/non-pooled allocation +@pytest.mark.skip(reason="dropped renumbering from series support") @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_renumber_series(graph_file): gc.collect() @@ -215,22 +220,23 @@ def test_renumber_files(graph_file): df["dst"] = cudf.Series([x + translate for x in destinations. values_host]) - numbering = NumberMap() - numbering.from_series(df["src"], df["dst"]) + exp_src = cudf.Series([x + translate for x in sources. + values_host]) + exp_dst = cudf.Series([x + translate for x in destinations. + values_host]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(df, "src_id", ["src"]), - "dst_id", ["dst"] - ) + renumbered_df, renumber_map = NumberMap.renumber(df, "src", "dst", + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_dst["dst"].equals(check_dst["0"]) + assert exp_src.equals(unrenumbered_df["src"]) + assert exp_dst.equals(unrenumbered_df["dst"]) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_renumber_files_col(graph_file): gc.collect() @@ -246,22 +252,23 @@ def test_renumber_files_col(graph_file): gdf['dst'] = cudf.Series([x + translate for x in destinations. values_host]) - numbering = NumberMap() - numbering.from_dataframe(gdf, ["src"], ["dst"]) + exp_src = cudf.Series([x + translate for x in sources. + values_host]) + exp_dst = cudf.Series([x + translate for x in destinations. 
+ values_host]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(gdf, "src_id", ["src"]), - "dst_id", ["dst"] - ) + renumbered_df, renumber_map = NumberMap.renumber(gdf, ["src"], ["dst"], + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_dst["dst"].equals(check_dst["0"]) + assert exp_src.equals(unrenumbered_df["src"]) + assert exp_dst.equals(unrenumbered_df["dst"]) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_renumber_files_multi_col(graph_file): gc.collect() @@ -278,21 +285,17 @@ def test_renumber_files_multi_col(graph_file): gdf["src"] = sources + translate gdf["dst"] = destinations + translate - numbering = NumberMap() - numbering.from_dataframe(gdf, ["src", "src_old"], ["dst", "dst_old"]) - - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id( - gdf, "src_id", ["src", "src_old"] - ), - "dst_id", - ["dst", "dst_old"], - ) + renumbered_df, renumber_map = NumberMap.renumber(gdf, + ["src", "src_old"], + ["dst", "dst_old"], + preserve_order=True) - check_src = numbering.from_internal_vertex_id(renumbered_df, "src_id") - check_dst = numbering.from_internal_vertex_id(renumbered_df, "dst_id") + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=True) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=True) - assert check_src["src"].equals(check_src["0"]) - assert check_src["src_old"].equals(check_src["1"]) - assert check_dst["dst"].equals(check_dst["0"]) - assert check_dst["dst_old"].equals(check_dst["1"]) + assert gdf["src"].equals(unrenumbered_df["0_src"]) + assert gdf["src_old"].equals(unrenumbered_df["1_src"]) + assert gdf["dst"].equals(unrenumbered_df["0_dst"]) + assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) diff --git a/python/cugraph/tests/test_sssp.py b/python/cugraph/tests/test_sssp.py index 0a5347a6290..9230b7a7b96 100644 --- a/python/cugraph/tests/test_sssp.py +++ b/python/cugraph/tests/test_sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/cugraph/tests/test_sssp.py b/python/cugraph/tests/test_sssp.py
index 0a5347a6290..9230b7a7b96 100644
--- a/python/cugraph/tests/test_sssp.py
+++ b/python/cugraph/tests/test_sssp.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -29,6 +29,7 @@
 import cugraph
 from cugraph.tests import utils

+
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
 # 'collections.abc' is deprecated, and in 3.8 it will stop working) for
diff --git a/python/cugraph/tests/test_subgraph_extraction.py b/python/cugraph/tests/test_subgraph_extraction.py
index 9e9eccc4347..56c1c23e0ea 100644
--- a/python/cugraph/tests/test_subgraph_extraction.py
+++ b/python/cugraph/tests/test_subgraph_extraction.py
@@ -20,6 +20,7 @@
 import cugraph
 from cugraph.tests import utils

+
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
 # 'collections.abc' is deprecated, and in 3.8 it will stop working) for
@@ -70,7 +71,6 @@ def nx_call(M, verts, directed=True):
     return nx.subgraph(G, verts)


-# Test all combinations of default/managed and pooled/non-pooled allocation
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 def test_subgraph_extraction_DiGraph(graph_file):
     gc.collect()
@@ -85,9 +85,6 @@ def test_subgraph_extraction_DiGraph(graph_file):
     assert compare_edges(cu_sg, nx_sg)


-# Test all combinations of default/managed and pooled/non-pooled allocation
-
-
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 def test_subgraph_extraction_Graph(graph_file):
     gc.collect()
diff --git a/python/cugraph/tests/test_triangle_count.py b/python/cugraph/tests/test_triangle_count.py
index ff28f55838d..917a4f320a7 100644
--- a/python/cugraph/tests/test_triangle_count.py
+++ b/python/cugraph/tests/test_triangle_count.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -19,6 +19,7 @@
 import cugraph
 from cugraph.tests import utils

+
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
 # 'collections.abc' is deprecated, and in 3.8 it will stop working) for
diff --git a/python/cugraph/tests/test_utils.py b/python/cugraph/tests/test_utils.py
index 55410817f90..55256d6b74e 100644
--- a/python/cugraph/tests/test_utils.py
+++ b/python/cugraph/tests/test_utils.py
@@ -12,10 +12,14 @@
 # limitations under the License.

 import gc
+from pathlib import PurePath
+
 import pytest
+
 import cugraph
+import cudf
 from cugraph.tests import utils
-from pathlib import PurePath
+import numpy as np


 def test_bfs_paths():
@@ -66,3 +70,29 @@ def test_bfs_paths_array():
         answer = cugraph.utils.get_traversed_path_list(df, 100)

     assert "not in the result set" in str(ErrorMsg)
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_get_traversed_cost(graph_file):
+    cu_M = utils.read_csv_file(graph_file)
+
+    noise = cudf.Series(np.random.randint(10, size=(cu_M.shape[0])))
+    cu_M['info'] = cu_M['2'] + noise
+
+    G = cugraph.Graph()
+    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='info')
+
+    # run SSSP starting at vertex 16
+    df = cugraph.sssp(G, 16)
+
+    answer = cugraph.utilities.path_retrieval.get_traversed_cost(df, 16,
+                                                                 cu_M['0'],
+                                                                 cu_M['1'],
+                                                                 cu_M['info']
+                                                                 )
+
+    df = df.sort_values(by='vertex').reset_index()
+    answer = answer.sort_values(by='vertex').reset_index()
+
+    assert df.shape[0] == answer.shape[0]
+    assert np.allclose(df['distance'], answer['info'])
diff --git a/python/cugraph/tests/test_wjaccard.py b/python/cugraph/tests/test_wjaccard.py
index c5cab18484c..9f82857a8d7 100644
--- a/python/cugraph/tests/test_wjaccard.py
+++ b/python/cugraph/tests/test_wjaccard.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -21,6 +21,7 @@
 import cugraph
 from cugraph.tests import utils

+
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
 # 'collections.abc' is deprecated, and in 3.8 it will stop working) for
diff --git a/python/cugraph/tests/test_woverlap.py b/python/cugraph/tests/test_woverlap.py
index e7da21014ba..b6ceda40116 100644
--- a/python/cugraph/tests/test_woverlap.py
+++ b/python/cugraph/tests/test_woverlap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -16,10 +16,11 @@

 import pytest
 import scipy
+import numpy as np
 import cudf
+
 import cugraph
 from cugraph.tests import utils
-import numpy as np


 def cugraph_call(cu_M, pairs):
@@ -83,7 +84,6 @@ def cpu_call(M, first, second):
     return result


-# Test
 @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED)
 def test_woverlap(graph_file):
     gc.collect()
diff --git a/python/cugraph/traversal/__init__.py b/python/cugraph/traversal/__init__.py
index 5944ebe0865..e74266d29fc 100644
--- a/python/cugraph/traversal/__init__.py
+++ b/python/cugraph/traversal/__init__.py
@@ -17,6 +17,8 @@
     sssp,
     shortest_path,
     filter_unreachable,
-    shortest_path_length
+    shortest_path_length,
 )
 from cugraph.traversal.traveling_salesperson import traveling_salesperson
+
+from cugraph.traversal.ms_bfs import concurrent_bfs, multi_source_bfs
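concurrent_bfs and multi_source_bfs are exported here even though, as the new module below shows, both still raise NotImplementedError. For illustration only, the intended call shape sketched from the signatures and docstrings that follow (toy graph invented for the example):

    import cudf
    import cugraph

    # Any small graph works for the sketch
    edges = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0]})
    G = cugraph.Graph()
    G.from_cudf_edgelist(edges, source="src", destination="dst")

    # One BFS per source in the Series; a placeholder until issue 1465 lands
    try:
        df = cugraph.multi_source_bfs(G, cudf.Series([0, 2]))
    except NotImplementedError:
        pass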
diff --git a/python/cugraph/traversal/ms_bfs.py b/python/cugraph/traversal/ms_bfs.py
new file mode 100644
index 00000000000..e4b799e30e4
--- /dev/null
+++ b/python/cugraph/traversal/ms_bfs.py
@@ -0,0 +1,282 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cudf
+
+# from cugraph.structure.graph import Graph, DiGraph
+# from cugraph.utilities.utils import get_device_memory_info
+import warnings
+
+
+def _get_feasibility(G, sources, components=None, depth_limit=None):
+    """
+    Evaluate the feasibility of a breadth first traversal from multiple
+    sources in a graph.
+
+    Parameters
+    ----------
+    G : cugraph.Graph or cugraph.DiGraph
+        The adjacency list will be computed if not already present.
+
+    sources : cudf.Series
+        Subset of vertices from which the traversals start. A BFS is run
+        for each source in the Series.
+        The size of the series should be at least one and cannot exceed
+        the size of the graph.
+
+    depth_limit : Integer, optional, default=None
+        Limit the depth of the search. Terminates if no more vertices are
+        reachable within the distance of depth_limit.
+
+    components : cudf.DataFrame, optional, default=None
+        GPU Dataframe containing the component information.
+        Passing this information may impact the return type.
+        When no component information is passed, BFS uses one-component
+        behavior settings.
+
+        components['vertex'] : cudf.Series
+            vertex IDs
+        components['color'] : cudf.Series
+            component IDs/colors for vertices.
+
+    Returns
+    -------
+    mem_footprint : integer
+        Estimated memory footprint in bytes
+    """
+
+    # FIXME: not implemented in RMM yet,
+    # using a 96GB upper bound for now
+    # mem = get_device_memory_info()
+    mem = 9.6e10
+    n_sources = sources.size
+    V = G.number_of_vertices()
+    E = G.number_of_edges()
+    mean_component_sz = V
+    n_components = 1
+
+    # Retrieve types
+    size_of_v = 4
+    size_of_e = 4
+    size_of_w = 0
+    if G.adjlist.weights is not None:
+        if G.adjlist.weights.dtype == np.float64:
+            size_of_w = 8
+        else:
+            size_of_w = 4
+    if G.adjlist.offsets.dtype == np.int64:
+        size_of_v = 8
+    if G.adjlist.indices.dtype == np.int64:
+        size_of_e = 8
+
+    # Graph size
+    G_sz = E * size_of_e + E * size_of_w + V * size_of_v
+
+    # The impact of the depth limit depends on the sparsity
+    # pattern and diameter. We cannot leverage it without
+    # traversing the full dataset at the moment.
+
+    # dense output
+    output_sz = n_sources * 2 * V * size_of_v
+
+    # sparse output
+    if components is not None:
+        tmp = components["color"].value_counts()
+        n_components = tmp.size
+        if n_sources / n_components > 100:
+            warnings.warn(
+                "A high number of seeds per component results in "
+                "large output."
+            )
+        mean_component_sz = tmp.mean()
+        output_sz = mean_component_sz * n_sources * 2 * size_of_e
+
+    # count an extra 10% for context, handle and temporary allocations
+    mem_footprint = (G_sz + output_sz) * 1.1
+    if mem_footprint > mem:
+        warnings.warn(f"Cannot execute in memory: {mem_footprint} Bytes")
+
+    return mem_footprint
+
+
+def concurrent_bfs(Graphs, sources, depth_limit=None, offload=False):
+    """
+    Find the breadth first traversals of multiple graphs with multiple
+    sources in each graph.
+
+    Parameters
+    ----------
+    Graphs : list of cugraph.Graph or cugraph.DiGraph
+        The adjacency lists will be computed if not already present.
+
+    sources : list of cudf.Series
+        For each graph, subset of vertices from which the traversals start.
+        A BFS is run in Graphs[i] for each source in the Series at
+        sources[i].
+        The size of this list must match the size of the graph list.
+        The size of each Series (ie. the number of sources per graph)
+        is flexible, but cannot exceed the size of the corresponding graph.
+
+    depth_limit : Integer, optional, default=None
+        Limit the depth of the search. Terminates if no more vertices are
+        reachable within the distance of depth_limit.
+
+    offload : boolean, optional, default=False
+        Indicates if output should be written to the disk.
+        When not provided, the algorithm decides if offloading is needed
+        based on the input parameters.
+
+    Returns
+    -------
+    The return type is decided based on the input parameters (size of
+    sources, size of the graph, number of graphs, and offload setting).
+
+    If G is a cugraph.Graph and the output fits in memory:
+        BFS_edge_lists : cudf.DataFrame
+            GPU data frame containing all BFS edges
+        source_offsets : cudf.Series
+            Series containing the starting offset in the returned edge
+            list for each source.
+
+    If offload is True, or if the output does not fit in memory:
+        Writes csv files containing the BFS output to disk.
+    """
+    raise NotImplementedError(
+        "concurrent_bfs is coming soon! Please upvote GitHub issue 1465 "
+        "to help us prioritize"
+    )
+    if not isinstance(Graphs, list):
+        raise TypeError(
+            "Graphs should be a list of cugraph.Graph or cugraph.DiGraph"
+        )
+    if not isinstance(sources, list):
+        raise TypeError("sources should be a list of cudf.Series")
+    if len(Graphs) != len(sources):
+        raise ValueError(
+            "The size of the sources list must match "
+            "the size of the graph list."
+        )
+    if offload is True:
+        raise NotImplementedError(
+            "Offloading is coming soon! Please upvote GitHub issue 1461 "
+            "to help us prioritize"
+        )
+
+    # Consolidate graphs in a single graph and record components
+
+    # Renumber and concatenate sources in a single df
+
+    # Call multi_source_bfs
+    # multi_source_bfs(
+    #     G,
+    #     sources,
+    #     components=components,
+    #     depth_limit=depth_limit,
+    #     offload=offload,
+    # )
+
+
+def multi_source_bfs(
+    G, sources, components=None, depth_limit=None, offload=False
+):
+    """
+    Find the breadth first traversal from multiple sources in a graph.
+
+    Parameters
+    ----------
+    G : cugraph.Graph or cugraph.DiGraph
+        The adjacency list will be computed if not already present.
+
+    sources : cudf.Series
+        Subset of vertices from which the traversals start. A BFS is run
+        for each source in the Series.
+        The size of the series should be at least one and cannot exceed
+        the size of the graph.
+
+    depth_limit : Integer, optional, default=None
+        Limit the depth of the search. Terminates if no more vertices are
+        reachable within the distance of depth_limit.
+
+    components : cudf.DataFrame, optional, default=None
+        GPU Dataframe containing the component information.
+        Passing this information may impact the return type.
+        When no component information is passed, BFS uses one-component
+        behavior settings.
+
+        components['vertex'] : cudf.Series
+            vertex IDs
+        components['color'] : cudf.Series
+            component IDs/colors for vertices.
+
+    offload : boolean, optional, default=False
+        Indicates if output should be written to the disk.
+        When not provided, the algorithm decides if offloading is needed
+        based on the input parameters.
+
+    Returns
+    -------
+    The return value type is decided based on the input parameters (size
+    of sources, size of the graph, number of components, and offload
+    setting).
+
+    If G is a cugraph.Graph, returns:
+        cudf.DataFrame
+            df['vertex'] vertex IDs
+
+            df['distance_<source>'] path distance for each vertex from
+            the starting vertex. One column per source.
+
+            df['predecessor_<source>'] for each i'th position in the
+            column, the vertex ID immediately preceding the vertex at
+            position i in the 'vertex' column. One column per source.
+
+    If G is a cugraph.Graph and component information is present,
+    returns:
+        BFS_edge_lists : cudf.DataFrame
+            GPU data frame containing all BFS edges
+        source_offsets : cudf.Series
+            Series containing the starting offset in the returned edge
+            list for each source.
+
+    If offload is True, or if the output does not fit in memory:
+        Writes csv files containing the BFS output to disk.
+    """
+    raise NotImplementedError(
+        "multi_source_bfs is coming soon! Please upvote GitHub issue 1465 "
+        "to help us prioritize"
+    )
+    # if components is not None:
+    #     null_check(components["vertex"])
+    #     null_check(components["colors"])
+    #
+    # if depth_limit is not None:
+    #     raise NotImplementedError(
+    #         "depth limit implementation of BFS is not currently supported"
+    #     )
+
+    # if offload is True:
+    #     raise NotImplementedError(
+    #         "Offloading is coming soon! Please upvote GitHub issue 1461 "
+    #         "to help us prioritize"
+    #     )
+    if isinstance(sources, list):
+        sources = cudf.Series(sources)
+    if G.renumbered is True:
+        sources = G.lookup_internal_vertex_id(cudf.Series(sources))
+    if not G.adjlist:
+        G.view_adj_list()
+    # Memory footprint check
+    footprint = _get_feasibility(
+        G, sources, components=components, depth_limit=depth_limit
+    )
+    print(footprint)
+    # Call multi_source_bfs
+    # FIXME: remove when implemented
+    # raise NotImplementedError("Coming soon")
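To make _get_feasibility's estimate concrete, here is its dense-output arithmetic on hypothetical sizes (the numbers below are invented for illustration; only the formulas come from the module above):

    # 1M vertices, 16M edges, int32 IDs, float32 weights, 8 sources
    V, E, n_sources = 1_000_000, 16_000_000, 8
    size_of_v = size_of_e = size_of_w = 4

    G_sz = E * size_of_e + E * size_of_w + V * size_of_v  # 132 MB
    output_sz = n_sources * 2 * V * size_of_v   # 64 MB: distance + predecessor per source
    mem_footprint = (G_sz + output_sz) * 1.1    # ~216 MB, well under the 96 GB bound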
diff --git a/python/cugraph/utilities/__init__.py b/python/cugraph/utilities/__init__.py
index 61f5596eee6..38b46b0fe87 100644
--- a/python/cugraph/utilities/__init__.py
+++ b/python/cugraph/utilities/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -25,3 +25,4 @@
     is_cp_matrix_type,
     is_sp_matrix_type,
 )
+from cugraph.utilities.path_retrieval import get_traversed_cost
diff --git a/python/cugraph/utilities/path_retrieval.pxd b/python/cugraph/utilities/path_retrieval.pxd
new file mode 100644
index 00000000000..88f1da8f213
--- /dev/null
+++ b/python/cugraph/utilities/path_retrieval.pxd
@@ -0,0 +1,30 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.structure.graph_primtypes cimport *
+
+cdef extern from "utilities/path_retrieval.hpp" namespace "cugraph":
+
+    cdef void get_traversed_cost[vertex_t, weight_t](const handle_t &handle,
+                                                     const vertex_t *vertices,
+                                                     const vertex_t *preds,
+                                                     const weight_t *info_weights,
+                                                     weight_t *out,
+                                                     vertex_t stop_vertex,
+                                                     vertex_t num_vertices) except +
diff --git a/python/cugraph/utilities/path_retrieval.py b/python/cugraph/utilities/path_retrieval.py
new file mode 100644
index 00000000000..b9baadc2f21
--- /dev/null
+++ b/python/cugraph/utilities/path_retrieval.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import cudf
+
+from cugraph.structure.symmetrize import symmetrize
+from cugraph.structure.number_map import NumberMap
+from cugraph.utilities import path_retrieval_wrapper
+
+
+def get_traversed_cost(df, source, source_col, dest_col, value_col):
+    """
+    Take the DataFrame result from a BFS or SSSP function call and sum
+    the given weights along the path to the starting vertex.
+    The source_col and dest_col identifiers need to match the vertex and
+    predecessor columns of df.
+
+    Parameters
+    ----------
+    df : cudf.DataFrame
+        The dataframe containing the results of a BFS or SSSP call
+    source : int
+        Index of the source vertex.
+    source_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the source index for each edge.
+        Source indices must be an integer type.
+    dest_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the destination index for each edge.
+        Destination indices must be an integer type.
+    value_col : cudf.Series
+        This cudf.Series wraps a gdf_column of size E (E: number of edges).
+        The gdf column contains the values associated with this edge.
+        Weights must be a floating point type.
+
+    Returns
+    -------
+    df : cudf.DataFrame
+        DataFrame containing two columns, 'vertex' and 'info'.
+        Unreachable vertices will have the max value of the weight type.
+ """ + + if 'vertex' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'vertex' column missing") + if 'distance' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'distance' column missing") + if 'predecessor' not in df.columns: + raise ValueError("DataFrame does not appear to be a BFS or " + "SSP result - 'predecessor' column missing") + + src, dst, val = symmetrize(source_col, + dest_col, + value_col) + + symmetrized_df = cudf.DataFrame() + symmetrized_df['source'] = src + symmetrized_df['destination'] = dst + symmetrized_df['weights'] = val + + input_df = df.merge(symmetrized_df, + left_on=['vertex', 'predecessor'], + right_on=['source', 'destination'], + how="left" + ) + + # Set unreachable vertex weights to max float and source vertex weight to 0 + max_val = np.finfo(val.dtype).max + input_df[['weights']] = input_df[['weights']].fillna(max_val) + input_df.loc[input_df['vertex'] == source, 'weights'] = 0 + + # Renumber + renumbered_gdf, renumber_map = NumberMap.renumber(input_df, + ["vertex"], + ["predecessor"], + preserve_order=True) + renumbered_gdf = renumbered_gdf.rename(columns={'src': 'vertex', + 'dst': 'predecessor'}) + stop_vertex = renumber_map.to_internal_vertex_id(cudf.Series(-1)).values[0] + + out_df = path_retrieval_wrapper.get_traversed_cost(renumbered_gdf, + stop_vertex) + + # Unrenumber + out_df['vertex'] = renumber_map.unrenumber(renumbered_gdf, 'vertex', + preserve_order=True)["vertex"] + return out_df diff --git a/python/cugraph/utilities/path_retrieval_wrapper.pyx b/python/cugraph/utilities/path_retrieval_wrapper.pyx new file mode 100644 index 00000000000..98d11ad07df --- /dev/null +++ b/python/cugraph/utilities/path_retrieval_wrapper.pyx @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/cugraph/utilities/path_retrieval_wrapper.pyx b/python/cugraph/utilities/path_retrieval_wrapper.pyx
new file mode 100644
index 00000000000..98d11ad07df
--- /dev/null
+++ b/python/cugraph/utilities/path_retrieval_wrapper.pyx
@@ -0,0 +1,72 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.utilities.path_retrieval cimport get_traversed_cost as c_get_traversed_cost
+from cugraph.structure.graph_primtypes cimport *
+from libc.stdint cimport uintptr_t
+from numba import cuda
+import cudf
+import numpy as np
+
+
+def get_traversed_cost(input_df, stop_vertex):
+    """
+    Call get_traversed_cost
+    """
+    num_verts = input_df.shape[0]
+    vertex_t = input_df.vertex.dtype
+    weight_t = input_df.weights.dtype
+
+    df = cudf.DataFrame()
+    df['vertex'] = input_df['vertex']
+    df['info'] = cudf.Series(np.zeros(num_verts, dtype=weight_t))
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get()
+
+    cdef uintptr_t vertices = <uintptr_t>NULL
+    cdef uintptr_t preds = <uintptr_t>NULL
+    cdef uintptr_t out = <uintptr_t>NULL
+    cdef uintptr_t info_weights = <uintptr_t>NULL
+
+    vertices = input_df['vertex'].__cuda_array_interface__['data'][0]
+    preds = input_df['predecessor'].__cuda_array_interface__['data'][0]
+    info_weights = input_df['weights'].__cuda_array_interface__['data'][0]
+    out = df['info'].__cuda_array_interface__['data'][0]
+
+    if weight_t == np.float32:
+        c_get_traversed_cost(handle_[0],
+                             <int*> vertices,
+                             <int*> preds,
+                             <float*> info_weights,
+                             <float*> out,
+                             <int> stop_vertex,
+                             <int> num_verts)
+    elif weight_t == np.float64:
+        c_get_traversed_cost(handle_[0],
+                             <int*> vertices,
+                             <int*> preds,
+                             <double*> info_weights,
+                             <double*> out,
+                             <int> stop_vertex,
+                             <int> num_verts)
+    else:
+        raise NotImplementedError
+
+    return df
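The wrapper never copies column data: it reads the raw device pointer of each cudf column through __cuda_array_interface__ and hands it to the C++ kernel. The same handle is visible from plain Python; a small sketch (assuming a cudf version where Series exposes the interface directly, as the wrapper above does):

    import cudf

    col = cudf.Series([1.0, 2.0, 3.0], dtype="float32")
    cai = col.__cuda_array_interface__

    ptr = cai["data"][0]      # integer device address of the column buffer
    shape = cai["shape"]      # (3,)
    typestr = cai["typestr"]  # '<f4' for float32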
diff --git a/python/cugraph/utilities/utils.py b/python/cugraph/utilities/utils.py
index 39b789d7f79..adaec0f9e44 100644
--- a/python/cugraph/utilities/utils.py
+++ b/python/cugraph/utilities/utils.py
@@ -26,6 +26,7 @@
     from cupyx.scipy.sparse.coo import coo_matrix as cp_coo_matrix
     from cupyx.scipy.sparse.csr import csr_matrix as cp_csr_matrix
     from cupyx.scipy.sparse.csc import csc_matrix as cp_csc_matrix
+
     CP_MATRIX_TYPES = [cp_coo_matrix, cp_csr_matrix, cp_csc_matrix]
     CP_COMPRESSED_MATRIX_TYPES = [cp_csr_matrix, cp_csc_matrix]
 except ModuleNotFoundError:
@@ -38,6 +39,7 @@
     from scipy.sparse.coo import coo_matrix as sp_coo_matrix
     from scipy.sparse.csr import csr_matrix as sp_csr_matrix
     from scipy.sparse.csc import csc_matrix as sp_csc_matrix
+
     SP_MATRIX_TYPES = [sp_coo_matrix, sp_csr_matrix, sp_csc_matrix]
     SP_COMPRESSED_MATRIX_TYPES = [sp_csr_matrix, sp_csc_matrix]
 except ModuleNotFoundError:
@@ -80,15 +82,21 @@ def get_traversed_path(df, id):
     >>> path = cugraph.utils.get_traversed_path(sssp_df, 32)
     """
-    if 'vertex' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'vertex' column missing")
-    if 'distance' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'distance' column missing")
-    if 'predecessor' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'predecessor' column missing")
+    if "vertex" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'vertex' column missing"
+        )
+    if "distance" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'distance' column missing"
+        )
+    if "predecessor" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'predecessor' column missing"
+        )

     if type(id) != int:
         raise ValueError("The vertex 'id' needs to be an integer")
@@ -96,17 +104,17 @@ def get_traversed_path(df, id):
     # or edited. Therefore we cannot assume that using the vertex ID
     # as an index will work

-    ddf = df[df['vertex'] == id]
+    ddf = df[df["vertex"] == id]
     if len(ddf) == 0:
         raise ValueError("The vertex (", id, " is not in the result set")
-    pred = ddf['predecessor'].iloc[0]
+    pred = ddf["predecessor"].iloc[0]

     answer = []
     answer.append(ddf)

     while pred != -1:
-        ddf = df[df['vertex'] == pred]
-        pred = ddf['predecessor'].iloc[0]
+        ddf = df[df["vertex"] == pred]
+        pred = ddf["predecessor"].iloc[0]
         answer.append(ddf)

     return cudf.concat(answer)
@@ -138,15 +146,21 @@ def get_traversed_path_list(df, id):
     >>> path = cugraph.utils.get_traversed_path_list(sssp_df, 32)
     """
-    if 'vertex' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'vertex' column missing")
-    if 'distance' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'distance' column missing")
-    if 'predecessor' not in df.columns:
-        raise ValueError("DataFrame does not appear to be a BFS or "
-                         "SSP result - 'predecessor' column missing")
+    if "vertex" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'vertex' column missing"
+        )
+    if "distance" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'distance' column missing"
+        )
+    if "predecessor" not in df.columns:
+        raise ValueError(
+            "DataFrame does not appear to be a BFS or "
+            "SSP result - 'predecessor' column missing"
+        )

     if type(id) != int:
         raise ValueError("The vertex 'id' needs to be an integer")
@@ -158,17 +172,17 @@ def get_traversed_path_list(df, id):
     answer = []
     answer.append(id)

-    ddf = df[df['vertex'] == id]
+    ddf = df[df["vertex"] == id]
     if len(ddf) == 0:
         raise ValueError("The vertex (", id, " is not in the result set")
-    pred = ddf['predecessor'].iloc[0]
+    pred = ddf["predecessor"].iloc[0]

     while pred != -1:
         answer.append(pred)
-        ddf = df[df['vertex'] == pred]
-        pred = ddf['predecessor'].iloc[0]
+        ddf = df[df["vertex"] == pred]
+        pred = ddf["predecessor"].iloc[0]

     return answer
@@ -206,6 +220,14 @@ def is_device_version_less_than(min_version=(7, 0)):
     return False


+def get_device_memory_info():
+    """
+    Returns the total amount of global memory on the device in bytes
+    """
+    meminfo = cuda.current_context().get_memory_info()
+    return meminfo[1]
+
+
 # FIXME: if G is a Nx type, the weight attribute is assumed to be "weight", if
 # set. An additional optional parameter for the weight attr name when accepting
 # Nx graphs may be needed. From the Nx docs:
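get_device_memory_info is the helper that the hard-coded 96 GB bound in ms_bfs._get_feasibility above is waiting for (see its FIXME). Minimal usage, assuming a live CUDA context reachable through numba:

    from cugraph.utilities.utils import get_device_memory_info

    total_bytes = get_device_memory_info()  # total global memory of the current device
    print(f"Device memory: {total_bytes / 1e9:.1f} GB")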
@@ -229,29 +251,35 @@ def ensure_cugraph_obj(obj, nx_weight_attr=None, matrix_graph_type=None):

     elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
         return (convert_from_nx(obj, weight=nx_weight_attr), input_type)

-    elif (input_type in CP_MATRIX_TYPES) or \
-         (input_type in SP_MATRIX_TYPES):
+    elif (input_type in CP_MATRIX_TYPES) or (input_type in SP_MATRIX_TYPES):
         if matrix_graph_type is None:
             matrix_graph_type = Graph
         elif matrix_graph_type not in [Graph, DiGraph]:
-            raise TypeError(f"matrix_graph_type must be either a cugraph "
-                            f"Graph or DiGraph, got: {matrix_graph_type}")
-
-        if input_type in (CP_COMPRESSED_MATRIX_TYPES +
-                          SP_COMPRESSED_MATRIX_TYPES):
+            raise TypeError(
+                f"matrix_graph_type must be either a cugraph "
+                f"Graph or DiGraph, got: {matrix_graph_type}"
+            )
+
+        if input_type in (
+            CP_COMPRESSED_MATRIX_TYPES + SP_COMPRESSED_MATRIX_TYPES
+        ):
             coo = obj.tocoo(copy=False)
         else:
             coo = obj

         if input_type in CP_MATRIX_TYPES:
-            df = cudf.DataFrame({"source": cp.ascontiguousarray(coo.row),
-                                 "destination": cp.ascontiguousarray(coo.col),
-                                 "weight": cp.ascontiguousarray(coo.data)})
+            df = cudf.DataFrame(
+                {
+                    "source": cp.ascontiguousarray(coo.row),
+                    "destination": cp.ascontiguousarray(coo.col),
+                    "weight": cp.ascontiguousarray(coo.data),
+                }
+            )
         else:
-            df = cudf.DataFrame({"source": coo.row,
-                                 "destination": coo.col,
-                                 "weight": coo.data})
+            df = cudf.DataFrame(
+                {"source": coo.row, "destination": coo.col, "weight": coo.data}
+            )

         # FIXME:
         # * do a quick check that symmetry is stored explicitly in the cupy
         #   data for sym matrices (ie. for each uv, check vu is there)