diff --git a/README.md b/README.md
index 77377fe2bbc..4bdbcd00280 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ As of Release 0.18 - including 0.18 nightly
| Community | | | |
| | EgoNet | Single-GPU | |
| | Leiden | Single-GPU | |
-| | Louvain | Multi-GPU | |
+| | Louvain | Multi-GPU | [C++ README](cpp/src/community/README.md#Louvain) |
| | Ensemble Clustering for Graphs | Single-GPU | |
| | Spectral-Clustering - Balanced Cut | Single-GPU | |
| | Spectral-Clustering - Modularity | Single-GPU | |
@@ -71,16 +71,16 @@ As of Release 0.18 - including 0.18 nightly
| Linear Assignment| | | |
| | Hungarian | Single-GPU | [README](cpp/src/linear_assignment/README-hungarian.md) |
| Link Analysis| | | |
-| | Pagerank | Multi-GPU | |
-| | Personal Pagerank | Multi-GPU | |
+| | Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Pagerank) |
+| | Personal Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank) |
| | HITS | Single-GPU | leverages Gunrock |
| Link Prediction | | | |
| | Jaccard Similarity | Single-GPU | |
| | Weighted Jaccard Similarity | Single-GPU | |
| | Overlap Similarity | Single-GPU | |
| Traversal | | | |
-| | Breadth First Search (BFS) | Multi-GPU | with cutoff support |
-| | Single Source Shortest Path (SSSP) | Multi-GPU | |
+| | Breadth First Search (BFS) | Multi-GPU | with cutoff support [C++ README](cpp/src/traversal/README.md#BFS) |
+| | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) |
| | Traveling Salesperson Problem (TSP) | Single-GPU | |
| Structure | | | |
| | Renumbering | Single-GPU | multiple columns, any data type |
diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index d69448cda4e..8d12b10a640 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -26,6 +26,7 @@ export GPUCI_CONDA_RETRY_SLEEP=30
# Use Ninja to build
export CMAKE_GENERATOR="Ninja"
+export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
################################################################################
# SETUP - Check environment
@@ -58,18 +59,20 @@ conda config --set ssl_verify False
gpuci_logger "Build conda pkg for libcugraph"
if [ "$BUILD_LIBCUGRAPH" == '1' ]; then
if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
- conda build conda/recipes/libcugraph
+ gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph
else
- conda build --dirty --no-remove-work-dir conda/recipes/libcugraph
+ gpuci_conda_retry build --no-build-id --croot ${CONDA_BLD_DIR} --dirty --no-remove-work-dir conda/recipes/libcugraph
+ mkdir -p ${CONDA_BLD_DIR}/libcugraph/work
+ cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcugraph/work
fi
fi
gpuci_logger "Build conda pkg for cugraph"
if [ "$BUILD_CUGRAPH" == "1" ]; then
if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
- conda build conda/recipes/cugraph --python=$PYTHON
+ gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON
else
- conda build conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON
+ gpuci_conda_retry build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph -c ci/artifacts/cugraph/cpu/.conda-bld/ --dirty --no-remove-work-dir --python=$PYTHON
fi
fi
diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh
index 0fca82216c3..50e4c25b90b 100644
--- a/ci/cpu/upload.sh
+++ b/ci/cpu/upload.sh
@@ -1,4 +1,5 @@
#!/bin/bash
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
#
# Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh
@@ -29,8 +30,8 @@ fi
gpuci_logger "Get conda file output locations"
-export LIBCUGRAPH_FILE=`conda build conda/recipes/libcugraph --output`
-export CUGRAPH_FILE=`conda build conda/recipes/cugraph --python=$PYTHON --output`
+export LIBCUGRAPH_FILE=`conda build --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcugraph --output`
+export CUGRAPH_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cugraph --python=$PYTHON --output`
################################################################################
# UPLOAD - Conda packages
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 7242b4a11f5..30dc7373e15 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -16,6 +16,7 @@ function hasArg {
export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
export CUDA_REL=${CUDA_VERSION%.*}
+export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cugraph/cpu/.conda-bld/
function cleanup {
gpuci_logger "Removing datasets and temp files"
@@ -101,11 +102,11 @@ else
chrpath -d libcugraph.so
patchelf --replace-needed `patchelf --print-needed libcugraph.so | grep faiss` libfaiss.so libcugraph.so
- CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"`
+ CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcugraph*.tar.bz2"`
CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install
echo "Installing $CONDA_FILE"
- conda install -c $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ "$CONDA_FILE"
+ conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE"
echo "Build cugraph..."
$WORKSPACE/build.sh cugraph
diff --git a/ci/test.sh b/ci/test.sh
index 58cbb950f73..31660cd15ec 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -66,13 +66,17 @@ fi
# EXITCODE for the script.
set +e
-echo "C++ gtests for cuGraph..."
-for gt in tests/*_TEST; do
- test_name=$(basename $gt)
- echo "Running gtest $test_name"
- ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
- echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE"
-done
+if (python ${CUGRAPH_ROOT}/ci/utils/is_pascal.py); then
+ echo "WARNING: skipping C++ tests on Pascal GPU arch."
+else
+ echo "C++ gtests for cuGraph..."
+ for gt in tests/*_TEST; do
+ test_name=$(basename $gt)
+ echo "Running gtest $test_name"
+ ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
+ echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE"
+ done
+fi
echo "Python pytest for cuGraph..."
cd ${CUGRAPH_ROOT}/python
diff --git a/ci/utils/is_pascal.py b/ci/utils/is_pascal.py
new file mode 100644
index 00000000000..e55a3153a12
--- /dev/null
+++ b/ci/utils/is_pascal.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+import glob
+
+from numba import cuda
+
+# FIXME: consolidate this code with ci/gpu/notebook_list.py
+
+#
+# Not strictly true... however what we mean is
+# Pascal or earlier
+#
+pascal = False
+
+device = cuda.get_current_device()
+# check for the attribute using both pre and post numba 0.53 names
+cc = getattr(device, 'COMPUTE_CAPABILITY', None) or \
+ getattr(device, 'compute_capability')
+if (cc[0] < 7):
+ pascal = True
+
+# Return zero (success) if pascal is True
+if pascal:
+ sys.exit(0)
+else:
+ sys.exit(1)
diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml
index f26c3dd45d9..a138f5e80df 100644
--- a/conda/environments/cugraph_dev_cuda10.1.yml
+++ b/conda/environments/cugraph_dev_cuda10.1.yml
@@ -18,7 +18,7 @@ dependencies:
- ucx-py=0.19*
- ucx-proc=*=gpu
- scipy
-- networkx
+- networkx>=2.5.1
- python-louvain
- cudatoolkit=10.1
- clang=8.0.1
diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml
index 2848cc49dc7..d53fefc086a 100644
--- a/conda/environments/cugraph_dev_cuda10.2.yml
+++ b/conda/environments/cugraph_dev_cuda10.2.yml
@@ -18,7 +18,7 @@ dependencies:
- ucx-py=0.19*
- ucx-proc=*=gpu
- scipy
-- networkx
+- networkx>=2.5.1
- python-louvain
- cudatoolkit=10.2
- clang=8.0.1
diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml
index 82e8b409d13..771b175aa92 100644
--- a/conda/environments/cugraph_dev_cuda11.0.yml
+++ b/conda/environments/cugraph_dev_cuda11.0.yml
@@ -18,7 +18,7 @@ dependencies:
- ucx-py=0.19*
- ucx-proc=*=gpu
- scipy
-- networkx
+- networkx>=2.5.1
- python-louvain
- cudatoolkit=11.0
- clang=8.0.1
diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml
index e714b61d774..1ef64ddbe72 100644
--- a/conda/recipes/cugraph/meta.yaml
+++ b/conda/recipes/cugraph/meta.yaml
@@ -10,7 +10,7 @@ package:
version: {{ version }}
source:
- path: ../../..
+ git_url: ../../..
build:
number: {{ GIT_DESCRIBE_NUMBER }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 26a8f98e265..0388a76d729 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -18,6 +18,10 @@ cmake_minimum_required(VERSION 3.18...3.18 FATAL_ERROR)
project(CUGRAPH VERSION 0.19.0 LANGUAGES C CXX CUDA)
+# Write the version header
+include(cmake/Modules/Version.cmake)
+write_version()
+
###################################################################################################
# - build type ------------------------------------------------------------------------------------
@@ -112,6 +116,7 @@ set(FAISS_GPU_ARCHS "${FAISS_GPU_ARCHS} -gencode arch=compute_${ptx},code=comput
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas --disable-warnings")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
# Option to enable line info in CUDA device compilation to allow introspection when profiling /
# memchecking
@@ -271,7 +276,7 @@ message("set LIBCUDACXX_INCLUDE_DIR to: ${LIBCUDACXX_INCLUDE_DIR}")
FetchContent_Declare(
cuhornet
GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git
- GIT_TAG 9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad
+ GIT_TAG e58d0ecdbc270fc28867d66c965787a62a7a882c
GIT_SHALLOW true
SOURCE_SUBDIR hornet
)
@@ -297,7 +302,7 @@ else(DEFINED ENV{RAFT_PATH})
FetchContent_Declare(
raft
GIT_REPOSITORY https://github.com/rapidsai/raft.git
- GIT_TAG 6455e05b3889db2b495cf3189b33c2b07bfbebf2
+ GIT_TAG f0cd81fb49638eaddc9bf18998cc894f292bc293
SOURCE_SUBDIR raft
)
@@ -397,6 +402,7 @@ endif(BUILD_STATIC_FAISS)
add_library(cugraph SHARED
src/utilities/spmv_1D.cu
src/utilities/cython.cu
+ src/utilities/path_retrieval.cu
src/structure/graph.cu
src/linear_assignment/hungarian.cu
src/link_analysis/gunrock_hits.cpp
@@ -416,6 +422,7 @@ add_library(cugraph SHARED
src/community/triangles_counting.cu
src/community/extract_subgraph_by_vertex.cu
src/community/egonet.cu
+ src/sampling/random_walks.cu
src/cores/core_number.cu
src/traversal/two_hop_neighbors.cu
src/components/connectivity.cu
@@ -426,6 +433,7 @@ add_library(cugraph SHARED
src/experimental/graph_view.cu
src/experimental/coarsen_graph.cu
src/experimental/renumber_edgelist.cu
+ src/experimental/renumber_utils.cu
src/experimental/relabel.cu
src/experimental/induced_subgraph.cu
src/experimental/bfs.cu
@@ -560,6 +568,9 @@ install(TARGETS cugraph LIBRARY
install(DIRECTORY include/
DESTINATION include/cugraph)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp
+ DESTINATION include/cugraph)
+
install(DIRECTORY ${RAFT_DIR}/cpp/include/raft/
DESTINATION include/cugraph/raft)
###################################################################################################
diff --git a/cpp/cmake/Modules/Version.cmake b/cpp/cmake/Modules/Version.cmake
new file mode 100644
index 00000000000..15046784175
--- /dev/null
+++ b/cpp/cmake/Modules/Version.cmake
@@ -0,0 +1,18 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+
+# Generate version_config.hpp from the version found in CMakeLists.txt
+function(write_version)
+ message(STATUS "CUGRAPH VERSION: ${CUGRAPH_VERSION}")
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_config.hpp.in
+ ${CMAKE_CURRENT_BINARY_DIR}/include/cugraph/version_config.hpp @ONLY)
+endfunction(write_version)
diff --git a/cpp/cmake/version_config.hpp.in b/cpp/cmake/version_config.hpp.in
new file mode 100644
index 00000000000..c669d1b97f3
--- /dev/null
+++ b/cpp/cmake/version_config.hpp.in
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#define CUGRAPH_VERSION_MAJOR @CUGRAPH_VERSION_MAJOR@
+#define CUGRAPH_VERSION_MINOR @CUGRAPH_VERSION_MINOR@
+#define CUGRAPH_VERSION_PATCH @CUGRAPH_VERSION_PATCH@
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index c3a4f3ec985..0b45b799357 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -14,10 +14,14 @@
* limitations under the License.
*/
#pragma once
+
+#include
#include
#include
+
#include
#include
+
#include
namespace cugraph {
@@ -218,7 +222,7 @@ void force_atlas2(GraphCOOView &graph,
* @param[out] route Device array containing the returned route.
*
*/
-float traveling_salesperson(raft::handle_t &handle,
+float traveling_salesperson(raft::handle_t const &handle,
int const *vtx_ptr,
float const *x_pos,
float const *y_pos,
@@ -612,7 +616,7 @@ weight_t hungarian(raft::handle_t const &handle,
*
* @throws cugraph::logic_error when an error occurs.
*
- * @tparam graph_t Type of graph
+ * @tparam graph_view_t Type of graph
*
* @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
* @param[in] graph input graph object (CSR)
@@ -629,13 +633,74 @@ weight_t hungarian(raft::handle_t const &handle,
* 2) modularity of the returned clustering
*
*/
-template
-std::pair louvain(
+template
+std::pair louvain(
raft::handle_t const &handle,
- graph_t const &graph,
- typename graph_t::vertex_type *clustering,
- size_t max_level = 100,
- typename graph_t::weight_type resolution = typename graph_t::weight_type{1});
+ graph_view_t const &graph_view,
+ typename graph_view_t::vertex_type *clustering,
+ size_t max_level = 100,
+ typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1});
+
+/**
+ * @brief Louvain implementation, returning dendrogram
+ *
+ * Compute a clustering of the graph by maximizing modularity
+ *
+ * Computed using the Louvain method described in:
+ *
+ * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
+ * community hierarchies in large networks, J Stat Mech P10008 (2008),
+ * http://arxiv.org/abs/0803.0476
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam graph_view_t Type of graph
+ *
+ * @param[in] handle Library handle (RAFT)
+ * @param[in] graph_view Input graph view object (CSR)
+ * @param[in] max_level (optional) maximum number of levels to run (default 100)
+ * @param[in] resolution (optional) The value of the resolution parameter to use.
+ * Called gamma in the modularity formula, this changes the size
+ * of the communities. Higher resolutions lead to more smaller
+ * communities, lower resolutions lead to fewer larger
+ * communities. (default 1)
+ *
+ * @return a pair containing:
+ * 1) unique pointer to dendrogram
+ * 2) modularity of the returned clustering
+ *
+ */
+template
+std::pair>,
+ typename graph_view_t::weight_type>
+louvain(raft::handle_t const &handle,
+ graph_view_t const &graph_view,
+ size_t max_level = 100,
+ typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1});
+
+/**
+ * @brief Flatten a Dendrogram at a particular level
+ *
+ * A Dendrogram represents a hierarchical clustering/partitioning of
+ * a graph. This function will flatten the hierarchical clustering into
+ * a label for each vertex representing the final cluster/partition to
+ * which it is assigned
+ *
+ * @throws cugraph::logic_error when an error occurs.
+ *
+ * @tparam graph_view_t Type of graph
+ *
+ * @param[in] handle Library handle (RAFT). If a communicator is set in the handle,
+ * @param[in] graph input graph object
+ * @param[in] dendrogram input dendrogram object
+ * @param[out] clustering Pointer to device array where the clustering should be stored
+ *
+ */
+template
+void flatten_dendrogram(raft::handle_t const &handle,
+ graph_view_t const &graph_view,
+ Dendrogram const &dendrogram,
+ typename graph_view_t::vertex_type *clustering);
/**
* @brief Leiden implementation
@@ -1187,5 +1252,33 @@ extract_ego(raft::handle_t const &handle,
vertex_t *source_vertex,
vertex_t n_subgraphs,
vertex_t radius);
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Uniform distribution is assumed for the random engine.
+ *
+ * @tparam graph_t Type of graph/view (typically, graph_view_t).
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph (view) object to generate RW on.
+ * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
+ * @param num_paths = number(paths).
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple, device_vec_t,
+ * device_vec_t> Triplet of coalesced RW paths, with corresponding edge weights for
+ * each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed to
+ * the Python layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes
+ * is represented as a 1D array.
+ */
+template
+std::tuple,
+ rmm::device_uvector,
+ rmm::device_uvector>
+random_walks(raft::handle_t const &handle,
+ graph_t const &graph,
+ typename graph_t::vertex_type const *ptr_d_start,
+ index_t num_paths,
+ index_t max_depth);
} // namespace experimental
} // namespace cugraph
diff --git a/cpp/src/community/dendrogram.cuh b/cpp/include/dendrogram.hpp
similarity index 53%
rename from cpp/src/community/dendrogram.cuh
rename to cpp/include/dendrogram.hpp
index 414f5f3854d..aa0802e80b3 100644
--- a/cpp/src/community/dendrogram.cuh
+++ b/cpp/include/dendrogram.hpp
@@ -15,7 +15,7 @@
*/
#pragma once
-#include
+#include
#include
#include
@@ -25,30 +25,26 @@ namespace cugraph {
template
class Dendrogram {
public:
- void add_level(vertex_t num_verts,
- cudaStream_t stream = 0,
+ void add_level(vertex_t first_index,
+ vertex_t num_verts,
+ cudaStream_t stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
{
- level_ptr_.push_back(
- std::make_unique(num_verts * sizeof(vertex_t), stream, mr));
- level_size_.push_back(num_verts);
+ level_ptr_.push_back(std::make_unique>(num_verts, stream, mr));
+ level_first_index_.push_back(first_index);
}
- size_t current_level() const { return level_size_.size() - 1; }
+ size_t current_level() const { return level_ptr_.size() - 1; }
- size_t num_levels() const { return level_size_.size(); }
+ size_t num_levels() const { return level_ptr_.size(); }
- vertex_t const *get_level_ptr_nocheck(size_t level) const
- {
- return static_cast(level_ptr_[level]->data());
- }
+ vertex_t const *get_level_ptr_nocheck(size_t level) const { return level_ptr_[level]->data(); }
- vertex_t *get_level_ptr_nocheck(size_t level)
- {
- return static_cast(level_ptr_[level]->data());
- }
+ vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); }
- vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; }
+ size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); }
+
+ vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; }
vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); }
@@ -58,11 +54,16 @@ class Dendrogram {
vertex_t *current_level_end() { return current_level_begin() + current_level_size(); }
- vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); }
+ size_t current_level_size() const { return get_level_size_nocheck(current_level()); }
+
+ vertex_t current_level_first_index() const
+ {
+ return get_level_first_index_nocheck(current_level());
+ }
private:
- std::vector level_size_;
- std::vector> level_ptr_;
+ std::vector level_first_index_;
+ std::vector>> level_ptr_;
};
} // namespace cugraph
diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh
index 084d68b8ba4..d79788e59ce 100644
--- a/cpp/include/experimental/detail/graph_utils.cuh
+++ b/cpp/include/experimental/detail/graph_utils.cuh
@@ -56,65 +56,32 @@ rmm::device_uvector compute_major_degrees(
rmm::device_uvector degrees(0, handle.get_stream());
vertex_t max_num_local_degrees{0};
- for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size);
- ++i) {
- auto vertex_partition_idx = partition.is_hypergraph_partitioned()
- ? static_cast(i * row_comm_size + row_comm_rank)
- : static_cast(col_comm_rank * row_comm_size + i);
+ for (int i = 0; i < col_comm_size; ++i) {
+ auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank);
auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx);
max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size);
- if (i == (partition.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank)) {
- degrees.resize(vertex_partition_size, handle.get_stream());
- }
+ if (i == col_comm_rank) { degrees.resize(vertex_partition_size, handle.get_stream()); }
}
local_degrees.resize(max_num_local_degrees, handle.get_stream());
- for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size);
- ++i) {
- auto vertex_partition_idx = partition.is_hypergraph_partitioned()
- ? static_cast(i * row_comm_size + row_comm_rank)
- : static_cast(col_comm_rank * row_comm_size + i);
+ for (int i = 0; i < col_comm_size; ++i) {
+ auto vertex_partition_idx = static_cast(i * row_comm_size + row_comm_rank);
vertex_t major_first{};
vertex_t major_last{};
std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx);
- auto p_offsets =
- partition.is_hypergraph_partitioned()
- ? adj_matrix_partition_offsets[i]
- : adj_matrix_partition_offsets[0] +
- (major_first - partition.get_vertex_partition_first(col_comm_rank * row_comm_size));
+ auto p_offsets = adj_matrix_partition_offsets[i];
thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(major_last - major_first),
local_degrees.data(),
[p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; });
- if (partition.is_hypergraph_partitioned()) {
- col_comm.reduce(local_degrees.data(),
- i == col_comm_rank ? degrees.data() : static_cast(nullptr),
- static_cast(major_last - major_first),
- raft::comms::op_t::SUM,
- i,
- handle.get_stream());
- } else {
- row_comm.reduce(local_degrees.data(),
- i == row_comm_rank ? degrees.data() : static_cast(nullptr),
- static_cast(major_last - major_first),
- raft::comms::op_t::SUM,
- i,
- handle.get_stream());
- }
+ col_comm.reduce(local_degrees.data(),
+ i == col_comm_rank ? degrees.data() : static_cast(nullptr),
+ static_cast(major_last - major_first),
+ raft::comms::op_t::SUM,
+ i,
+ handle.get_stream());
}
- raft::comms::status_t status{};
- if (partition.is_hypergraph_partitioned()) {
- status =
- col_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become
- // out-of-scope once this function returns.
- } else {
- status =
- row_comm.sync_stream(handle.get_stream()); // this is neessary as local_degrees will become
- // out-of-scope once this function returns.
- }
- CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-
return degrees;
}
@@ -170,7 +137,6 @@ struct compute_gpu_id_from_vertex_t {
template
struct compute_gpu_id_from_edge_t {
- bool hypergraph_partitioned{false};
int comm_size{0};
int row_comm_size{0};
int col_comm_size{0};
@@ -180,12 +146,22 @@ struct compute_gpu_id_from_edge_t {
cuco::detail::MurmurHash3_32 hash_func{};
auto major_comm_rank = static_cast(hash_func(major) % comm_size);
auto minor_comm_rank = static_cast(hash_func(minor) % comm_size);
- if (hypergraph_partitioned) {
- return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
- } else {
- return (major_comm_rank - (major_comm_rank % row_comm_size)) +
- (minor_comm_rank / col_comm_size);
- }
+ return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
+ }
+};
+
+template
+struct compute_partition_id_from_edge_t {
+ int comm_size{0};
+ int row_comm_size{0};
+ int col_comm_size{0};
+
+ __device__ int operator()(vertex_t major, vertex_t minor) const
+ {
+ cuco::detail::MurmurHash3_32 hash_func{};
+ auto major_comm_rank = static_cast(hash_func(major) % comm_size);
+ auto minor_comm_rank = static_cast(hash_func(minor) % comm_size);
+ return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size;
}
};
diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp
index 6a10256e6f4..27f766b8593 100644
--- a/cpp/include/experimental/graph.hpp
+++ b/cpp/include/experimental/graph.hpp
@@ -88,12 +88,12 @@ class graph_tget_number_of_vertices(),
this->get_number_of_edges(),
this->get_graph_properties(),
- vertex_partition_segment_offsets_.size() > 0,
+ adj_matrix_partition_segment_offsets_.size() > 0,
false);
}
@@ -105,9 +105,10 @@ class graph_t partition_{};
std::vector
- vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on
- // vertex degree, relevant only if
- // sorted_by_global_degree_within_vertex_partition is true
+ adj_matrix_partition_segment_offsets_{}; // segment offsets within the vertex partition based
+ // on vertex degree, relevant only if
+ // sorted_by_global_degree_within_vertex_partition is
+ // true
};
// single-GPU version
@@ -188,6 +189,20 @@ template
struct invalid_edge_id : invalid_idx {
};
+template
+__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex(
+ vertex_t num_vertices, vertex_t v)
+{
+ return (v >= 0) && (v < num_vertices);
+}
+
+template
+__host__ __device__ std::enable_if_t::value, bool> is_valid_vertex(
+ vertex_t num_vertices, vertex_t v)
+{
+ return v < num_vertices;
+}
+
} // namespace experimental
} // namespace cugraph
diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp
index 7b4bb466b97..b48dc6da136 100644
--- a/cpp/include/experimental/graph_functions.hpp
+++ b/cpp/include/experimental/graph_functions.hpp
@@ -17,13 +17,13 @@
#include
#include
-#include
#include
#include
#include
#include
+#include
namespace cugraph {
namespace experimental {
@@ -40,19 +40,24 @@ namespace experimental {
* or multi-GPU (true).
* @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
* handles to various CUDA libraries) to run graph algorithms.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
* is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
* @return std::tuple, partition_t, vertex_t, edge_t>
* Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -63,10 +68,9 @@ template
std::enable_if_t, partition_t, vertex_t, edge_t>>
renumber_edgelist(raft::handle_t const& handle,
- vertex_t* edgelist_major_vertices /* [INOUT] */,
- vertex_t* edgelist_minor_vertices /* [INOUT] */,
- edge_t num_edgelist_edges,
- bool is_hypergraph_partitioned,
+ std::vector const& edgelist_major_vertices /* [INOUT] */,
+ std::vector const& edgelist_minor_vertices /* [INOUT] */,
+ std::vector const& edgelist_edge_counts,
bool do_expensive_check = false);
/**
@@ -115,19 +119,24 @@ std::enable_if_t> renumber_edgelist(
* the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
* to work (vertices should be pre-shuffled).
* @param num_local_vertices Number of local vertices.
- * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * @param edgelist_major_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge source vertex IDs (if the graph adjacency matrix is stored as
* is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
- * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
- * every (major, minor) pair should return the local GPU ID for this function to work (edges should
- * be pre-shuffled).
- * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
- * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
- * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
- * functor to every (major, minor) pair should return the local GPU ID for this function to work
- * (edges should be pre-shuffled).
- * @param num_edgelist_edges Number of edges in the edgelist.
- * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
- * (this flag will be removed in the future).
+ * IDs are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_minor_vertices Pointers (one pointer per local graph adjacency matrix partition
+ * assigned to this process) to edge destination vertex IDs (if the graph adjacency matrix is stored
+ * as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored). Vertex IDs
+ * are updated in-place ([INOUT] parameter). Edges should be pre-shuffled to their final target
+ * process & matrix partition; i.e. applying the compute_gpu_id_from_edge_t functor to every (major,
+ * minor) pair should return the GPU ID of this process and applying the
+ * compute_partition_id_from_edge_t functor to every (major, minor) pair for a local matrix partition
+ * should return the partition ID of the corresponding matrix partition.
+ * @param edgelist_edge_counts Edge counts (one count per local graph adjacency matrix partition
+ * assigned to this process).
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
* @return std::tuple, partition_t, vertex_t, edge_t>
* Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
@@ -140,10 +149,9 @@ std::enable_if_t const& edgelist_major_vertices /* [INOUT] */,
+ std::vector const& edgelist_minor_vertices /* [INOUT] */,
+ std::vector const& edgelist_edge_counts,
bool do_expensive_check = false);
/**
@@ -181,6 +189,104 @@ std::enable_if_t> renumber_edgelist(
edge_t num_edgelist_edges,
bool do_expensive_check = false);
+/**
+ * @brief Renumber external vertices to internal vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the vertices to be renumbered. The input external vertices are
+ * renumbered to internal vertices in-place.
+ * @param num_vertices Number of vertices to be renumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template
+void renumber_ext_vertices(raft::handle_t const& handle,
+ vertex_t* vertices /* [INOUT] */,
+ size_t num_vertices,
+ vertex_t const* renumber_map_labels,
+ vertex_t local_int_vertex_first,
+ vertex_t local_int_vertex_last,
+ bool do_expensive_check = false);
+
+/**
+ * @brief Unrenumber local internal vertices to external vertices based on the provided @p
+ * renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the local internal vertices to be unrenumbered. Each input element
+ * should be in [@p local_int_vertex_first, @p local_int_vertex_last). The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template
+void unrenumber_local_int_vertices(
+ raft::handle_t const& handle,
+ vertex_t* vertices /* [INOUT] */,
+ size_t num_vertices,
+ vertex_t const* renumber_map_labels /* size = local_int_vertex_last - local_int_vertex_first */,
+ vertex_t local_int_vertex_first,
+ vertex_t local_int_vertex_last,
+ bool do_expensive_check = false);
+
+// FIXME: We may add unrenumber_int_rows(or cols) as this will require communication only within a
+// sub-communicator and potentially be more efficient.
+/**
+ * @brief Unrenumber (possibly non-local) internal vertices to external vertices based on the
+ * provided @p renumber_map_labels.
+ *
+ * Note cugraph::experimental::invalid_id::value remains unchanged.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices Pointer to the internal vertices to be unrenumbered. The input internal vertices
+ * are renumbered to external vertices in-place.
+ * @param num_vertices Number of vertices to be unrenumbered.
+ * @param renumber_map_labels Pointer to the external vertices corresponding to the internal
+ * vertices in the range [@p local_int_vertex_first, @p local_int_vertex_last).
+ * @param local_int_vertex_first The first local internal vertex (inclusive, assigned to this
+ * process in multi-GPU).
+ * @param local_int_vertex_last The last local internal vertex (exclusive, assigned to this process
+ * in multi-GPU).
+ * @param vertex_partition_lasts Last local internal vertices (exclusive, assigned to each process
+ * in multi-GPU).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ */
+template
+void unrenumber_int_vertices(raft::handle_t const& handle,
+ vertex_t* vertices /* [INOUT] */,
+ size_t num_vertices,
+ vertex_t const* renumber_map_labels,
+ vertex_t local_int_vertex_first,
+ vertex_t local_int_vertex_last,
+ std::vector& vertex_partition_lasts,
+ bool do_expensive_check = false);
+
/**
* @brief Compute the coarsened graph.
*
diff --git a/cpp/include/experimental/graph_generator.hpp b/cpp/include/experimental/graph_generator.hpp
index b8495ed7581..bc7337944f3 100644
--- a/cpp/include/experimental/graph_generator.hpp
+++ b/cpp/include/experimental/graph_generator.hpp
@@ -72,7 +72,7 @@ template
std::tuple, rmm::device_uvector> generate_rmat_edgelist(
raft::handle_t const& handle,
size_t scale,
- size_t edge_factor = 16,
+ size_t num_edges,
double a = 0.57,
double b = 0.19,
double c = 0.19,
@@ -80,5 +80,58 @@ std::tuple, rmm::device_uvector> generat
bool clip_and_flip = false,
bool scramble_vertex_ids = false);
+enum class generator_distribution_t { POWER_LAW = 0, UNIFORM };
+
+/**
+ * @brief generate multiple edge lists using the R-mat graph generator.
+ *
+ * This function allows multi-edges and self-loops similar to the Graph 500 reference
+ * implementation.
+ *
+ * @p scramble_vertex_ids needs to be set to `true` to generate a graph conforming to the Graph 500
+ * specification (note that scrambling does not affect cuGraph's graph construction performance, so
+ * this is generally unnecessary). If `edge_factor` is given (e.g. Graph 500), set @p num_edges to
+ * (size_t{1} << @p scale) * `edge_factor`. To generate an undirected graph, set @p b == @p c and @p
+ * clip_and_flip = true. All the resulting edges will be placed in the lower triangular part
+ * (including the diagonal) of the graph adjacency matrix.
+ *
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param n_edgelists Number of edge lists (graphs) to generate
+ * @param min_scale Scale factor to set the minimum number of vertices in the graph.
+ * @param max_scale Scale factor to set the maximum number of vertices in the graph.
+ * @param edge_factor Average number of edges per vertex to generate.
+ * @param size_distribution Distribution of the graph sizes, impacts the scale parameter of the
+ * R-MAT generator
+ * @param edge_distribution Edges distribution for each graph, impacts how R-MAT parameters a,b,c,d,
+ * are set.
+ * @param seed Seed value for the random number generator.
+ * @param clip_and_flip Flag controlling whether to generate edges only in the lower triangular part
+ * (including the diagonal) of the graph adjacency matrix (if set to `true`) or not (if set to
+ * `false`).
+ * @param scramble_vertex_ids Flag controlling whether to scramble vertex ID bits (if set to `true`)
+ * or not (if set to `false`); scrambling vertex ID bits breaks correlation between vertex ID values
+ * and vertex degrees. The scramble code here follows the algorithm in the Graph 500 reference
+ * implementation version 3.0.0.
+ * @return A vector of std::tuple, rmm::device_uvector> of
+ *size @p n_edgelists, each vector element being a tuple of rmm::device_uvector objects for edge
+ *source vertex IDs and edge destination vertex IDs.
+ */
+template
+std::vector, rmm::device_uvector>>
+generate_rmat_edgelists(
+ raft::handle_t const& handle,
+ size_t n_edgelists,
+ size_t min_scale,
+ size_t max_scale,
+ size_t edge_factor = 16,
+ generator_distribution_t size_distribution = generator_distribution_t::POWER_LAW,
+ generator_distribution_t edge_distribution = generator_distribution_t::POWER_LAW,
+ uint64_t seed = 0,
+ bool clip_and_flip = false,
+ bool scramble_vertex_ids = false);
+
} // namespace experimental
} // namespace cugraph
diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index 5d3d09bb087..e9593b70ddb 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -40,32 +40,11 @@ namespace experimental {
*
* We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency
* matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to
- * P linear partitions; each partition has the size close to V / P. We consider two different
- * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based
- * strategy (the latter is for future extension).
- * FIXME: in the future we may use the latter for both as this leads to simpler communication
- * patterns and better control over parallelism vs memory footprint trade-off.
+ * P linear partitions; each partition has the size close to V / P.
*
- * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be
- * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned
- * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V /
- * P_col.
- *
- * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one
- * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size *
- * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c =
- * vertex_partition_offsets[col_comm_size * row_comm_rank], and d =
- * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)].
- *
- * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing
- * edge cuts across groups while balancing the number of vertices in each group. We will also
- * renumber vertices so the vertices in each group are mapped to consecutive integers. Then, there
- * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the
- * transposed 2D graph adjacency matrix) than the off-diagonal partitions. The default strategy does
- * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem,
- * the matrix is first horizontally partitioned to P slabs, then each slab will be further
- * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will
- * be responsible col_comm_size rectangular partitions in this case.
+ * The 2D graph adjacency matrix is first horizontally partitioned to P slabs, then each slab will
+ * be further vertically partitioned to P_row (instead of P_col in the default case) rectangles. One
+ * GPU will be responsible for col_comm_size rectangular partitions.
*
* To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for
* col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i =
@@ -85,13 +64,11 @@ class partition_t {
partition_t() = default;
partition_t(std::vector const& vertex_partition_offsets,
- bool hypergraph_partitioned,
int row_comm_size,
int col_comm_size,
int row_comm_rank,
int col_comm_rank)
: vertex_partition_offsets_(vertex_partition_offsets),
- hypergraph_partitioned_(hypergraph_partitioned),
comm_rank_(col_comm_rank * row_comm_size + row_comm_rank),
row_comm_size_(row_comm_size),
col_comm_size_(col_comm_size),
@@ -159,10 +136,7 @@ class partition_t {
get_vertex_partition_first(vertex_partition_idx);
}
- size_t get_number_of_matrix_partitions() const
- {
- return hypergraph_partitioned_ ? col_comm_size_ : 1;
- }
+ size_t get_number_of_matrix_partitions() const { return col_comm_size_; }
// major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or
// column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored).
@@ -175,16 +149,18 @@ class partition_t {
vertex_t get_matrix_partition_major_first(size_t partition_idx) const
{
- return hypergraph_partitioned_
- ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_]
- : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
+ return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_];
}
vertex_t get_matrix_partition_major_last(size_t partition_idx) const
{
- return hypergraph_partitioned_
- ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1]
- : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
+ return vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1];
+ }
+
+ vertex_t get_matrix_partition_major_size(size_t partition_idx) const
+ {
+ return get_matrix_partition_major_last(partition_idx) -
+ get_matrix_partition_major_first(partition_idx);
}
vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const
@@ -204,24 +180,21 @@ class partition_t {
vertex_t get_matrix_partition_minor_first() const
{
- return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]
- : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_];
+ return vertex_partition_offsets_[col_comm_rank_ * row_comm_size_];
}
vertex_t get_matrix_partition_minor_last() const
{
- return hypergraph_partitioned_
- ? vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]
- : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_];
+ return vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_];
}
- // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph
- // partitioning is applied or not
- bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; }
+ vertex_t get_matrix_partition_minor_size() const
+ {
+ return get_matrix_partition_minor_last() - get_matrix_partition_minor_first();
+ }
private:
std::vector vertex_partition_offsets_{}; // size = P + 1
- bool hypergraph_partitioned_{false};
int comm_rank_{0};
int row_comm_size_{0};
@@ -236,6 +209,7 @@ class partition_t {
struct graph_properties_t {
bool is_symmetric{false};
bool is_multigraph{false};
+ bool is_weighted{false};
};
namespace detail {
@@ -277,6 +251,7 @@ class graph_base_t {
bool is_symmetric() const { return properties_.is_symmetric; }
bool is_multigraph() const { return properties_.is_multigraph; }
+ bool is_weighted() const { return properties_.is_weighted; }
protected:
raft::handle_t const* get_handle_ptr() const { return handle_ptr_; };
@@ -326,7 +301,7 @@ class graph_view_t const& adj_matrix_partition_offsets,
std::vector const& adj_matrix_partition_indices,
std::vector const& adj_matrix_partition_weights,
- std::vector const& vertex_partition_segment_offsets,
+ std::vector const& adj_matrix_partition_segment_offsets,
partition_t const& partition,
vertex_t number_of_vertices,
edge_t number_of_edges,
@@ -334,11 +309,6 @@ class graph_view_t 0; }
-
- // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives
- partition_t get_partition() const { return partition_; }
-
vertex_t get_number_of_local_vertices() const
{
return partition_.get_local_vertex_last() - partition_.get_local_vertex_first();
@@ -421,6 +391,12 @@ class graph_view_t get_local_adj_matrix_partition_segment_offsets(size_t partition_idx) const
+ {
+ return adj_matrix_partition_segment_offsets_.size() > 0
+ ? std::vector(
+ adj_matrix_partition_segment_offsets_.begin() +
+ partition_idx * (detail::num_segments_per_vertex_partition + 1),
+ adj_matrix_partition_segment_offsets_.begin() +
+ (partition_idx + 1) * (detail::num_segments_per_vertex_partition + 1))
+ : std::vector{};
+ }
// FIXME: this function is not part of the public stable API. This function is mainly for pattern
// accelerator implementation. This function is currently public to support the legacy
@@ -504,6 +495,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const;
rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const;
+ edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+ edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+ weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+ weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
private:
std::vector adj_matrix_partition_offsets_{};
std::vector adj_matrix_partition_indices_{};
@@ -513,9 +510,10 @@ class graph_view_t partition_{};
std::vector
- vertex_partition_segment_offsets_{}; // segment offsets within the vertex partition based on
- // vertex degree, relevant only if
- // sorted_by_global_degree_within_vertex_partition is true
+ adj_matrix_partition_segment_offsets_{}; // segment offsets within the vertex partition based
+ // on vertex degree, relevant only if
+ // sorted_by_global_degree_within_vertex_partition is
+ // true
};
// single-GPU version
@@ -549,8 +547,6 @@ class graph_view_tget_number_of_vertices(); }
constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; }
@@ -628,7 +624,12 @@ class graph_view_t get_local_adj_matrix_partition_segment_offsets(
+ size_t adj_matrix_partition_idx) const
+ {
+ assert(adj_matrix_partition_idx == 0);
+ return segment_offsets_.size() > 0 ? segment_offsets_ : std::vector{};
+ }
// FIXME: this function is not part of the public stable API.This function is mainly for pattern
// accelerator implementation. This function is currently public to support the legacy
@@ -654,6 +655,12 @@ class graph_view_t compute_in_weight_sums(raft::handle_t const& handle) const;
rmm::device_uvector compute_out_weight_sums(raft::handle_t const& handle) const;
+ edge_t compute_max_in_degree(raft::handle_t const& handle) const;
+ edge_t compute_max_out_degree(raft::handle_t const& handle) const;
+
+ weight_t compute_max_in_weight_sum(raft::handle_t const& handle) const;
+ weight_t compute_max_out_weight_sum(raft::handle_t const& handle) const;
+
private:
edge_t const* offsets_{nullptr};
vertex_t const* indices_{nullptr};
diff --git a/cpp/src/experimental/include_cuco_static_map.cuh b/cpp/include/experimental/include_cuco_static_map.cuh
similarity index 100%
rename from cpp/src/experimental/include_cuco_static_map.cuh
rename to cpp/include/experimental/include_cuco_static_map.cuh
diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp
index b30159566b5..8ea58546ce1 100644
--- a/cpp/include/graph.hpp
+++ b/cpp/include/graph.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -69,6 +69,10 @@ class GraphViewBase {
edge_t *local_edges;
vertex_t *local_offsets;
+ vertex_t get_number_of_vertices() const { return number_of_vertices; }
+
+ vertex_t get_local_vertex_first() const { return vertex_t{0}; }
+
/**
* @brief Fill the identifiers array with the vertex identifiers.
*
diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh
index b41119e7be6..30d6540bcfe 100644
--- a/cpp/include/matrix_partition_device.cuh
+++ b/cpp/include/matrix_partition_device.cuh
@@ -192,7 +192,7 @@ class matrix_partition_device_t rx_counts(row_comm_size, size_t{0});
- std::vector displacements(row_comm_size, size_t{0});
- for (int i = 0; i < row_comm_size; ++i) {
- rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
- displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1];
- }
- device_allgatherv(row_comm,
- vertex_value_input_first,
- matrix_major_value_output_first,
- rx_counts,
- displacements,
- handle.get_stream());
+ auto& comm = handle.get_comms();
+ auto const comm_rank = comm.get_rank();
+ auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+ auto const row_comm_rank = row_comm.get_rank();
+ auto const row_comm_size = row_comm.get_size();
+ auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+ auto const col_comm_rank = col_comm.get_rank();
+ auto const col_comm_size = col_comm.get_size();
+
+ std::vector rx_counts(col_comm_size, size_t{0});
+ std::vector displacements(col_comm_size, size_t{0});
+ for (int i = 0; i < col_comm_size; ++i) {
+ rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank);
+ displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1];
}
+ device_allgatherv(col_comm,
+ vertex_value_input_first,
+ matrix_major_value_output_first,
+ rx_counts,
+ displacements,
+ handle.get_stream());
} else {
assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed
? graph_view.get_number_of_local_adj_matrix_partition_cols()
@@ -101,80 +97,78 @@ void copy_to_matrix_major(raft::handle_t const& handle,
using vertex_t = typename GraphViewType::vertex_type;
if (GraphViewType::is_multi_gpu) {
- if (graph_view.is_hypergraph_partitioned()) {
- CUGRAPH_FAIL("unimplemented.");
- } else {
- auto& comm = handle.get_comms();
- auto const comm_rank = comm.get_rank();
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_rank = row_comm.get_rank();
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
- auto const col_comm_size = col_comm.get_size();
-
- auto rx_counts =
- host_scalar_allgather(row_comm,
- static_cast(thrust::distance(vertex_first, vertex_last)),
- handle.get_stream());
-
- matrix_partition_device_t matrix_partition(graph_view, 0);
- for (int i = 0; i < row_comm_size; ++i) {
- rmm::device_uvector rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i],
- handle.get_stream());
- auto rx_tmp_buffer = allocate_dataframe_buffer<
- typename std::iterator_traits::value_type>(rx_counts[i],
- handle.get_stream());
- auto rx_value_first = get_dataframe_buffer_begin<
- typename std::iterator_traits::value_type>(rx_tmp_buffer);
+ auto& comm = handle.get_comms();
+ auto const comm_rank = comm.get_rank();
+ auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+ auto const row_comm_rank = row_comm.get_rank();
+ auto const row_comm_size = row_comm.get_size();
+ auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+ auto const col_comm_rank = col_comm.get_rank();
+ auto const col_comm_size = col_comm.get_size();
+
+ auto rx_counts =
+ host_scalar_allgather(col_comm,
+ static_cast(thrust::distance(vertex_first, vertex_last)),
+ handle.get_stream());
+
+ for (int i = 0; i < col_comm_size; ++i) {
+ matrix_partition_device_t matrix_partition(graph_view, i);
+
+ rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i],
+ handle.get_stream());
+ auto rx_tmp_buffer = allocate_dataframe_buffer<
+ typename std::iterator_traits::value_type>(rx_counts[i],
+ handle.get_stream());
+ auto rx_value_first = get_dataframe_buffer_begin<
+ typename std::iterator_traits::value_type>(rx_tmp_buffer);
- if (row_comm_rank == i) {
- vertex_partition_device_t vertex_partition(graph_view);
- auto map_first =
- thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
- return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v);
- });
- // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
- // permutation iterator (and directly gathers to the internal buffer)
- thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- map_first,
- map_first + thrust::distance(vertex_first, vertex_last),
- vertex_value_input_first,
- rx_value_first);
- }
+ if (col_comm_rank == i) {
+ vertex_partition_device_t vertex_partition(graph_view);
+ auto map_first =
+ thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+ return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v);
+ });
+ // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
+ // permutation iterator (and directly gathers to the internal buffer)
+ thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ map_first,
+ map_first + thrust::distance(vertex_first, vertex_last),
+ vertex_value_input_first,
+ rx_value_first);
+ }
- // FIXME: these broadcast operations can be placed between ncclGroupStart() and
- // ncclGroupEnd()
- device_bcast(
- row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
- device_bcast(
- row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
+ // FIXME: these broadcast operations can be placed between ncclGroupStart() and
+ // ncclGroupEnd()
+ device_bcast(
+ col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
+ device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
- if (row_comm_rank == i) {
- auto map_first =
- thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) {
- return matrix_partition.get_major_offset_from_major_nocheck(v);
- });
- // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
- // directly scatters from the internal buffer)
- thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- rx_value_first,
- rx_value_first + rx_counts[i],
- map_first,
- matrix_major_value_output_first);
- } else {
- auto map_first = thrust::make_transform_iterator(
- rx_vertices.begin(), [matrix_partition] __device__(auto v) {
- return matrix_partition.get_major_offset_from_major_nocheck(v);
- });
- // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
- // directly scatters from the internal buffer)
- thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- rx_value_first,
- rx_value_first + rx_counts[i],
- map_first,
- matrix_major_value_output_first);
- }
+ if (col_comm_rank == i) {
+ auto map_first =
+ thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) {
+ return matrix_partition.get_major_offset_from_major_nocheck(v);
+ });
+ // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+ // directly scatters from the internal buffer)
+ thrust::scatter(
+ rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ rx_value_first,
+ rx_value_first + rx_counts[i],
+ map_first,
+ matrix_major_value_output_first + matrix_partition.get_major_value_start_offset());
+ } else {
+ auto map_first = thrust::make_transform_iterator(
+ rx_vertices.begin(), [matrix_partition] __device__(auto v) {
+ return matrix_partition.get_major_offset_from_major_nocheck(v);
+ });
+ // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+ // directly scatters from the internal buffer)
+ thrust::scatter(
+ rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ rx_value_first,
+ rx_value_first + rx_counts[i],
+ map_first,
+ matrix_major_value_output_first + matrix_partition.get_major_value_start_offset());
}
}
} else {
@@ -199,59 +193,27 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
MatrixMinorValueOutputIterator matrix_minor_value_output_first)
{
if (GraphViewType::is_multi_gpu) {
- if (graph_view.is_hypergraph_partitioned()) {
- CUGRAPH_FAIL("unimplemented.");
- } else {
- auto& comm = handle.get_comms();
- auto const comm_rank = comm.get_rank();
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_rank = row_comm.get_rank();
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
- auto const col_comm_size = col_comm.get_size();
-
- // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph
- // partitioning
- auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank;
- auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size;
- // FIXME: this branch may be no longer necessary with NCCL backend
- if (comm_src_rank == comm_rank) {
- assert(comm_dst_rank == comm_rank);
- thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- vertex_value_input_first,
- vertex_value_input_first + graph_view.get_number_of_local_vertices(),
- matrix_minor_value_output_first +
- (graph_view.get_vertex_partition_first(comm_src_rank) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)));
- } else {
- device_sendrecv(
- comm,
- vertex_value_input_first,
- static_cast(graph_view.get_number_of_local_vertices()),
- comm_dst_rank,
- matrix_minor_value_output_first +
- (graph_view.get_vertex_partition_first(comm_src_rank) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)),
- static_cast(graph_view.get_vertex_partition_size(comm_src_rank)),
- comm_src_rank,
- handle.get_stream());
- }
-
- // FIXME: these broadcast operations can be placed between ncclGroupStart() and
- // ncclGroupEnd()
- for (int i = 0; i < col_comm_size; ++i) {
- auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size);
- auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i);
- device_bcast(col_comm,
- matrix_minor_value_output_first + offset,
- matrix_minor_value_output_first + offset,
- count,
- i,
- handle.get_stream());
- }
+ auto& comm = handle.get_comms();
+ auto const comm_rank = comm.get_rank();
+ auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+ auto const row_comm_rank = row_comm.get_rank();
+ auto const row_comm_size = row_comm.get_size();
+ auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+ auto const col_comm_rank = col_comm.get_rank();
+ auto const col_comm_size = col_comm.get_size();
+
+ std::vector<size_t> rx_counts(row_comm_size, size_t{0});
+ std::vector<size_t> displacements(row_comm_size, size_t{0});
+ for (int i = 0; i < row_comm_size; ++i) {
+ rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
+ displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1];
}
+ device_allgatherv(row_comm,
+ vertex_value_input_first,
+ matrix_minor_value_output_first,
+ rx_counts,
+ displacements,
+ handle.get_stream());
} else {
assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed
? graph_view.get_number_of_local_adj_matrix_partition_rows()
@@ -277,143 +239,75 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
using vertex_t = typename GraphViewType::vertex_type;
if (GraphViewType::is_multi_gpu) {
- if (graph_view.is_hypergraph_partitioned()) {
- CUGRAPH_FAIL("unimplemented.");
- } else {
- auto& comm = handle.get_comms();
- auto const comm_rank = comm.get_rank();
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_rank = row_comm.get_rank();
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
- auto const col_comm_size = col_comm.get_size();
-
- // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of
- // hypergraph partitioning is applied or not
- auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank;
- auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size;
- size_t tx_count = thrust::distance(vertex_first, vertex_last);
- size_t rx_count{};
- // FIXME: it seems like raft::isend and raft::irecv do not properly handle the destination (or
- // source) == self case. Need to double check and fix this if this is indeed the case (or RAFT
- // may use ncclSend/ncclRecv instead of UCX for device data).
- if (comm_src_rank == comm_rank) {
- assert(comm_dst_rank == comm_rank);
- rx_count = tx_count;
- } else {
- std::vector count_requests(2);
- comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data());
- comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1);
- comm.waitall(count_requests.size(), count_requests.data());
- }
-
- vertex_partition_device_t vertex_partition(graph_view);
- rmm::device_uvector dst_vertices(rx_count, handle.get_stream());
- auto dst_tmp_buffer = allocate_dataframe_buffer<
- typename std::iterator_traits::value_type>(rx_count,
+ auto& comm = handle.get_comms();
+ auto const comm_rank = comm.get_rank();
+ auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+ auto const row_comm_rank = row_comm.get_rank();
+ auto const row_comm_size = row_comm.get_size();
+ auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+ auto const col_comm_rank = col_comm.get_rank();
+ auto const col_comm_size = col_comm.get_size();
+
+ auto rx_counts =
+ host_scalar_allgather(row_comm,
+ static_cast<size_t>(thrust::distance(vertex_first, vertex_last)),
+ handle.get_stream());
+
+ matrix_partition_device_t<GraphViewType> matrix_partition(graph_view, 0);
+ for (int i = 0; i < row_comm_size; ++i) {
+ rmm::device_uvector<vertex_t> rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i],
+ handle.get_stream());
+ auto rx_tmp_buffer = allocate_dataframe_buffer<
+ typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_counts[i],
handle.get_stream());
- auto dst_value_first = get_dataframe_buffer_begin<
- typename std::iterator_traits::value_type>(dst_tmp_buffer);
- if (comm_src_rank == comm_rank) {
- thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- vertex_first,
- vertex_last,
- dst_vertices.begin());
- auto map_first =
- thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
- return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v);
- });
- thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- map_first,
- map_first + thrust::distance(vertex_first, vertex_last),
- vertex_value_input_first,
- dst_value_first);
- } else {
- auto src_tmp_buffer = allocate_dataframe_buffer<
- typename std::iterator_traits::value_type>(tx_count,
- handle.get_stream());
- auto src_value_first = get_dataframe_buffer_begin<
- typename std::iterator_traits::value_type>(src_tmp_buffer);
+ auto rx_value_first = get_dataframe_buffer_begin<
+ typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_tmp_buffer);
+ if (row_comm_rank == i) {
+ vertex_partition_device_t<GraphViewType> vertex_partition(graph_view);
auto map_first =
thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v);
});
+ // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
+ // permutation iterator (and directly gathers to the internal buffer)
thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
map_first,
map_first + thrust::distance(vertex_first, vertex_last),
vertex_value_input_first,
- src_value_first);
-
- device_sendrecv(
- comm,
- vertex_first,
- tx_count,
- comm_dst_rank,
- dst_vertices.begin(),
- rx_count,
- comm_src_rank,
- handle.get_stream());
-
- device_sendrecv(comm,
- src_value_first,
- tx_count,
- comm_dst_rank,
- dst_value_first,
- rx_count,
- comm_src_rank,
- handle.get_stream());
+ rx_value_first);
}
- // FIXME: now we can clear tx_tmp_buffer
-
- auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream());
-
- matrix_partition_device_t matrix_partition(graph_view, 0);
- for (int i = 0; i < col_comm_size; ++i) {
- rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i],
- handle.get_stream());
- auto rx_tmp_buffer = allocate_dataframe_buffer<
- typename std::iterator_traits::value_type>(rx_counts[i],
- handle.get_stream());
- auto rx_value_first = get_dataframe_buffer_begin<
- typename std::iterator_traits::value_type>(rx_tmp_buffer);
-
- // FIXME: these broadcast operations can be placed between ncclGroupStart() and
- // ncclGroupEnd()
- device_bcast(col_comm,
- dst_vertices.begin(),
- rx_vertices.begin(),
- rx_counts[i],
- i,
- handle.get_stream());
- device_bcast(
- col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
-
- if (col_comm_rank == i) {
- auto map_first = thrust::make_transform_iterator(
- dst_vertices.begin(), [matrix_partition] __device__(auto v) {
- return matrix_partition.get_minor_offset_from_minor_nocheck(v);
- });
-
- thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- dst_value_first,
- dst_value_first + rx_counts[i],
- map_first,
- matrix_minor_value_output_first);
- } else {
- auto map_first = thrust::make_transform_iterator(
- rx_vertices.begin(), [matrix_partition] __device__(auto v) {
- return matrix_partition.get_minor_offset_from_minor_nocheck(v);
- });
+ // FIXME: these broadcast operations can be placed between ncclGroupStart() and
+ // ncclGroupEnd()
+ device_bcast(
+ row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream());
+ device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream());
- thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- rx_value_first,
- rx_value_first + rx_counts[i],
- map_first,
- matrix_minor_value_output_first);
- }
+ if (row_comm_rank == i) {
+ auto map_first =
+ thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) {
+ return matrix_partition.get_minor_offset_from_minor_nocheck(v);
+ });
+ // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+ // directly scatters from the internal buffer)
+ thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ rx_value_first,
+ rx_value_first + rx_counts[i],
+ map_first,
+ matrix_minor_value_output_first);
+ } else {
+ auto map_first = thrust::make_transform_iterator(
+ rx_vertices.begin(), [matrix_partition] __device__(auto v) {
+ return matrix_partition.get_minor_offset_from_minor_nocheck(v);
+ });
+ // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
+ // directly scatters from the internal buffer)
+ thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ rx_value_first,
+ rx_value_first + rx_counts[i],
+ map_first,
+ matrix_minor_value_output_first);
}
}
} else {
diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
index 3059cf95852..6d828dab513 100644
--- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
@@ -42,23 +42,7 @@ namespace experimental {
namespace detail {
-// FIXME: block size requires tuning
-int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 128;
-
-#if 0
-// FIXME: delete this once we verify that the thrust replace in for_all_major_for_all_nbr_low_degree is no slower than the original for loop based imoplementation
-template
-__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs)
-{
- lhs = plus_edge_op_result(lhs, rhs);
-}
-
-template
-__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs)
-{
- atomic_add(&lhs, rhs);
-}
-#endif
+int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 512;
template (tid);
while (idx < static_cast(major_last - major_first)) {
+ auto major_offset = major_start_offset + idx;
vertex_t const* indices{nullptr};
weight_t const* weights{nullptr};
edge_t local_degree{};
- auto major_offset = major_start_offset + idx;
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast(major_offset));
-#if 1
auto transform_op = [&matrix_partition,
&adj_matrix_row_value_input_first,
&adj_matrix_col_value_input_first,
@@ -148,44 +131,6 @@ __global__ void for_all_major_for_all_nbr_low_degree(
atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result);
});
}
-#else
- // FIXME: delete this once we verify that the code above is not slower than this.
- e_op_result_t e_op_result_sum{init}; // relevent only if update_major == true
- for (edge_t i = 0; i < local_degree; ++i) {
- auto minor = indices[i];
- auto weight = weights != nullptr ? weights[i] : weight_t{1.0};
- auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor);
- auto row = GraphViewType::is_adj_matrix_transposed
- ? minor
- : matrix_partition.get_major_from_major_offset_nocheck(major_offset);
- auto col = GraphViewType::is_adj_matrix_transposed
- ? matrix_partition.get_major_from_major_offset_nocheck(major_offset)
- : minor;
- auto row_offset = GraphViewType::is_adj_matrix_transposed
- ? minor_offset
- : static_cast(major_offset);
- auto col_offset = GraphViewType::is_adj_matrix_transposed
- ? static_cast(major_offset)
- : minor_offset;
- auto e_op_result = evaluate_edge_op()
- .compute(row,
- col,
- weight,
- *(adj_matrix_row_value_input_first + row_offset),
- *(adj_matrix_col_value_input_first + col_offset),
- e_op);
- if (update_major) {
- accumulate_edge_op_result(e_op_result_sum, e_op_result);
- } else {
- accumulate_edge_op_result(*(result_value_output_first + minor_offset),
- e_op_result);
- }
- }
- if (update_major) { *(result_value_output_first + idx) = e_op_result_sum; }
-#endif
idx += gridDim.x * blockDim.x;
}
}
@@ -219,14 +164,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree(
auto idx = static_cast(tid / raft::warp_size());
while (idx < static_cast(major_last - major_first)) {
+ auto major_offset = major_start_offset + idx;
vertex_t const* indices{nullptr};
weight_t const* weights{nullptr};
edge_t local_degree{};
- auto major_offset = major_start_offset + idx;
thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset);
auto e_op_result_sum =
lane_id == 0 ? init : e_op_result_t{}; // relevent only if update_major == true
- for (edge_t i = lane_id; i < local_degree; i += raft::warp_size) {
+ for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
auto minor = indices[i];
auto weight = weights != nullptr ? weights[i] : weight_t{1.0};
auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor);
@@ -293,10 +238,10 @@ __global__ void for_all_major_for_all_nbr_high_degree(
auto idx = static_cast(blockIdx.x);
while (idx < static_cast(major_last - major_first)) {
+ auto major_offset = major_start_offset + idx;
vertex_t const* indices{nullptr};
weight_t const* weights{nullptr};
edge_t local_degree{};
- auto major_offset = major_start_offset + idx;
thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset);
auto e_op_result_sum =
threadIdx.x == 0 ? init : e_op_result_t{}; // relevent only if update_major == true
@@ -358,20 +303,11 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
T init,
VertexValueOutputIterator vertex_value_output_first)
{
- using vertex_t = typename GraphViewType::vertex_type;
+ constexpr auto update_major = (in == GraphViewType::is_adj_matrix_transposed);
+ using vertex_t = typename GraphViewType::vertex_type;
static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value);
- auto loop_count = size_t{1};
- if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_size = row_comm.get_size();
- loop_count = graph_view.is_hypergraph_partitioned()
- ? graph_view.get_number_of_local_adj_matrix_partitions()
- : static_cast(row_comm_size);
- }
- auto comm_rank = handle.comms_initialized() ? handle.get_comms().get_rank() : int{0};
-
auto minor_tmp_buffer_size =
(GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed))
? GraphViewType::is_adj_matrix_transposed
@@ -386,10 +322,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
if (GraphViewType::is_multi_gpu) {
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
auto const row_comm_rank = row_comm.get_rank();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
- minor_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{}
- : (col_comm_rank == 0) ? init : T{};
+ minor_init = (row_comm_rank == 0) ? init : T{};
}
if (GraphViewType::is_multi_gpu) {
@@ -407,97 +340,162 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
assert(minor_tmp_buffer_size == 0);
}
- for (size_t i = 0; i < loop_count; ++i) {
- matrix_partition_device_t matrix_partition(
- graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i);
-
- auto major_tmp_buffer_size = vertex_t{0};
- if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
+ for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+ matrix_partition_device_t<GraphViewType> matrix_partition(graph_view, i);
- major_tmp_buffer_size =
- (in == GraphViewType::is_adj_matrix_transposed)
- ? graph_view.is_hypergraph_partitioned()
- ? matrix_partition.get_major_size()
- : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)
- : vertex_t{0};
- }
+ auto major_tmp_buffer_size =
+ GraphViewType::is_multi_gpu && update_major ? matrix_partition.get_major_size() : vertex_t{0};
auto major_tmp_buffer =
allocate_dataframe_buffer(major_tmp_buffer_size, handle.get_stream());
auto major_buffer_first = get_dataframe_buffer_begin(major_tmp_buffer);
auto major_init = T{};
- if (in == GraphViewType::is_adj_matrix_transposed) {
+ if (update_major) {
if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_rank = row_comm.get_rank();
auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
auto const col_comm_rank = col_comm.get_rank();
- major_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{}
- : (row_comm_rank == 0) ? init : T{};
+ major_init = (col_comm_rank == 0) ? init : T{};
} else {
major_init = init;
}
}
- int comm_root_rank = 0;
- if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_rank = row_comm.get_rank();
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
- comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank
- : col_comm_rank * row_comm_size + i;
- }
-
- if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) {
- raft::grid_1d_thread_t update_grid(graph_view.get_vertex_partition_size(comm_root_rank),
+ auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed
+ ? vertex_t{0}
+ : matrix_partition.get_major_value_start_offset();
+ auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed
+ ? matrix_partition.get_major_value_start_offset()
+ : vertex_t{0};
+ auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i);
+ if (segment_offsets.size() > 0) {
+ // FIXME: we may further improve performance by 1) concurrently running kernels on different
+ // segments; 2) individually tuning block sizes for different segments; and 3) adding one more
+ // segment for very high degree vertices and running segmented reduction
+ static_assert(detail::num_segments_per_vertex_partition == 3);
+ if (segment_offsets[1] > 0) {
+ raft::grid_1d_block_t update_grid(segment_offsets[1],
+ detail::copy_v_transform_reduce_nbr_for_all_block_size,
+ handle.get_device_properties().maxGridSize[0]);
+ // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if
+ // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer.
+ if (GraphViewType::is_multi_gpu) {
+ detail::for_all_major_for_all_nbr_high_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first(),
+ matrix_partition.get_major_first() + segment_offsets[1],
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ update_major ? major_buffer_first : minor_buffer_first,
+ e_op,
+ major_init);
+ } else {
+ detail::for_all_major_for_all_nbr_high_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first(),
+ matrix_partition.get_major_first() + segment_offsets[1],
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ vertex_value_output_first,
+ e_op,
+ major_init);
+ }
+ }
+ if (segment_offsets[2] - segment_offsets[1] > 0) {
+ raft::grid_1d_warp_t update_grid(segment_offsets[2] - segment_offsets[1],
detail::copy_v_transform_reduce_nbr_for_all_block_size,
handle.get_device_properties().maxGridSize[0]);
-
- if (GraphViewType::is_multi_gpu) {
- auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
- auto const row_comm_size = row_comm.get_size();
- auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
- auto const col_comm_rank = col_comm.get_rank();
-
- auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed
- ? vertex_t{0}
- : matrix_partition.get_major_value_start_offset();
- auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed
- ? matrix_partition.get_major_value_start_offset()
- : vertex_t{0};
-
- detail::for_all_major_for_all_nbr_low_degree
- <<>>(
- matrix_partition,
- graph_view.get_vertex_partition_first(comm_root_rank),
- graph_view.get_vertex_partition_last(comm_root_rank),
- adj_matrix_row_value_input_first + row_value_input_offset,
- adj_matrix_col_value_input_first + col_value_input_offset,
- (in == GraphViewType::is_adj_matrix_transposed) ? major_buffer_first
- : minor_buffer_first,
- e_op,
- major_init);
- } else {
- detail::for_all_major_for_all_nbr_low_degree
- <<>>(
- matrix_partition,
- graph_view.get_vertex_partition_first(comm_root_rank),
- graph_view.get_vertex_partition_last(comm_root_rank),
- adj_matrix_row_value_input_first,
- adj_matrix_col_value_input_first,
- vertex_value_output_first,
- e_op,
- major_init);
+ // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if
+ // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer.
+ if (GraphViewType::is_multi_gpu) {
+ detail::for_all_major_for_all_nbr_mid_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first() + segment_offsets[1],
+ matrix_partition.get_major_first() + segment_offsets[2],
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ update_major ? major_buffer_first + segment_offsets[1] : minor_buffer_first,
+ e_op,
+ major_init);
+ } else {
+ detail::for_all_major_for_all_nbr_mid_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first() + segment_offsets[1],
+ matrix_partition.get_major_first() + segment_offsets[2],
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ vertex_value_output_first + (update_major ? segment_offsets[1] : vertex_t{0}),
+ e_op,
+ major_init);
+ }
+ }
+ if (segment_offsets[3] - segment_offsets[2] > 0) {
+ raft::grid_1d_thread_t update_grid(segment_offsets[3] - segment_offsets[2],
+ detail::copy_v_transform_reduce_nbr_for_all_block_size,
+ handle.get_device_properties().maxGridSize[0]);
+ // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if
+ // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer.
+ if (GraphViewType::is_multi_gpu) {
+ detail::for_all_major_for_all_nbr_low_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first() + segment_offsets[2],
+ matrix_partition.get_major_last(),
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ update_major ? major_buffer_first + segment_offsets[2] : minor_buffer_first,
+ e_op,
+ major_init);
+ } else {
+ detail::for_all_major_for_all_nbr_low_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first() + segment_offsets[2],
+ matrix_partition.get_major_last(),
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ vertex_value_output_first + (update_major ? segment_offsets[2] : vertex_t{0}),
+ e_op,
+ major_init);
+ }
+ }
+ } else {
+ if (matrix_partition.get_major_size() > 0) {
+ raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(),
+ detail::copy_v_transform_reduce_nbr_for_all_block_size,
+ handle.get_device_properties().maxGridSize[0]);
+ // FIXME: with C++17 we can collapse the if-else statement below with a functor with "if
+ // constexpr" that returns either a multi-GPU output buffer or a single-GPU output buffer.
+ if (GraphViewType::is_multi_gpu) {
+ detail::for_all_major_for_all_nbr_low_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first(),
+ matrix_partition.get_major_last(),
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ update_major ? major_buffer_first : minor_buffer_first,
+ e_op,
+ major_init);
+ } else {
+ detail::for_all_major_for_all_nbr_low_degree
+ <<>>(
+ matrix_partition,
+ matrix_partition.get_major_first(),
+ matrix_partition.get_major_last(),
+ adj_matrix_row_value_input_first + row_value_input_offset,
+ adj_matrix_col_value_input_first + col_value_input_offset,
+ vertex_value_output_first,
+ e_op,
+ major_init);
+ }
}
}
- if (GraphViewType::is_multi_gpu && (in == GraphViewType::is_adj_matrix_transposed)) {
+ if (GraphViewType::is_multi_gpu && update_major) {
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
auto const row_comm_rank = row_comm.get_rank();
auto const row_comm_size = row_comm.get_size();
@@ -505,29 +503,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
auto const col_comm_rank = col_comm.get_rank();
auto const col_comm_size = col_comm.get_size();
- if (graph_view.is_hypergraph_partitioned()) {
- device_reduce(
- col_comm,
- major_buffer_first,
- vertex_value_output_first,
- static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)),
- raft::comms::op_t::SUM,
- i,
- handle.get_stream());
- } else {
- device_reduce(row_comm,
- major_buffer_first,
- vertex_value_output_first,
- static_cast(
- graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)),
- raft::comms::op_t::SUM,
- i,
- handle.get_stream());
- }
+ device_reduce(col_comm,
+ major_buffer_first,
+ vertex_value_output_first,
+ matrix_partition.get_major_size(),
+ raft::comms::op_t::SUM,
+ i,
+ handle.get_stream());
}
}
- if (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) {
+ if (GraphViewType::is_multi_gpu && !update_major) {
auto& comm = handle.get_comms();
auto const comm_rank = comm.get_rank();
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
@@ -537,53 +523,17 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
auto const col_comm_rank = col_comm.get_rank();
auto const col_comm_size = col_comm.get_size();
- if (graph_view.is_hypergraph_partitioned()) {
- CUGRAPH_FAIL("unimplemented.");
- } else {
- for (int i = 0; i < col_comm_size; ++i) {
- auto offset = (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size));
- auto size = static_cast(
- graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i));
- device_reduce(col_comm,
- minor_buffer_first + offset,
- minor_buffer_first + offset,
- size,
- raft::comms::op_t::SUM,
- i,
- handle.get_stream());
- }
-
- // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph
- // partitioning
- auto comm_src_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size;
- auto comm_dst_rank = row_comm_rank * col_comm_size + col_comm_rank;
- // FIXME: this branch may no longer necessary with NCCL backend
- if (comm_src_rank == comm_rank) {
- assert(comm_dst_rank == comm_rank);
- auto offset =
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size);
- auto size = static_cast(
- graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank));
- thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
- minor_buffer_first + offset,
- minor_buffer_first + offset + size,
- vertex_value_output_first);
- } else {
- device_sendrecv(
- comm,
- minor_buffer_first +
- (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) -
- graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)),
- static_cast(
- graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + col_comm_rank)),
- comm_dst_rank,
- vertex_value_output_first,
- static_cast(graph_view.get_vertex_partition_size(comm_rank)),
- comm_src_rank,
- handle.get_stream());
- }
+ for (int i = 0; i < row_comm_size; ++i) {
+ auto offset = (graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) -
+ graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size));
+ device_reduce(row_comm,
+ minor_buffer_first + offset,
+ vertex_value_output_first,
+ static_cast<size_t>(
+ graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)),
+ raft::comms::op_t::SUM,
+ i,
+ handle.get_stream());
}
}
}
diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index 11cf2cb1137..f904c35ef9e 100644
--- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -19,6 +19,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -27,7 +28,7 @@
#include
-#include
+#include
#include
@@ -59,10 +60,10 @@ __global__ void for_all_major_for_all_nbr_low_degree(
auto idx = static_cast<size_t>(tid);
while (idx < static_cast<size_t>(major_last - major_first)) {
+ auto major_offset = major_start_offset + idx;
vertex_t const* indices{nullptr};
weight_t const* weights{nullptr};
edge_t local_degree{};
- auto major_offset = major_start_offset + idx;
thrust::tie(indices, weights, local_degree) =
matrix_partition.get_local_edges(static_cast<vertex_t>(major_offset));
if (local_degree > 0) {
@@ -170,8 +171,8 @@ __global__ void for_all_major_for_all_nbr_low_degree(
*/
template ::value_type,
+ static_assert(std::is_same::value_type,
typename GraphViewType::vertex_type>::value);
+ static_assert(std::is_same::value_type,
+ typename std::iterator_traits::value_type>::value);
static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
using vertex_t = typename GraphViewType::vertex_type;
@@ -206,64 +209,113 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
// 1. build a cuco::static_map object for the k, v pairs.
auto kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
- static_cast<size_t>(static_cast<double>(thrust::distance(map_key_first, map_key_last)) /
- load_factor),
- invalid_vertex_id<vertex_t>::value,
- invalid_vertex_id<value_t>::value);
- auto pair_first = thrust::make_transform_iterator(
- thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)),
- [] __device__(auto val) {
- return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
- });
- kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
-
- // 2. aggregate each vertex out-going edges based on keys and transform-reduce.
-
- auto loop_count = size_t{1};
+ size_t{0}, invalid_vertex_id<vertex_t>::value, invalid_vertex_id<value_t>::value);
if (GraphViewType::is_multi_gpu) {
auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+ auto const row_comm_rank = row_comm.get_rank();
auto const row_comm_size = row_comm.get_size();
- loop_count = graph_view.is_hypergraph_partitioned()
- ? graph_view.get_number_of_local_adj_matrix_partitions()
- : static_cast(row_comm_size);
+
+ auto map_counts =
+ host_scalar_allgather(row_comm,
+ static_cast<size_t>(thrust::distance(map_key_first, map_key_last)),
+ handle.get_stream());
+ std::vector<size_t> map_displacements(row_comm_size, size_t{0});
+ std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1);
+ rmm::device_uvector<vertex_t> map_keys(map_displacements.back() + map_counts.back(),
+ handle.get_stream());
+ auto map_value_buffer =
+ allocate_dataframe_buffer<value_t>(map_keys.size(), handle.get_stream());
+ for (int i = 0; i < row_comm_size; ++i) {
+ device_bcast(row_comm,
+ map_key_first,
+ map_keys.begin() + map_displacements[i],
+ map_counts[i],
+ i,
+ handle.get_stream());
+ device_bcast(row_comm,
+ map_value_first,
+ get_dataframe_buffer_begin(map_value_buffer) + map_displacements[i],
+ map_counts[i],
+ i,
+ handle.get_stream());
+ }
+ // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input &
+ // output pointers
+ thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ map_key_first,
+ map_key_last,
+ map_keys.begin() + map_displacements[row_comm_rank]);
+ thrust::copy(
+ rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+ map_value_first,
+ map_value_first + thrust::distance(map_key_first, map_key_last),
+ get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]);
+
+ handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream
+
+ kv_map_ptr.reset();
+
+ kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+ // FIXME: std::max(..., ...) as a temporary workaround for
+ // https://github.com/NVIDIA/cuCollections/issues/72 and
+ // https://github.com/NVIDIA/cuCollections/issues/73
+ std::max(static_cast<size_t>(static_cast<double>(map_keys.size()) / load_factor),
+ static_cast<size_t>(thrust::distance(map_key_first, map_key_last)) + 1),
+ invalid_vertex_id