Merged (25 commits)
277494a
DOC v0.17 Updates
raydouglass Oct 2, 2020
dc115b1
Merge pull request #1182 from rapidsai/branch-0.16
GPUtester Oct 2, 2020
d78305e
Merge pull request #1183 from rapidsai/branch-0.16
GPUtester Oct 2, 2020
1e33817
Merge pull request #1189 from rapidsai/branch-0.16
GPUtester Oct 5, 2020
7b2495f
Merge pull request #1190 from rapidsai/branch-0.16
GPUtester Oct 5, 2020
ae5d0f2
[REVIEW] BUG fix benchmark notebooks for recent cudf changes (#1192)
BradReesWork Oct 7, 2020
e391bec
Merge pull request #1195 from rapidsai/branch-0.16
GPUtester Oct 7, 2020
b14c458
[REVIEW] BUG Move subcomms init outside of individual algorithm funct…
rlratzel Oct 9, 2020
d927014
Merge pull request #1201 from rapidsai/branch-0.16
GPUtester Oct 9, 2020
5ff3f19
[REVIEW] BUG segfault in python test suite (#1199)
afender Oct 9, 2020
3a3fbc6
[REVIEW] BUG Remove deprecated call to from_gpu_matrix (#1198)
hlinsen Oct 10, 2020
3a90fcd
[Review] Adding CUDA architecture code for sm80 and aarch64 (#1156)
JasonAtNvidia Oct 11, 2020
95be085
[REVIEW] 2D cython/python infrastructure- PAGERANK (#1175)
Iroy30 Oct 12, 2020
b704d13
[REVIEW] 2D cython/python infrastructure- BFS & SSSP (#1177)
Iroy30 Oct 13, 2020
b6fbab8
[REVIEW] BUG Test MNMG pattern accelerator based PageRank, BFS, and S…
seunghwak Oct 13, 2020
d9457af
[REVIEW] FEA MNMG louvain implementation (#1172)
ChuckHastings Oct 14, 2020
715e374
[REVIEW] DOC Adding Nx transition doc and preping for more (#1217)
BradReesWork Oct 14, 2020
d83cff7
[REVIEW] update dask docs (#1223)
Iroy30 Oct 14, 2020
14606fc
[REVIEW] ENH Added min CUDA version check to MG Louvain (#1222)
rlratzel Oct 15, 2020
8fa8322
Merge branch 'branch-0.16' into branch-0.17-merge-0.16
rlratzel Oct 15, 2020
d39036a
Added PR 1226 to CHANGELOG.md
rlratzel Oct 15, 2020
aa1e7e2
Reverting CHANGELOG entry for PR 1226 (resolve auto-merger failures)
rlratzel Oct 15, 2020
d2662e2
Updated cudf APIs for 0.17
rlratzel Oct 15, 2020
992e6b8
Merge pull request #1226 from rlratzel/branch-0.17-merge-0.16
rlratzel Oct 16, 2020
e271470
Merge pull request #34 from rapidsai/branch-0.17
aschaffer Oct 19, 2020
26 changes: 25 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,12 @@
# cuGraph 0.17.0 (Date TBD)

## New Features

## Improvements

## Bug Fixes


# cuGraph 0.16.0 (Date TBD)

## New Features
@@ -9,6 +18,9 @@
- PR #1151 MNMG extension for pattern accelerator based PageRank, Katz Centrality, BFS, and SSSP implementations (C++ part)
- PR #1163 Integrated 2D shuffling and Louvain updates
- PR #1178 Refactored cython graph factory code to scale to additional data types
- PR #1175 Integrated 2D pagerank python/cython infra
- PR #1177 Integrated 2D bfs and sssp python/cython infra
- PR #1172 MNMG Louvain implementation

## Improvements
- PR 1081 MNMG Renumbering - sort partitions by degree
@@ -21,6 +33,7 @@
- PR #1145 Simple edge list generator
- PR #1144 updated documentation and APIs
- PR #1139 MNMG Louvain Python updates, Cython cleanup
- PR #1156 Add aarch64 gencode support
- PR #1149 Parquet read and concat within workers
- PR #1152 graph container cleanup, added arg for instantiating legacy types and switch statements to factory function
- PR #1164 MG symmetrize and conda env updates
@@ -29,6 +42,10 @@
- PR #1165 updated remaining algorithms to be NetworkX compatible
- PR #1176 Update ci/local/README.md
- PR #1184 BLD getting latest tags
- PR #1222 Added min CUDA version check to MG Louvain
- PR #1217 NetworkX Transition doc
- PR #1223 Update mnmg docs


## Bug Fixes
- PR #1131 Show style checker errors with set +e
@@ -39,7 +56,14 @@
- PR #1166 Fix misspelling of function calls in asserts causing debug build to fail
- PR #1180 BLD Adopt RAFT model for cuhornet dependency
- PR #1181 Fix notebook error handling in CI
- PR #1186 BLD Installing raft headers under cugraph
- PR #1199 BUG segfault in python test suite
- PR #1192 Fix benchmark notebooks and documentation issues in graph.py
- PR #1196 Move subcomms init outside of individual algorithm functions
- PR #1198 Remove deprecated call to from_gpu_matrix
- PR #1174 Fix bugs in MNMG pattern accelerators and pattern accelerator based implementations of MNMG PageRank, BFS, and SSSP



# cuGraph 0.15.0 (26 Aug 2020)

14 changes: 6 additions & 8 deletions README.md
@@ -41,7 +41,7 @@ for i in range(len(df_page)):
| | Edge Betweenness Centrality | Single-GPU | |
| Community | | | |
| | Leiden | Single-GPU | |
| | Louvain | Single-GPU | |
| | Louvain | Multiple-GPU | |
| | Ensemble Clustering for Graphs | Single-GPU | |
| | Spectral-Clustering - Balanced Cut | Single-GPU | |
| | Spectral-Clustering - Modularity | Single-GPU | |
@@ -57,16 +57,16 @@ for i in range(len(df_page)):
| Layout | | | |
| | Force Atlas 2 | Single-GPU | |
| Link Analysis| | | |
| | Pagerank | Multiple-GPU | limited to 2 billion vertices |
| | Personal Pagerank | Multiple-GPU | limited to 2 billion vertices |
| | Pagerank | Multiple-GPU | |
| | Personal Pagerank | Single-GPU | |
| | HITS | Single-GPU | leverages Gunrock |
| Link Prediction | | | |
| | Jaccard Similarity | Single-GPU | |
| | Weighted Jaccard Similarity | Single-GPU | |
| | Overlap Similarity | Single-GPU | |
| Traversal | | | |
| | Breadth First Search (BFS) | Multiple-GPU | limited to 2 billion vertices |
| | Single Source Shortest Path (SSSP) | Single-GPU | |
| | Breadth First Search (BFS) | Multiple-GPU | |
| | Single Source Shortest Path (SSSP) | Multiple-GPU | |
| Structure | | | |
| | Renumbering | Single-GPU | Also for multiple columns |
| | Symmetrize | Single-GPU | |
@@ -81,9 +81,7 @@ for i in range(len(df_page)):
## cuGraph Notice
The current version of cuGraph has some limitations:

- Vertex IDs need to be 32-bit integers (that restriction is going away in 0.16)
- Vertex IDs are expected to be contiguous integers starting from 0.
  - If the starting index is not zero, cuGraph will add disconnected vertices to fill in the missing range; (auto-)renumbering fixes this issue.

cuGraph provides the renumber function to mitigate this problem, and it is called automatically by default when data is added to a graph. Input vertex IDs for the renumber function can be any type, can be non-contiguous, can span multiple columns, and can start from an arbitrary number. The renumber function maps the provided input vertex IDs to 32-bit contiguous integers starting from 0. cuGraph still requires the renumbered vertex IDs to be representable in 32-bit integers. These limitations are being addressed and will be fixed soon.
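The mapping described here can be sketched in a few lines of plain Python. This is an illustrative sketch only, not cuGraph's GPU implementation (which works on cudf columns), and the `renumber` helper name is hypothetical:

```python
def renumber(vertex_ids):
    """Map arbitrary, possibly non-contiguous vertex IDs to
    contiguous integers starting from 0, in first-seen order.
    Returns the renumbered IDs and the reverse mapping."""
    mapping = {}
    renumbered = []
    for v in vertex_ids:
        if v not in mapping:
            mapping[v] = len(mapping)
        renumbered.append(mapping[v])
    reverse = {new: old for old, new in mapping.items()}
    return renumbered, reverse

# Non-contiguous 64-bit-ish IDs become dense 0-based IDs.
new_src, back = renumber([105, 42, 900000007, 42])
print(new_src)   # [0, 1, 2, 1]
print(back[2])   # 900000007
```

The reverse mapping is what lets results (e.g. a PageRank score per renumbered vertex) be reported against the caller's original IDs.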

@@ -96,7 +94,7 @@ The amount of memory required is dependent on the graph structure and the analyt

| Size | Recommended GPU Memory |
|-------------------|------------------------|
| 500 million edges | 32 GB |
| 250 million edges | 16 GB |

The use of managed memory for oversubscription can also be used to exceed the above memory limitations. See the recent blog on _Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs_: https://medium.com/rapids-ai/tackling-large-graphs-with-rapids-cugraph-and-unified-virtual-memory-b5b69a065d4
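As a rough sanity check on the table above: a raw COO edge list with 32-bit vertex IDs and 32-bit weights needs about 12 bytes per edge, and the recommended GPU memory is several times that because algorithms allocate intermediate structures. A back-of-the-envelope sketch (the byte sizes are assumptions for illustration, not cuGraph internals):

```python
def edge_list_bytes(num_edges, weighted=True, id_bytes=4, weight_bytes=4):
    """Lower bound for a COO edge list: src + dst IDs per edge,
    plus an optional weight. Real workloads need a multiple of
    this for intermediate results."""
    per_edge = 2 * id_bytes + (weight_bytes if weighted else 0)
    return num_edges * per_edge

gib = edge_list_bytes(500_000_000) / 2**30
print(round(gib, 2))  # 5.59 -- raw edge list alone, vs. 32 GB recommended
```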
2 changes: 2 additions & 0 deletions ci/gpu/build.sh
@@ -57,7 +57,9 @@ source activate rapids

logger "conda install required packages"
conda install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaults \
"libcudf=${MINOR_VERSION}" \
"cudf=${MINOR_VERSION}" \
"librmm=${MINOR_VERSION}" \
"rmm=${MINOR_VERSION}" \
"cudatoolkit=$CUDA_REL" \
"dask-cudf=${MINOR_VERSION}" \
16 changes: 8 additions & 8 deletions conda/environments/cugraph_dev_cuda10.1.yml
@@ -5,16 +5,16 @@ channels:
- rapidsai-nightly
- conda-forge
dependencies:
- cudf=0.16.*
- libcudf=0.16.*
- rmm=0.16.*
- librmm=0.16.*
- cudf=0.17.*
- libcudf=0.17.*
- rmm=0.17.*
- librmm=0.17.*
- dask>=2.12.0
- distributed>=2.12.0
- dask-cuda=0.16*
- dask-cudf=0.16*
- nccl>=2.5
- ucx-py=0.16*
- dask-cuda=0.17*
- dask-cudf=0.17*
- nccl>=2.7
- ucx-py=0.17*
- scipy
- networkx
- python-louvain
16 changes: 8 additions & 8 deletions conda/environments/cugraph_dev_cuda10.2.yml
@@ -5,16 +5,16 @@ channels:
- rapidsai-nightly
- conda-forge
dependencies:
- cudf=0.16.*
- libcudf=0.16.*
- rmm=0.16.*
- librmm=0.16.*
- cudf=0.17.*
- libcudf=0.17.*
- rmm=0.17.*
- librmm=0.17.*
- dask>=2.12.0
- distributed>=2.12.0
- dask-cuda=0.16*
- dask-cudf=0.16*
- nccl>=2.5
- ucx-py=0.16*
- dask-cuda=0.17*
- dask-cudf=0.17*
- nccl>=2.7
- ucx-py=0.17*
- scipy
- networkx
- python-louvain
16 changes: 8 additions & 8 deletions conda/environments/cugraph_dev_cuda11.0.yml
@@ -5,16 +5,16 @@ channels:
- rapidsai-nightly
- conda-forge
dependencies:
- cudf=0.16.*
- libcudf=0.16.*
- rmm=0.16.*
- librmm=0.16.*
- cudf=0.17.*
- libcudf=0.17.*
- rmm=0.17.*
- librmm=0.17.*
- dask>=2.12.0
- distributed>=2.12.0
- dask-cuda=0.16*
- dask-cudf=0.16*
- nccl>=2.5
- ucx-py=0.16*
- dask-cuda=0.17*
- dask-cudf=0.17*
- nccl>=2.7
- ucx-py=0.17*
- scipy
- networkx
- python-louvain
2 changes: 1 addition & 1 deletion conda/recipes/cugraph/meta.yaml
@@ -36,7 +36,7 @@ requirements:
- dask-cuda {{ minor_version }}
- dask>=2.12.0
- distributed>=2.12.0
- nccl>=2.5
- nccl>=2.7
- ucx-py {{ minor_version }}

#test:
4 changes: 2 additions & 2 deletions conda/recipes/libcugraph/meta.yaml
@@ -29,12 +29,12 @@ requirements:
- cudatoolkit {{ cuda_version }}.*
- boost-cpp>=1.66
- libcypher-parser
- nccl>=2.5
- nccl>=2.7
- ucx-py {{ minor_version }}
run:
- libcudf={{ minor_version }}
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
- nccl>=2.5
- nccl>=2.7
- ucx-py {{ minor_version }}

#test:
102 changes: 88 additions & 14 deletions cpp/CMakeLists.txt
@@ -16,7 +16,7 @@

cmake_minimum_required(VERSION 3.12 FATAL_ERROR)

project(CUGRAPH VERSION 0.16.0 LANGUAGES C CXX CUDA)
project(CUGRAPH VERSION 0.17.0 LANGUAGES C CXX CUDA)

###################################################################################################
# - build type ------------------------------------------------------------------------------------
@@ -48,14 +48,52 @@ if(CMAKE_COMPILER_IS_GNUCXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations")
endif(CMAKE_COMPILER_IS_GNUCXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70")

find_package(CUDA)
if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10))
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75")

# Check for aarch64 vs workstation architectures
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
message(STATUS "CMAKE Detected aarch64 CPU architecture, selecting appropriate gencodes")
# This is being built for Linux4Tegra or SBSA ARM64 CUDA
set(GPU_ARCHS "62") # Default minimum CUDA GenCode - not supported by gunrock
if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
set(GPU_ARCHS "${GPU_ARCHS};72")
set(GUNROCK_GENCODE "-DGUNROCK_GENCODE_SM72=TRUE")
endif()
if(CUDA_VERSION_MAJOR GREATER_EQUAL 11)
# This is probably for SBSA CUDA, or a next gen Jetson
set(GPU_ARCHS "${GPU_ARCHS};75;80")
set(GUNROCK_GENCODE "${GUNROCK_GENCODE} -DGUNROCK_GENCODE_SM75=TRUE -DGUNROCK_GENCODE_SM80=TRUE ")
endif()

else()
message(STATUS "CMAKE selecting appropriate gencodes for x86 or ppc64 CPU architectures")
# System architecture was not aarch64,
# this is datacenter or workstation class hardware
set(GPU_ARCHS "60") # Default minimum supported CUDA gencode
set(GUNROCK_GENCODE "-DGUNROCK_GENCODE_SM60=TRUE")
if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
set(GPU_ARCHS "${GPU_ARCHS};70")
set(GUNROCK_GENCODE "${GUNROCK_GENCODE} -DGUNROCK_GENCODE_SM70=TRUE")
endif()
if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
set(GPU_ARCHS "${GPU_ARCHS};75")
set(GUNROCK_GENCODE "${GUNROCK_GENCODE} -DGUNROCK_GENCODE_SM75=TRUE")
endif()
if(CUDA_VERSION_MAJOR GREATER_EQUAL 11)
set(GPU_ARCHS "${GPU_ARCHS};80")
set(GUNROCK_GENCODE "${GUNROCK_GENCODE} -DGUNROCK_GENCODE_SM80=TRUE")
endif()

endif()

message("-- Building for GPU_ARCHS = ${GPU_ARCHS}")
foreach(arch ${GPU_ARCHS})
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}")
endforeach()

list(GET GPU_ARCHS -1 ptx)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}")
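The selection logic above can be mirrored in a short sketch: pick SM architectures by platform and CUDA toolkit major version, emit SASS for each, then add PTX for the newest so future GPUs can JIT-compile. The function names are illustrative; the arch lists simply restate the CMake branches:

```python
def select_gpu_archs(is_aarch64, cuda_major):
    """Mirror of the CMake branches: arch list depends on CPU
    platform and CUDA toolkit major version."""
    if is_aarch64:
        archs = [62]                      # Jetson baseline (not supported by gunrock)
        if cuda_major >= 9:
            archs.append(72)
        if cuda_major >= 11:
            archs += [75, 80]             # SBSA or next-gen Jetson
    else:
        archs = [60]                      # datacenter/workstation baseline
        if cuda_major >= 9:
            archs.append(70)
        if cuda_major >= 10:
            archs.append(75)
        if cuda_major >= 11:
            archs.append(80)
    return archs

def gencode_flags(archs):
    """SASS for every arch, plus PTX (code=compute_N) for the
    newest arch so unknown future GPUs can JIT."""
    flags = [f"-gencode arch=compute_{a},code=sm_{a}" for a in archs]
    flags.append(f"-gencode arch=compute_{archs[-1]},code=compute_{archs[-1]}")
    return " ".join(flags)

print(select_gpu_archs(False, 11))  # [60, 70, 75, 80]
```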

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas --disable-warnings")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable")
@@ -172,6 +210,45 @@ if(NOT thrust_POPULATED)
endif()
set(THRUST_INCLUDE_DIR "${thrust_SOURCE_DIR}")

# - cuco
message("Fetching cuco")

FetchContent_Declare(
cuco
GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
GIT_TAG 729d07db2e544e173efefdd168db21f7b8adcfaf
GIT_SHALLOW true
)

FetchContent_GetProperties(cuco)
if(NOT cuco_POPULATED)
FetchContent_Populate(cuco)
endif()
set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include")

# - libcudacxx
# NOTE: This is necessary because libcudacxx is not supported in
# debian cuda 10.2 packages. Once 10.2 is deprecated
# we should not need this any longer.
message("Fetching libcudacxx")

FetchContent_Declare(
libcudacxx
GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git
GIT_TAG 1.3.0
GIT_SHALLOW true
)

FetchContent_GetProperties(libcudacxx)
if(NOT libcudacxx_POPULATED)
message("populating libcudacxx")
FetchContent_Populate(libcudacxx)
endif()
set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include")
message("set LIBCUDACXX_INCLUDE_DIR to: ${LIBCUDACXX_INCLUDE_DIR}")



###################################################################################################
# - External Projects -----------------------------------------------------------------------------

@@ -198,18 +275,13 @@ set(CUGUNROCK_DIR ${CMAKE_CURRENT_BINARY_DIR}/cugunrock CACHE STRING

ExternalProject_Add(cugunrock
GIT_REPOSITORY https://github.com/rapidsai/cugunrock.git
GIT_TAG main
GIT_TAG 0b92fae6ee9026188a811b4d08915779e7c97178
PREFIX ${CUGUNROCK_DIR}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-DGPU_ARCHS=""
-DGUNROCK_BUILD_SHARED_LIBS=OFF
-DGUNROCK_BUILD_TESTS=OFF
-DCUDA_AUTODETECT_GENCODE=FALSE
-DGUNROCK_GENCODE_SM60=TRUE
-DGUNROCK_GENCODE_SM61=TRUE
-DGUNROCK_GENCODE_SM70=TRUE
-DGUNROCK_GENCODE_SM72=TRUE
-DGUNROCK_GENCODE_SM75=TRUE
${GUNROCK_GENCODE}
BUILD_BYPRODUCTS ${CUGUNROCK_DIR}/lib/libgunrock.a
)

@@ -250,7 +322,7 @@ else(DEFINED ENV{RAFT_PATH})

ExternalProject_Add(raft
GIT_REPOSITORY https://github.com/rapidsai/raft.git
GIT_TAG 53c1e2dde4045f386f9cc4bb7d3dc99d5690b886
GIT_TAG 515ed005aebc2276d52308516e623a4ab0b5e82c
PREFIX ${RAFT_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@@ -321,6 +393,8 @@ add_dependencies(cugraph raft)
target_include_directories(cugraph
PRIVATE
"${THRUST_INCLUDE_DIR}"
"${CUCO_INCLUDE_DIR}"
"${LIBCUDACXX_INCLUDE_DIR}"
"${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
"${LIBCYPHERPARSER_INCLUDE}"
"${Boost_INCLUDE_DIRS}"
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/algorithms.hpp
@@ -965,7 +965,7 @@ namespace experimental {
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
*/
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void bfs(raft::handle_t &handle,
void bfs(raft::handle_t const &handle,
graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const &graph_view,
vertex_t *distances,
vertex_t *predecessors,
@@ -998,7 +998,7 @@ void bfs(raft::handle_t &handle,
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
*/
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void sssp(raft::handle_t &handle,
void sssp(raft::handle_t const &handle,
graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const &graph_view,
weight_t *distances,
vertex_t *predecessors,
@@ -1046,7 +1046,7 @@ void sssp(raft::handle_t &handle,
* @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
*/
template <typename vertex_t, typename edge_t, typename weight_t, typename result_t, bool multi_gpu>
void pagerank(raft::handle_t &handle,
void pagerank(raft::handle_t const &handle,
graph_view_t<vertex_t, edge_t, weight_t, true, multi_gpu> const &graph_view,
weight_t *adj_matrix_row_out_weight_sums,
vertex_t *personalization_vertices,