diff --git a/CHANGELOG.md b/CHANGELOG.md index 44b9c097774..cd6d6690659 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,45 @@ # cuGraph 0.16.0 (Date TBD) ## New Features -- PR 1098 Add new graph classes to support 2D partitioning +- PR #1098 Add new graph classes to support 2D partitioning +- PR #1124 Sub-communicator initialization for 2D partitioning support +- PR #838 Add pattern accelerator API functions and pattern accelerator API based implementations of PageRank, Katz Centrality, BFS, and SSSP +- PR #1147 Added support for NetworkX graphs as input type +- PR #1157 Louvain API update to use graph_container_t +- PR #1151 MNMG extension for pattern accelerator based PageRank, Katz Centrality, BFS, and SSSP implementations (C++ part) +- PR #1163 Integrated 2D shuffling and Louvain updates +- PR #1178 Refactored cython graph factory code to scale to additional data types ## Improvements - PR 1081 MNMG Renumbering - sort partitions by degree - PR 1115 Replace deprecated rmm::mr::get_default_resource with rmm::mr::get_current_device_resource +- PR #1133 added python 2D shuffling +- PR #1129 Refactored test to use common dataset and added additional doc pages +- PR #1135 SG Updates to Louvain et. al. 
+- PR #1132 Upgrade Thrust to latest commit - PR #1129 Refactored test to use common dataset and added additional doc pages +- PR #1145 Simple edge list generator +- PR #1144 updated documentation and APIs +- PR #1139 MNMG Louvain Python updates, Cython cleanup +- PR #1149 Parquet read and concat within workers +- PR #1152 graph container cleanup, added arg for instantiating legacy types and switch statements to factory function +- PR #1164 MG symmetrize and conda env updates +- PR #1162 enhanced networkx testing +- PR #1169 Added RAPIDS cpp packages to cugraph dev env +- PR #1165 updated remaining algorithms to be NetworkX compatible +- PR #1176 Update ci/local/README.md +- PR #1184 BLD getting latest tags ## Bug Fixes - PR #1131 Show style checker errors with set +e +- PR #1150 Update RAFT git tag +- PR #1155 Remove RMM library dependency and CXX11 ABI handling +- PR #1158 Pass size_t* & size_t* instead of size_t[] & int[] for raft allgatherv's input parameters recvcounts & displs +- PR #1168 Disabled MG tests on single GPU +- PR #1166 Fix misspelling of function calls in asserts causing debug build to fail +- PR #1180 BLD Adopt RAFT model for cuhornet dependency +- PR #1181 Fix notebook error handling in CI +- PR #1186 BLD Installing raft headers under cugraph # cuGraph 0.15.0 (26 Aug 2020) diff --git a/SOURCEBUILD.md b/SOURCEBUILD.md index 29aa20ad522..8acd90c4f7f 100644 --- a/SOURCEBUILD.md +++ b/SOURCEBUILD.md @@ -244,22 +244,5 @@ unset LD_LIBRARY_PATH Python API documentation can be generated from [docs](docs) directory. -## C++ ABI issues - -cuGraph builds with C++14 features. By default, we build cuGraph with the latest ABI (the ABI changed with C++11). The version of cuDF pointed to in -the conda installation above is build with the new ABI. - -If you see link errors indicating trouble finding functions that use C++ strings when trying to build cuGraph you may have an ABI incompatibility. 
- -There are a couple of complications that may make this a problem: -* if you need to link in a library built with the old ABI, you may need to build the entire tool chain from source using the old ABI. -* if you build cudf from source (for whatever reason), the default behavior for cudf (at least through version 0.5.x) is to build using the old ABI. You can build with the new ABI, but you need to follow the instructions in CUDF to explicitly turn that on. - -If you must build cugraph with the old ABI, you can use the following command (instead of the cmake call above): - -```bash -cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=OFF -``` - ## Attribution Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md diff --git a/build.sh b/build.sh index e0557344384..ae3ad575227 100755 --- a/build.sh +++ b/build.sh @@ -105,7 +105,6 @@ if (( ${NUMARGS} == 0 )) || hasArg libcugraph; then mkdir -p ${LIBCUGRAPH_BUILD_DIR} cd ${LIBCUGRAPH_BUILD_DIR} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -DCMAKE_CXX11_ABI=${BUILD_ABI} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. make -j${PARALLEL_LEVEL} VERBOSE=${VERBOSE} ${INSTALL_TARGET} @@ -131,8 +130,7 @@ if (( ${NUMARGS} == 0 )) || hasArg docs; then mkdir -p ${LIBCUGRAPH_BUILD_DIR} cd ${LIBCUGRAPH_BUILD_DIR} cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -DCMAKE_CXX11_ABI=${BUILD_ABI} \ - -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ + -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} .. fi diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 3cef2e56877..83f234f787b 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -53,7 +53,7 @@ logger "Check GPU usage..." nvidia-smi logger "Activate conda env..." 
-source activate gdf +source activate rapids logger "conda install required packages" conda install -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge -c defaults \ @@ -98,6 +98,10 @@ fi # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph ################################################################################ +set +e -Eo pipefail +EXITCODE=0 +trap "EXITCODE=1" ERR + if hasArg --skip-tests; then logger "Skipping Tests..." else @@ -122,3 +126,5 @@ else ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log fi + +return ${EXITCODE} diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index 247eb328d2e..389d3be0bfd 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -23,7 +23,7 @@ TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) # Add notebooks that should be skipped here # (space-separated list of filenames without paths) -SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb" +SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb nx_cugraph_bc_benchmarking.ipynb" ## Check env env diff --git a/ci/local/README.md b/ci/local/README.md index 28bbe3590ea..07e2041d0a3 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,19 +18,19 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cugraph -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6` +`bash build.sh 
-r ~/rapids/cugraph -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. Style Check: ```bash $ bash ci/local/build.sh -r ~/rapids/cugraph -s -$ source activate gdf #Activate gpuCI conda environment +$ source activate rapids # Activate gpuCI conda environment $ cd rapids $ flake8 python ``` @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cugraph/build_rapidsai-base_cuda10.1-ubuntu16.04-gcc5-py3.6/`. Feel free to remove this directory after the script is finished. +The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cugraph/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. 
diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml index c9d04da58f4..05113f3d7ee 100644 --- a/conda/environments/cugraph_dev_cuda10.1.yml +++ b/conda/environments/cugraph_dev_cuda10.1.yml @@ -6,7 +6,9 @@ channels: - conda-forge dependencies: - cudf=0.16.* +- libcudf=0.16.* - rmm=0.16.* +- librmm=0.16.* - dask>=2.12.0 - distributed>=2.12.0 - dask-cuda=0.16* diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml index 0285d9b2b10..02537e4bf6c 100644 --- a/conda/environments/cugraph_dev_cuda10.2.yml +++ b/conda/environments/cugraph_dev_cuda10.2.yml @@ -6,7 +6,9 @@ channels: - conda-forge dependencies: - cudf=0.16.* +- libcudf=0.16.* - rmm=0.16.* +- librmm=0.16.* - dask>=2.12.0 - distributed>=2.12.0 - dask-cuda=0.16* diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml index 1b6d1400897..efd4b57dcc4 100644 --- a/conda/environments/cugraph_dev_cuda11.0.yml +++ b/conda/environments/cugraph_dev_cuda11.0.yml @@ -6,7 +6,9 @@ channels: - conda-forge dependencies: - cudf=0.16.* +- libcudf=0.16.* - rmm=0.16.* +- librmm=0.16.* - dask>=2.12.0 - distributed>=2.12.0 - dask-cuda=0.16* diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 1a32fd2a4b1..1376a0e30d2 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -32,6 +32,10 @@ requirements: - python x.x - libcugraph={{ version }} - cudf={{ minor_version }} + - dask-cudf {{ minor_version }} + - dask-cuda {{ minor_version }} + - dask>=2.12.0 + - distributed>=2.12.0 - nccl>=2.5 - ucx-py {{ minor_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 03049a21f00..df17d7c14dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -46,29 +46,6 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wno-error=deprecated-declarations") - 
-################################################################################################### -### C++ ABI changes. -### -### By default, cugraph builds with the new C++ ABI. In order to insure that thirdparty -### applications build with the properly setting (specifically RMM) we need to set -### the CMAKE_CXX11_ABI flag appropriately. -### -### If a user wants to build with the OLD ABI, then they need to define CMAKE_CXX11_ABI -### to be OFF (typically on the cmake command line). -### -### This block of code will configure the old ABI if the flag is set to OFF and -### do nothing (the default behavior of the C++14 compiler). -### - option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) - if(CMAKE_CXX11_ABI) - message(STATUS "CUGRAPH: Enabling the GLIBCXX11 ABI") - else() - message(STATUS "CUGRAPH: Disabling the GLIBCXX11 ABI") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") - endif(CMAKE_CXX11_ABI) endif(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") @@ -172,49 +149,20 @@ find_path(RMM_INCLUDE "rmm" "$ENV{CONDA_PREFIX}/include/rmm" "$ENV{CONDA_PREFIX}/include") -find_library(RMM_LIBRARY "rmm" - HINTS - "$ENV{RMM_ROOT}/lib" - "$ENV{CONDA_PREFIX}/lib") - -message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") -add_library(rmm SHARED IMPORTED ${RMM_LIBRARY}) -if (RMM_INCLUDE AND RMM_LIBRARY) - set_target_properties(rmm PROPERTIES IMPORTED_LOCATION ${RMM_LIBRARY}) -endif (RMM_INCLUDE AND RMM_LIBRARY) - ################################################################################################### -# - Fetch Content ----------------------------------------------------------------------------- +# - Fetch Content 
--------------------------------------------------------------------------------- include(FetchContent) -# - CUB -message("Fetching CUB") - -FetchContent_Declare( - cub - GIT_REPOSITORY https://github.com/thrust/cub.git - GIT_TAG 1.9.10 - GIT_SHALLOW true -) - -FetchContent_GetProperties(cub) -if(NOT cub_POPULATED) - FetchContent_Populate(cub) - # We are not using the cub CMake targets, so no need to call `add_subdirectory()`. -endif() -set(CUB_INCLUDE_DIR "${cub_SOURCE_DIR}") - -# - THRUST +# - THRUST/CUB message("Fetching Thrust") FetchContent_Declare( thrust GIT_REPOSITORY https://github.com/thrust/thrust.git - GIT_TAG 1.9.10 - GIT_SHALLOW true - PATCH_COMMAND COMMAND patch -p1 < "${CMAKE_CURRENT_SOURCE_DIR}/cmake/thrust-ret-if-fail.patch" + # August 28, 2020 + GIT_TAG 52a8bda46c5c2128414d1d47f546b486ff0be2f0 ) FetchContent_GetProperties(thrust) @@ -224,9 +172,6 @@ if(NOT thrust_POPULATED) endif() set(THRUST_INCLUDE_DIR "${thrust_SOURCE_DIR}") - - - ################################################################################################### # - External Projects ----------------------------------------------------------------------------- @@ -240,7 +185,7 @@ set(CUHORNET_INCLUDE_DIR ${CUHORNET_DIR}/src/cuhornet CACHE STRING "Path to cuho ExternalProject_Add(cuhornet GIT_REPOSITORY https://github.com/rapidsai/cuhornet.git - GIT_TAG main + GIT_TAG 9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad PREFIX ${CUHORNET_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -305,7 +250,7 @@ else(DEFINED ENV{RAFT_PATH}) ExternalProject_Add(raft GIT_REPOSITORY https://github.com/rapidsai/raft.git - GIT_TAG 099e2b874b05555a78bed1666fa2d22f784e56a7 + GIT_TAG 53c1e2dde4045f386f9cc4bb7d3dc99d5690b886 PREFIX ${RAFT_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -332,6 +277,7 @@ add_library(cugraph SHARED src/db/db_parser_integration_test.cu src/db/db_operators.cu src/utilities/spmv_1D.cu + src/utilities/cython.cu src/structure/graph.cu src/link_analysis/pagerank.cu 
src/link_analysis/pagerank_1D.cu @@ -344,10 +290,8 @@ add_library(cugraph SHARED src/converters/renumber.cu src/converters/COOtoCSR.cu src/community/spectral_clustering.cu - src/community/louvain.cpp - src/community/louvain_kernels.cu - src/community/leiden.cpp - src/community/leiden_kernels.cu + src/community/louvain.cu + src/community/leiden.cu src/community/ktruss.cu src/community/ECG.cu src/community/triangles_counting.cu @@ -359,6 +303,10 @@ add_library(cugraph SHARED src/centrality/betweenness_centrality.cu src/experimental/graph.cu src/experimental/graph_view.cu + src/experimental/bfs.cu + src/experimental/sssp.cu + src/experimental/pagerank.cu + src/experimental/katz_centrality.cu ) # @@ -372,7 +320,6 @@ add_dependencies(cugraph raft) # - include paths --------------------------------------------------------------------------------- target_include_directories(cugraph PRIVATE - "${CUB_INCLUDE_DIR}" "${THRUST_INCLUDE_DIR}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" "${LIBCYPHERPARSER_INCLUDE}" @@ -395,7 +342,7 @@ target_include_directories(cugraph # - link libraries -------------------------------------------------------------------------------- target_link_libraries(cugraph PRIVATE - ${RMM_LIBRARY} gunrock cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARIES}) + gunrock cublas cusparse curand cusolver cudart cuda ${LIBCYPHERPARSER_LIBRARY} ${MPI_CXX_LIBRARIES} ${NCCL_LIBRARIES}) if(OpenMP_CXX_FOUND) target_link_libraries(cugraph PRIVATE @@ -408,7 +355,7 @@ target_link_libraries(cugraph PRIVATE ### ... ### ### libgomp.so is included in the conda base environment and copied to every new conda -### environment. If a full file path is provided (e.g ${RMM_LIBRARY}), cmake +### environment. If a full file path is provided (e.g ${NCCL_LIBRARIES}), cmake ### extracts the directory path and adds the directory path to BUILD_RPATH (if BUILD_RPATH is not ### disabled). 
### @@ -421,7 +368,7 @@ target_link_libraries(cugraph PRIVATE ### If a full path to libgomp.so is provided (which is the case with OpenMP::OpenMP_CXX), cmake ### checks whether there is any other libgomp.so with the different full path (after resolving ### soft links) in the search paths (implicit directoires + BUILD_RAPTH). There is one in the -### path included in BUILD_RPATH when ${RMM_LIBRARY} are added; this one can +### path included in BUILD_RPATH when ${NCCL_LIBRARIES} are added; this one can ### potentially hide the one in the provided full path and cmake generates a warning (and RPATH ### is searched before the directories in /etc/ld.so/conf; ld.so.conf does not coincide but ### overlaps with implicit directories). @@ -470,6 +417,8 @@ install(TARGETS cugraph LIBRARY install(DIRECTORY include/ DESTINATION include/cugraph) +install(DIRECTORY ${RAFT_DIR}/cpp/include/raft/ + DESTINATION include/cugraph/raft) ################################################################################################### # - make documentation ---------------------------------------------------------------------------- # requires doxygen and graphviz to be installed diff --git a/cpp/cmake/Modules/ConfigureArrow.cmake b/cpp/cmake/Modules/ConfigureArrow.cmake index 647f335959e..b27e53dd415 100644 --- a/cpp/cmake/Modules/ConfigureArrow.cmake +++ b/cpp/cmake/Modules/ConfigureArrow.cmake @@ -21,14 +21,6 @@ set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF" " -DARROW_USE_GLOG=OFF" " -DCMAKE_VERBOSE_MAKEFILE=ON") -if(NOT CMAKE_CXX11_ABI) - message(STATUS "ARROW: Disabling the GLIBCXX11 ABI") - list(APPEND ARROW_CMAKE_ARGS " -DARROW_TENSORFLOW=ON") -elseif(CMAKE_CXX11_ABI) - message(STATUS "ARROW: Enabling the GLIBCXX11 ABI") - list(APPEND ARROW_CMAKE_ARGS " -DARROW_TENSORFLOW=OFF") -endif(NOT CMAKE_CXX11_ABI) - configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake" "${ARROW_ROOT}/CMakeLists.txt") diff --git a/cpp/cmake/Modules/ConfigureGoogleTest.cmake 
b/cpp/cmake/Modules/ConfigureGoogleTest.cmake index d62bee2b198..9fac40f4649 100644 --- a/cpp/cmake/Modules/ConfigureGoogleTest.cmake +++ b/cpp/cmake/Modules/ConfigureGoogleTest.cmake @@ -4,16 +4,6 @@ set(GTEST_CMAKE_ARGS "") #" -Dgtest_build_samples=ON" #" -DCMAKE_VERBOSE_MAKEFILE=ON") -if(NOT CMAKE_CXX11_ABI) - message(STATUS "GTEST: Disabling the GLIBCXX11 ABI") - list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") - list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0") -elseif(CMAKE_CXX11_ABI) - message(STATUS "GTEST: Enabling the GLIBCXX11 ABI") - list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") - list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1") -endif(NOT CMAKE_CXX11_ABI) - configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake" "${GTEST_ROOT}/CMakeLists.txt") diff --git a/cpp/cmake/thrust-ret-if-fail.patch b/cpp/cmake/thrust-ret-if-fail.patch deleted file mode 100644 index 990b3f993be..00000000000 --- a/cpp/cmake/thrust-ret-if-fail.patch +++ /dev/null @@ -1,16 +0,0 @@ -diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h -index a2c87772..ea4ed640 100644 ---- a/thrust/system/cuda/detail/core/util.h -+++ b/thrust/system/cuda/detail/core/util.h -@@ -652,7 +652,10 @@ namespace core { - } - - #define CUDA_CUB_RET_IF_FAIL(e) \ -- if (cub::Debug((e), __FILE__, __LINE__)) return e; -+ { \ -+ auto const error = (e); \ -+ if (cub::Debug(error, __FILE__, __LINE__)) return error; \ -+ } - - // uninitialized - // ------- diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index 489f43a69c4..9118ed3a7c4 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -616,31 +617,30 @@ void bfs(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. 
* - * @tparam vertex_t Type of vertex identifiers. - * Supported value : int (signed, 32-bit) - * @tparam edge_t Type of edge identifiers. - * Supported value : int (signed, 32-bit) - * @tparam weight_t Type of edge weights. Supported values : float or double. + * @tparam graph_t Type of graph * + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object (CSR) - * @param[out] final_modularity modularity of the returned clustering - * @param[out] num_level number of levels of the returned clustering * @param[out] clustering Pointer to device array where the clustering should be stored - * @param[in] max_iter (optional) maximum number of iterations to run (default 100) + * @param[in] max_level (optional) maximum number of levels to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size * of the communities. Higher resolutions lead to more smaller * communities, lower resolutions lead to fewer larger - * communities. (default 1) + * communities. (default 1) + * + * @return a pair containing: + * 1) number of levels of the returned clustering + * 2) modularity of the returned clustering * */ -template -void louvain(GraphCSRView const &graph, - weight_t *final_modularity, - int *num_level, - vertex_t *louvain_parts, - int max_iter = 100, - weight_t resolution = weight_t{1}); +template +std::pair louvain( + raft::handle_t const &handle, + graph_t const &graph, + typename graph_t::vertex_type *clustering, + size_t max_level = 100, + typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); /** * @brief Leiden implementation @@ -662,9 +662,8 @@ void louvain(GraphCSRView const &graph, * Supported value : int (signed, 32-bit) * @tparam weight_t Type of edge weights. Supported values : float or double. * + * @param[in] handle Library handle (RAFT). 
If a communicator is set in the handle, * @param[in] graph input graph object (CSR) - * @param[out] final_modularity modularity of the returned clustering - * @param[out] num_level number of levels of the returned clustering * @param[out] clustering Pointer to device array where the clustering should be stored * @param[in] max_iter (optional) maximum number of iterations to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. @@ -672,14 +671,17 @@ void louvain(GraphCSRView const &graph, * of the communities. Higher resolutions lead to more smaller * communities, lower resolutions lead to fewer larger * communities. (default 1) + * + * @return a pair containing: + * 1) number of levels of the returned clustering + * 2) modularity of the returned clustering */ template -void leiden(GraphCSRView const &graph, - weight_t &final_modularity, - int &num_level, - vertex_t *leiden_parts, - int max_iter = 100, - weight_t resolution = weight_t{1}); +std::pair leiden(raft::handle_t const &handle, + GraphCSRView const &graph, + vertex_t *clustering, + size_t max_iter = 100, + weight_t resolution = weight_t{1}); /** * @brief Computes the ecg clustering of the given graph. @@ -692,21 +694,26 @@ void leiden(GraphCSRView const &graph, * * @throws cugraph::logic_error when an error occurs. * - * @tparam VT Type of vertex identifiers. Supported value : int (signed, + * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed, * 32-bit) - * @tparam ET Type of edge identifiers. Supported value : int (signed, + * @tparam edge_t Type of edge identifiers. Supported value : int (signed, * 32-bit) - * @tparam WT Type of edge weights. Supported values : float or double. + * @tparam weight_t Type of edge weights. Supported values : float or double. * + * @param[in] handle Library handle (RAFT). 
If a communicator is set in the handle, * @param[in] graph_coo input graph object (COO) * @param[in] graph_csr input graph object (CSR) * @param[in] min_weight The minimum weight parameter * @param[in] ensemble_size The ensemble size parameter - * @param[out] ecg_parts A device pointer to array where the partitioning should be + * @param[out] clustering A device pointer to array where the partitioning should be * written */ -template -void ecg(GraphCSRView const &graph_csr, WT min_weight, VT ensemble_size, VT *ecg_parts); +template +void ecg(raft::handle_t const &handle, + GraphCSRView const &graph, + weight_t min_weight, + vertex_t ensemble_size, + vertex_t *clustering); namespace triangle { @@ -927,4 +934,178 @@ void hits(GraphCSRView const &graph, } // namespace gunrock +namespace experimental { + +/** + * @brief Run breadth-first search to find the distances (and predecessors) from the source + * vertex. + * + * This function computes the distances (minimum number of hops to reach the vertex) from the source + * vertex. If @p predecessors is not `nullptr`, this function calculates the predecessor of each + * vertex (parent vertex in the breadth-first search tree) as well. + * + * @throws cugraph::logic_error on erroneous input arguments. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param distances Pointer to the output distance array. + * @param predecessors Pointer to the output predecessor array or `nullptr`. + * @param source_vertex Source vertex to start breadth-first search (root vertex of the breath-first + * search tree). 
+ * @param direction_optimizing If set to true, this algorithm switches between the push based + * breadth-first search and pull based breadth-first search depending on the size of the + * breadth-first search frontier (currently unsupported). This option is valid only for symmetric + * input graphs. + * @param depth_limit Sets the maximum number of breadth-first search iterations. Any vertices + * farther than @p depth_limit hops from @p source_vertex will be marked as unreachable. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + vertex_t *distances, + vertex_t *predecessors, + vertex_t source_vertex, + bool direction_optimizing = false, + vertex_t depth_limit = std::numeric_limits::max(), + bool do_expensive_check = false); + +/** + * @brief Run single-source shortest-path to compute the minimum distances (and predecessors) from + * the source vertex. + * + * This function computes the distances (minimum edge weight sums) from the source vertex. If @p + * predecessors is not `nullptr`, this function calculates the predecessor of each vertex in the + * shortest-path as well. Graph edge weights should be non-negative. + * + * @throws cugraph::logic_error on erroneous input arguments. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param distances Pointer to the output distance array. + * @param predecessors Pointer to the output predecessor array or `nullptr`. + * @param source_vertex Source vertex to start single-source shortest-path. 
+ * @param cutoff Single-source shortest-path terminates if no more vertices are reachable within the + * distance of @p cutoff. Any vertex farther than @p cutoff will be marked as unreachable. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + weight_t *distances, + vertex_t *predecessors, + vertex_t source_vertex, + weight_t cutoff = std::numeric_limits::max(), + bool do_expensive_check = false); + +/** + * @brief Compute PageRank scores. + * + * This function computes general (if @p personalization_vertices is `nullptr`) or personalized (if + * @p personalization_vertices is not `nullptr`.) PageRank scores. + * + * @throws cugraph::logic_error on erroneous input arguments or if fails to converge before @p + * max_iterations. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam result_t Type of PageRank scores. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param adj_matrix_row_out_weight_sums Pointer to an array storing sums of out-going edge weights + * for the vertices in the rows of the graph adjacency matrix (for re-use) or `nullptr`. If + * `nullptr`, these values are freshly computed. Computing these values outsid this function reduces + * the number of memoray allocations/deallocations and computing if a user repeatedly computes + * PageRank scores using the same graph with different personalization vectors. 
+ * @param personalization_vertices Pointer to an array storing personalization vertex identifiers + * (compute personalized PageRank) or `nullptr` (compute general PageRank). + * @param personalization_values Pointer to an array storing personalization values for the vertices + * in the personalization set. Relevant only if @p personalization_vertices is not `nullptr`. + * @param personalization_vector_size Size of the personalization set. If @personalization_vertices + * is not `nullptr`, the sizes of the arrays pointed by @p personalization_vertices and @p + * personalization_values should be @p personalization_vector_size. + * @param pageranks Pointer to the output PageRank score array. + * @param alpha PageRank damping factor. + * @param epsilon Error tolerance to check convergence. Convergence is assumed if the sum of the + * differences in PageRank values between two consecutive iterations is less than the number of + * vertices in the graph multiplied by @p epsilon. + * @param max_iterations Maximum number of PageRank iterations. + * @param has_initial_guess If set to `true`, values in the PageRank output array (pointed by @p + * pageranks) is used as initial PageRank values. If false, initial PageRank values are set to 1.0 + * divided by the number of vertices in the graph. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void pagerank(raft::handle_t &handle, + graph_view_t const &graph_view, + weight_t *adj_matrix_row_out_weight_sums, + vertex_t *personalization_vertices, + result_t *personalization_values, + vertex_t personalization_vector_size, + result_t *pageranks, + result_t alpha, + result_t epsilon, + size_t max_iterations = 500, + bool has_initial_guess = false, + bool do_expensive_check = false); + +/** + * @brief Compute Katz Centrality scores. + * + * This function computes Katz Centrality scores. 
+ * + * @throws cugraph::logic_error on erroneous input arguments or if fails to converge before @p + * max_iterations. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam result_t Type of Katz Centrality scores. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param betas Pointer to an array holding the values to be added to each vertex's new Katz + * Centrality score in every iteration or `nullptr`. If set to `nullptr`, constant @p beta is used + * instead. + * @param katz_centralities Pointer to the output Katz Centrality score array. + * @param alpha Katz Centrality attenuation factor. This should be smaller than the inverse of the + * maximum eigenvalue of the adjacency matrix of @p graph. + * @param beta Constant value to be added to each vertex's new Katz Centrality score in every + * iteration. Relevant only when @p betas is `nullptr`. + * @param epsilon Error tolerance to check convergence. Convergence is assuemd if the sum of the + * differences in Katz Centrality values between two consecutive iterations is less than the number + * of vertices in the graph multiplied by @p epsilon. + * @param max_iterations Maximum number of Katz Centrality iterations. + * @param has_initial_guess If set to `true`, values in the Katz Centrality output array (pointed by + * @p katz_centralities) is used as initial Katz Centrality values. If false, zeros are used as + * initial Katz Centrality values. + * @param normalize If set to `true`, final Katz Centrality scores are normalized (the L2-norm of + * the returned Katz Centrality score array is 1.0) before returning. 
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + result_t *betas, + result_t *katz_centralities, + result_t alpha, + result_t beta, + result_t epsilon, + size_t max_iterations = 500, + bool has_initial_guess = false, + bool normalize = false, + bool do_expensive_check = false); + +} // namespace experimental + } // namespace cugraph diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh index fe092342f80..c94348329f7 100644 --- a/cpp/include/experimental/detail/graph_utils.cuh +++ b/cpp/include/experimental/detail/graph_utils.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -39,57 +40,57 @@ rmm::device_uvector compute_major_degree( std::vector const &adj_matrix_partition_offsets, partition_t const &partition) { - auto &comm_p_row = handle.get_subcomm(comm_p_row_key); - auto const comm_p_row_rank = comm_p_row.get_rank(); - auto const comm_p_row_size = comm_p_row.get_size(); - auto &comm_p_col = handle.get_subcomm(comm_p_col_key); - auto const comm_p_col_rank = comm_p_col.get_rank(); - auto const comm_p_col_size = comm_p_col.get_size(); + auto &row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto &col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); rmm::device_uvector local_degrees(0, handle.get_stream()); rmm::device_uvector degrees(0, handle.get_stream()); vertex_t max_num_local_degrees{0}; - for (int i = 0; i < comm_p_col_size; ++i) { + for (int i = 0; i < col_comm_size; ++i) { auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? 
static_cast(comm_p_row_size) * static_cast(i) + - static_cast(comm_p_row_rank) - : static_cast(comm_p_col_size) * static_cast(comm_p_row_rank) + + ? static_cast(row_comm_size) * static_cast(i) + + static_cast(row_comm_rank) + : static_cast(col_comm_size) * static_cast(row_comm_rank) + static_cast(i); vertex_t major_first{}; vertex_t major_last{}; std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); max_num_local_degrees = std::max(max_num_local_degrees, major_last - major_first); - if (i == comm_p_col_rank) { degrees.resize(major_last - major_first, handle.get_stream()); } + if (i == col_comm_rank) { degrees.resize(major_last - major_first, handle.get_stream()); } } local_degrees.resize(max_num_local_degrees, handle.get_stream()); - for (int i = 0; i < comm_p_col_size; ++i) { + for (int i = 0; i < col_comm_size; ++i) { auto vertex_partition_idx = partition.is_hypergraph_partitioned() - ? static_cast(comm_p_row_size) * static_cast(i) + - static_cast(comm_p_row_rank) - : static_cast(comm_p_col_size) * static_cast(comm_p_row_rank) + + ? static_cast(row_comm_size) * static_cast(i) + + static_cast(row_comm_rank) + : static_cast(col_comm_size) * static_cast(row_comm_rank) + static_cast(i); vertex_t major_first{}; vertex_t major_last{}; std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx); - auto p_offsets = partition.is_hypergraph_partitioned() - ? adj_matrix_partition_offsets[i] - : adj_matrix_partition_offsets[0] + - (major_first - partition.get_vertex_partition_range_first( - comm_p_col_size * comm_p_row_rank)); + auto p_offsets = + partition.is_hypergraph_partitioned() + ? 
adj_matrix_partition_offsets[i] + : adj_matrix_partition_offsets[0] + + (major_first - partition.get_vertex_partition_first(col_comm_size * row_comm_rank)); thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(major_last - major_first), local_degrees.data(), [p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; }); - comm_p_row.reduce(local_degrees.data(), - i == comm_p_col_rank ? degrees.data() : static_cast(nullptr), - degrees.size(), - raft::comms::op_t::SUM, - comm_p_col_rank, - handle.get_stream()); + row_comm.reduce(local_degrees.data(), + i == col_comm_rank ? degrees.data() : static_cast(nullptr), + degrees.size(), + raft::comms::op_t::SUM, + col_comm_rank, + handle.get_stream()); } auto status = handle.get_comms().sync_stream( diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp index ea4a7882363..88c84414cd0 100644 --- a/cpp/include/experimental/graph.hpp +++ b/cpp/include/experimental/graph.hpp @@ -70,12 +70,6 @@ class graph_t view() { std::vector offsets(adj_matrix_partition_offsets_.size(), nullptr); @@ -187,4 +181,4 @@ struct invalid_edge_id : invalid_idx { }; } // namespace experimental -} // namespace cugraph \ No newline at end of file +} // namespace cugraph diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp index b3b899a5068..c655b1451ca 100644 --- a/cpp/include/experimental/graph_view.hpp +++ b/cpp/include/experimental/graph_view.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -30,28 +31,31 @@ namespace cugraph { namespace experimental { -// FIXME: these should better be defined somewhere else. -std::string const comm_p_row_key = "comm_p_row"; -std::string const comm_p_col_key = "comm_p_key"; - /** * @brief store vertex partitioning map * - * Say P = P_row * P_col GPUs. 
We need to partition 1D vertex arrays (storing per vertex values) and - * the 2D graph adjacency matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array - * of size V is divided to P linear partitions; each partition has the size close to V / P. We - * consider two different strategies to partition the 2D matrix: the default strategy and the - * hypergraph partitioning based strategy (the latter is for future extension). + * Say P = P_row * P_col GPUs. For communication, we need P_row row communicators of size P_col and + * P_col column communicators of size P_row. row_comm_size = P_col and col_comm_size = P_row. + * row_comm_rank & col_comm_rank are ranks within the row & column communicators, respectively. + * + * We need to partition 1D vertex arrays (storing per vertex values) and the 2D graph adjacency + * matrix (or transposed 2D graph adjacency matrix) of G. An 1D vertex array of size V is divided to + * P linear partitions; each partition has the size close to V / P. We consider two different + * strategies to partition the 2D matrix: the default strategy and the hypergraph partitioning based + * strategy (the latter is for future extension). + * FIXME: in the future we may use the latter for both as this leads to simpler communication + * patterns and better control over parallelism vs memory footprint trade-off. * * In the default case, one GPU will be responsible for 1 rectangular partition. The matrix will be * horizontally partitioned first to P_row slabs. Each slab will be further vertically partitioned * to P_col rectangles. Each rectangular partition will have the size close to V / P_row by V / * P_col. 
 * - * To be more specific, a GPU with (row_rank, col_rank) will be responsible for one rectangular - * partition [a,b) by [c,d) where a = vertex_partition_offsets[P_col * row_rank], b = - * vertex_partition_offsets[p_col * (row_rank + 1)], c = vertex_partition_offsets[P_row * col_rank], - * and d = vertex_partition_offsets[p_row * (col_rank + 1)] + * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for one + * rectangular partition [a,b) by [c,d) where a = vertex_partition_offsets[row_comm_size * + * col_comm_rank], b = vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)], c = + * vertex_partition_offsets[col_comm_size * row_comm_rank], and d = + * vertex_partition_offsets[col_comm_size * (row_comm_rank + 1)]. * * In the future, we may apply hyper-graph partitioning to divide V vertices to P groups minimizing * edge cuts across groups while balancing the number of vertices in each group. We will also @@ -59,13 +63,16 @@ std::string const comm_p_col_key = "comm_p_key"; * will be more non-zeros in the diagonal partitions of the 2D graph adjacency matrix (or the * transposed 2D graph adjacency matrix) than the off-diagonal partitions. The default strategy does * not balance the number of nonzeros if hyper-graph partitioning is applied. To solve this problem, - * the matrix is first horizontally partitioned to P (instead of P_row) slabs, then each slab will - * be further vertically partitioned to P_col rectangles. One GPU will be responsible P_col - * rectangular partitions in this case. + * the matrix is first horizontally partitioned to P slabs, then each slab will be further + * vertically partitioned to P_row (instead of P_col in the default case) rectangles. One GPU will + * be responsible for col_comm_size rectangular partitions in this case. 
 * - * To be more specific, a GPU with (row_rank, col_rank) will be responsible for P_col rectangular - * partitions [a_i,b_i) by [c,d) where a_i = vertex_partition_offsets[P_row * i + row_rank] and b_i - * = vertex_partition_offsets[P_row * i + row_rank + 1]. c and d are same to 1) and i = [0, P_col). + * To be more specific, a GPU with (col_comm_rank, row_comm_rank) will be responsible for + * col_comm_size rectangular partitions [a_i,b_i) by [c,d) where a_i = + * vertex_partition_offsets[row_comm_size * i + row_comm_rank] and b_i = + * vertex_partition_offsets[row_comm_size * i + row_comm_rank + 1]. c is + * vertex_partition_offsets[row_comm_size * col_comm_rank] and d = + * vertex_partition_offsets[row_comm_size * (col_comm_rank + 1)]. * * See E. G. Boman et. al., “Scalable matrix computations on large scale-free graphs using 2D graph * partitioning”, 2013 for additional detail. @@ -77,20 +84,20 @@ class partition_t { public: partition_t(std::vector const& vertex_partition_offsets, bool hypergraph_partitioned, - int comm_p_row_size, - int comm_p_col_size, - int comm_p_row_rank, - int comm_p_col_rank) + int row_comm_size, + int col_comm_size, + int row_comm_rank, + int col_comm_rank) : vertex_partition_offsets_(vertex_partition_offsets), hypergraph_partitioned_(hypergraph_partitioned), - comm_p_rank_(comm_p_col_size * comm_p_row_rank + comm_p_col_rank), - comm_p_row_size_(comm_p_row_size), - comm_p_col_size_(comm_p_col_size), - comm_p_row_rank_(comm_p_row_rank), - comm_p_col_rank_(comm_p_col_rank) + comm_rank_(col_comm_size * row_comm_rank + col_comm_rank), + row_comm_size_(row_comm_size), + col_comm_size_(col_comm_size), + row_comm_rank_(row_comm_rank), + col_comm_rank_(col_comm_rank) { CUGRAPH_EXPECTS( - vertex_partition_offsets.size() == static_cast(comm_p_row_size * comm_p_col_size), + vertex_partition_offsets.size() == static_cast(row_comm_size * col_comm_size + 1), "Invalid API parameter: erroneous vertex_partition_offsets.size()."); CUGRAPH_EXPECTS( @@ 
-98,23 +105,24 @@ class partition_t { "Invalid API parameter: partition.vertex_partition_offsets values should be non-descending."); CUGRAPH_EXPECTS(vertex_partition_offsets_[0] == vertex_t{0}, "Invalid API parameter: partition.vertex_partition_offsets[0] should be 0."); - } - std::tuple get_vertex_partition_range() const - { - return std::make_tuple(vertex_partition_offsets_[comm_p_rank_], - vertex_partition_offsets_[comm_p_rank_ + 1]); + vertex_t start_offset{0}; + matrix_partition_major_value_start_offsets_.assign(get_number_of_matrix_partitions(), 0); + for (size_t i = 0; i < matrix_partition_major_value_start_offsets_.size(); ++i) { + matrix_partition_major_value_start_offsets_[i] = start_offset; + start_offset += get_matrix_partition_major_last(i) - get_matrix_partition_major_first(i); + } } - vertex_t get_vertex_partition_range_first() const + std::tuple get_local_vertex_range() const { - return vertex_partition_offsets_[comm_p_rank_]; + return std::make_tuple(vertex_partition_offsets_[comm_rank_], + vertex_partition_offsets_[comm_rank_ + 1]); } - vertex_t get_vertex_partition_range_last() const - { - return vertex_partition_offsets_[comm_p_rank_ + 1]; - } + vertex_t get_local_vertex_first() const { return vertex_partition_offsets_[comm_rank_]; } + + vertex_t get_local_vertex_last() const { return vertex_partition_offsets_[comm_rank_ + 1]; } std::tuple get_vertex_partition_range(size_t vertex_partition_idx) const { @@ -122,49 +130,94 @@ class partition_t { vertex_partition_offsets_[vertex_partition_idx + 1]); } - vertex_t get_vertex_partition_range_first(size_t vertex_partition_idx) const + vertex_t get_vertex_partition_first(size_t vertex_partition_idx) const { return vertex_partition_offsets_[vertex_partition_idx]; } - vertex_t get_vertex_partition_range_last(size_t vertex_partition_idx) const + vertex_t get_vertex_partition_last(size_t vertex_partition_idx) const { return vertex_partition_offsets_[vertex_partition_idx + 1]; } - std::tuple 
get_matrix_partition_major_range(size_t partition_idx) const + vertex_t get_vertex_partition_size(size_t vertex_partition_idx) const { - auto major_first = - hypergraph_partitioned_ - ? vertex_partition_offsets_[comm_p_row_size_ * partition_idx + comm_p_row_rank_] - : vertex_partition_offsets_[comm_p_row_rank_ * comm_p_col_size_]; - auto major_last = - hypergraph_partitioned_ - ? vertex_partition_offsets_[comm_p_row_size_ * partition_idx + comm_p_row_rank_ + 1] - : vertex_partition_offsets_[(comm_p_row_rank_ + 1) * comm_p_col_size_]; + return get_vertex_partition_last(vertex_partition_idx) - + get_vertex_partition_first(vertex_partition_idx); + } + + size_t get_number_of_matrix_partitions() const + { + return hypergraph_partitioned_ ? col_comm_size_ : 1; + } + // major: row of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or + // column of the graph adjacency matrix (if the transposed graph adjacency matrix is stored). + std::tuple get_matrix_partition_major_range(size_t partition_idx) const + { + auto major_first = get_matrix_partition_major_first(partition_idx); + auto major_last = get_matrix_partition_major_last(partition_idx); return std::make_tuple(major_first, major_last); } + vertex_t get_matrix_partition_major_first(size_t partition_idx) const + { + return hypergraph_partitioned_ + ? vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_] + : vertex_partition_offsets_[col_comm_rank_ * row_comm_size_]; + } + + vertex_t get_matrix_partition_major_last(size_t partition_idx) const + { + return hypergraph_partitioned_ + ? 
vertex_partition_offsets_[row_comm_size_ * partition_idx + row_comm_rank_ + 1] + : vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_]; + } + + vertex_t get_matrix_partition_major_value_start_offset(size_t partition_idx) const + { + return matrix_partition_major_value_start_offsets_[partition_idx]; + } + + // minor: column of the graph adjacency matrix (if the graph adjacency matrix is stored as is) or + // row of the graph adjacency matrix (if the transposed graph adjacency matrix is stored). std::tuple get_matrix_partition_minor_range() const { - auto minor_first = vertex_partition_offsets_[comm_p_col_rank_ * comm_p_row_size_]; - auto minor_last = vertex_partition_offsets_[(comm_p_col_rank_ + 1) * comm_p_row_size_]; + auto minor_first = get_matrix_partition_minor_first(); + auto minor_last = get_matrix_partition_minor_last(); return std::make_tuple(minor_first, minor_last); } + vertex_t get_matrix_partition_minor_first() const + { + return hypergraph_partitioned_ ? vertex_partition_offsets_[col_comm_rank_ * row_comm_size_] + : vertex_partition_offsets_[row_comm_rank_ * col_comm_size_]; + } + + vertex_t get_matrix_partition_minor_last() const + { + return hypergraph_partitioned_ + ? 
vertex_partition_offsets_[(col_comm_rank_ + 1) * row_comm_size_] + : vertex_partition_offsets_[(row_comm_rank_ + 1) * col_comm_size_]; + } + + // FIXME: this function may be removed if we use the same partitioning strategy whether hypergraph + // partitioning is applied or not bool is_hypergraph_partitioned() const { return hypergraph_partitioned_; } private: std::vector vertex_partition_offsets_{}; // size = P + 1 bool hypergraph_partitioned_{false}; - int comm_p_rank_{0}; - int comm_p_row_size_{0}; - int comm_p_col_size_{0}; - int comm_p_row_rank_{0}; - int comm_p_col_rank_{0}; + int comm_rank_{0}; + int row_comm_size_{0}; + int col_comm_size_{0}; + int row_comm_rank_{0}; + int col_comm_rank_{0}; + + std::vector + matrix_partition_major_value_start_offsets_{}; // size = get_number_of_matrix_partitions() }; struct graph_properties_t { @@ -195,6 +248,18 @@ class graph_base_t { vertex_t get_number_of_vertices() const { return number_of_vertices_; } edge_t get_number_of_edges() const { return number_of_edges_; } + template + std::enable_if_t::value, bool> is_valid_vertex(vertex_type v) const + { + return ((v >= 0) && (v < number_of_vertices_)); + } + + template + std::enable_if_t::value, bool> is_valid_vertex(vertex_type v) const + { + return (v < number_of_vertices_); + } + bool is_symmetric() const { return properties_.is_symmetric; } bool is_multigraph() const { return properties_.is_multigraph; } @@ -254,13 +319,114 @@ class graph_view_t 0; } + vertex_t get_number_of_local_vertices() const { - return partition_.get_vertex_partition_range_last() - - partition_.get_vertex_partition_range_first(); + return partition_.get_local_vertex_last() - partition_.get_local_vertex_first(); + } + + vertex_t get_local_vertex_first() const { return partition_.get_local_vertex_first(); } + + vertex_t get_local_vertex_last() const { return partition_.get_local_vertex_last(); } + + vertex_t get_vertex_partition_first(size_t vertex_partition_idx) const + { + return 
partition_.get_vertex_partition_first(vertex_partition_idx); + } + + vertex_t get_vertex_partition_last(size_t vertex_partition_idx) const + { + return partition_.get_vertex_partition_last(vertex_partition_idx); + } + + vertex_t get_vertex_partition_size(size_t vertex_partition_idx) const + { + return get_vertex_partition_last(vertex_partition_idx) - + get_vertex_partition_first(vertex_partition_idx); + } + + bool is_local_vertex_nocheck(vertex_t v) const + { + return (v >= get_local_vertex_first()) && (v < get_local_vertex_last()); + } + + size_t get_number_of_local_adj_matrix_partitions() const + { + return adj_matrix_partition_offsets_.size(); + } + + vertex_t get_number_of_local_adj_matrix_partition_rows() const + { + if (!store_transposed) { + vertex_t ret{0}; + for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { + ret += partition_.get_matrix_partition_major_last(i) - + partition_.get_matrix_partition_major_first(i); + } + return ret; + } else { + return partition_.get_matrix_partition_minor_last() - + partition_.get_matrix_partition_minor_first(); + } } - size_t get_number_of_adj_matrix_partitions() { return adj_matrix_partition_offsets_.size(); } + vertex_t get_number_of_local_adj_matrix_partition_cols() const + { + if (store_transposed) { + vertex_t ret{0}; + for (size_t i = 0; i < partition_.get_number_of_matrix_partitions(); ++i) { + ret += partition_.get_matrix_partition_major_last(i) - + partition_.get_matrix_partition_major_first(i); + } + return ret; + } else { + return partition_.get_matrix_partition_minor_last() - + partition_.get_matrix_partition_minor_first(); + } + } + + vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const + { + return store_transposed ? 
partition_.get_matrix_partition_minor_first() + : partition_.get_matrix_partition_major_first(adj_matrix_partition_idx); + } + + vertex_t get_local_adj_matrix_partition_row_last(size_t adj_matrix_partition_idx) const + { + return store_transposed ? partition_.get_matrix_partition_minor_last() + : partition_.get_matrix_partition_major_last(adj_matrix_partition_idx); + } + + vertex_t get_local_adj_matrix_partition_row_value_start_offset( + size_t adj_matrix_partition_idx) const + { + return store_transposed + ? 0 + : partition_.get_matrix_partition_major_value_start_offset(adj_matrix_partition_idx); + } + + vertex_t get_local_adj_matrix_partition_col_first(size_t adj_matrix_partition_idx) const + { + return store_transposed ? partition_.get_matrix_partition_major_first(adj_matrix_partition_idx) + : partition_.get_matrix_partition_minor_first(); + } + + vertex_t get_local_adj_matrix_partition_col_last(size_t adj_matrix_partition_idx) const + { + return store_transposed ? partition_.get_matrix_partition_major_last(adj_matrix_partition_idx) + : partition_.get_matrix_partition_minor_last(); + } + + vertex_t get_local_adj_matrix_partition_col_value_start_offset( + size_t adj_matrix_partition_idx) const + { + return store_transposed + ? partition_.get_matrix_partition_major_value_start_offset(adj_matrix_partition_idx) + : 0; + } + + bool is_hypergraph_partitioned() const { return partition_.is_hypergraph_partitioned(); } // FIXME: this function is not part of the public stable API.This function is mainly for pattern // accelerator implementation. 
This function is currently public to support the legacy @@ -335,8 +501,81 @@ class graph_view_tget_number_of_vertices(); } + constexpr vertex_t get_local_vertex_first() const { return vertex_t{0}; } + + vertex_t get_local_vertex_last() const { return this->get_number_of_vertices(); } + + vertex_t get_vertex_partition_first(size_t vertex_partition_idx) const { return vertex_t{0}; } + + vertex_t get_vertex_partition_last(size_t vertex_partition_idx) const + { + return this->get_number_of_vertices(); + } + + vertex_t get_vertex_partition_size(size_t vertex_partition_idx) const + { + return get_vertex_partition_last(vertex_partition_idx) - + get_vertex_partition_first(vertex_partition_idx); + } + + constexpr bool is_local_vertex_nocheck(vertex_t v) const { return true; } + + constexpr size_t get_number_of_local_adj_matrix_partitions() const { return size_t(1); } + + vertex_t get_number_of_local_adj_matrix_partition_rows() const + { + return this->get_number_of_vertices(); + } + + vertex_t get_number_of_local_adj_matrix_partition_cols() const + { + return this->get_number_of_vertices(); + } + + vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return vertex_t{0}; + } + + vertex_t get_local_adj_matrix_partition_row_last(size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return this->get_number_of_vertices(); + } + + vertex_t get_local_adj_matrix_partition_row_value_start_offset( + size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return vertex_t{0}; + } + + vertex_t get_local_adj_matrix_partition_col_first(size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return vertex_t{0}; + } + + vertex_t get_local_adj_matrix_partition_col_last(size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return this->get_number_of_vertices(); + } + + vertex_t 
get_local_adj_matrix_partition_col_value_start_offset( + size_t adj_matrix_partition_idx) const + { + assert(adj_matrix_partition_idx == 0); + return vertex_t{0}; + } + + bool is_hypergraph_partitioned() const { return false; } + // FIXME: this function is not part of the public stable API.This function is mainly for pattern // accelerator implementation. This function is currently public to support the legacy // implementations directly accessing CSR/CSC data, but this function will eventually become @@ -364,4 +603,4 @@ class graph_view_t class GraphViewBase { public: + using vertex_type = vertex_t; + using edge_type = edge_t; + using weight_type = weight_t; + raft::handle_t *handle; weight_t *edge_data; ///< edge weight diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh new file mode 100644 index 00000000000..53796530f60 --- /dev/null +++ b/cpp/include/matrix_partition_device.cuh @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include + +#include + +namespace cugraph { +namespace experimental { + +template +class matrix_partition_device_base_t { + public: + matrix_partition_device_base_t(edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights) + : offsets_(offsets), indices_(indices), weights_(weights) + { + } + + __device__ thrust::tuple get_local_edges( + vertex_t major_offset) const noexcept + { + auto edge_offset = *(offsets_ + major_offset); + auto local_degree = *(offsets_ + (major_offset + 1)) - edge_offset; + auto indices = indices_ + edge_offset; + auto weights = weights_ != nullptr ? weights_ + edge_offset : nullptr; + return thrust::make_tuple(indices, weights, local_degree); + } + + __device__ edge_t get_local_degree(vertex_t major_offset) const noexcept + { + return *(offsets_ + (major_offset + 1)) - *(offsets_ + major_offset); + } + + private: + // should be trivially copyable to device + edge_t const* offsets_{nullptr}; + vertex_t const* indices_{nullptr}; + weight_t const* weights_{nullptr}; +}; + +template +class matrix_partition_device_t; + +// multi-GPU version +template +class matrix_partition_device_t> + : public matrix_partition_device_base_t { + public: + matrix_partition_device_t(GraphViewType const& graph_view, size_t partition_idx) + : matrix_partition_device_base_t( + graph_view.offsets(partition_idx), + graph_view.indices(partition_idx), + graph_view.weights(partition_idx)), + major_first_(GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_adj_matrix_partition_col_first(partition_idx) + : graph_view.get_local_adj_matrix_partition_row_first(partition_idx)), + major_last_(GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_adj_matrix_partition_col_last(partition_idx) + : graph_view.get_local_adj_matrix_partition_row_last(partition_idx)), + minor_first_(GraphViewType::is_adj_matrix_transposed + ? 
graph_view.get_local_adj_matrix_partition_row_first(partition_idx) + : graph_view.get_local_adj_matrix_partition_col_first(partition_idx)), + minor_last_(GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_adj_matrix_partition_row_last(partition_idx) + : graph_view.get_local_adj_matrix_partition_col_last(partition_idx)), + major_value_start_offset_( + GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_adj_matrix_partition_col_value_start_offset(partition_idx) + : graph_view.get_local_adj_matrix_partition_row_value_start_offset(partition_idx)) + { + } + + typename GraphViewType::vertex_type get_major_value_start_offset() const + { + return major_value_start_offset_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_first() const noexcept + { + return major_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_last() const noexcept + { + return major_last_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_size() const noexcept + { + return major_last_ - major_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_first() const noexcept + { + return minor_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_last() const noexcept + { + return minor_last_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_size() const noexcept + { + return minor_last_ - minor_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_offset_from_major_nocheck( + typename GraphViewType::vertex_type major) const noexcept + { + return major - major_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_offset_from_minor_nocheck( + typename GraphViewType::vertex_type minor) const noexcept + { + return minor - minor_first_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_from_major_offset_nocheck( + typename GraphViewType::vertex_type 
major_offset) const noexcept + { + return major_first_ + major_offset; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_from_minor_offset_nocheck( + typename GraphViewType::vertex_type minor_offset) const noexcept + { + return minor_first_ + minor_offset; + } + + private: + // should be trivially copyable to device + typename GraphViewType::vertex_type major_first_{0}; + typename GraphViewType::vertex_type major_last_{0}; + typename GraphViewType::vertex_type minor_first_{0}; + typename GraphViewType::vertex_type minor_last_{0}; + + typename GraphViewType::vertex_type major_value_start_offset_{0}; +}; + +// single-GPU version +template +class matrix_partition_device_t> + : public matrix_partition_device_base_t { + public: + matrix_partition_device_t(GraphViewType const& graph_view, size_t partition_idx) + : matrix_partition_device_base_t( + graph_view.offsets(), graph_view.indices(), graph_view.weights()), + number_of_vertices_(graph_view.get_number_of_vertices()) + { + assert(partition_idx == 0); + } + + typename GraphViewType::vertex_type get_major_value_start_offset() const + { + return typename GraphViewType::vertex_type{0}; + } + + __host__ __device__ constexpr typename GraphViewType::vertex_type get_major_first() const noexcept + { + return typename GraphViewType::vertex_type{0}; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_last() const noexcept + { + return number_of_vertices_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_size() const noexcept + { + return number_of_vertices_; + } + + __host__ __device__ constexpr typename GraphViewType::vertex_type get_minor_first() const noexcept + { + return typename GraphViewType::vertex_type{0}; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_last() const noexcept + { + return number_of_vertices_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_size() const noexcept + { + return 
number_of_vertices_; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_offset_from_major_nocheck( + typename GraphViewType::vertex_type major) const noexcept + { + return major; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_offset_from_minor_nocheck( + typename GraphViewType::vertex_type minor) const noexcept + { + return minor; + } + + __host__ __device__ typename GraphViewType::vertex_type get_major_from_major_offset_nocheck( + typename GraphViewType::vertex_type major_offset) const noexcept + { + return major_offset; + } + + __host__ __device__ typename GraphViewType::vertex_type get_minor_from_minor_offset_nocheck( + typename GraphViewType::vertex_type minor_offset) const noexcept + { + return minor_offset; + } + + private: + typename GraphViewType::vertex_type number_of_vertices_; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/partition_manager.hpp b/cpp/include/partition_manager.hpp new file mode 100644 index 00000000000..c15aa504084 --- /dev/null +++ b/cpp/include/partition_manager.hpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace partition_2d { + +// default key-naming mechanism: +// +struct key_naming_t { + // simplified key (one per all row subcomms / one per all column sub-comms): + // + key_naming_t(void) + : row_suffix_(std::string("_p_row")), + col_suffix_(std::string("_p_col")), + name_(std::string("comm")) + { + } + + std::string col_name(void) const { return name_ + col_suffix_; } + + std::string row_name(void) const { return name_ + row_suffix_; } + + private: + std::string const row_suffix_; + std::string const col_suffix_; + std::string name_; +}; + +using pair_comms_t = + std::pair, std::shared_ptr>; + +// class responsible for creating 2D partition sub-comms: +// this is instantiated by each worker (processing element, PE) +// for the row/column it belongs to; +// +// naming policy defaults to simplified naming: +// one key per row subcomms, one per column subcomms; +// +template +class subcomm_factory_t { + public: + subcomm_factory_t(raft::handle_t& handle, size_type row_size) + : handle_(handle), row_size_(row_size) + { + init_row_col_comms(); + } + virtual ~subcomm_factory_t(void) {} + + pair_comms_t const& row_col_comms(void) const { return row_col_subcomms_; } + + protected: + virtual void init_row_col_comms(void) + { + name_policy_t key; + raft::comms::comms_t const& communicator = handle_.get_comms(); + + int const rank = communicator.get_rank(); + int row_index = rank / row_size_; + int col_index = rank % row_size_; + + auto row_comm = + std::make_shared(communicator.comm_split(row_index, col_index)); + handle_.set_subcomm(key.row_name(), row_comm); + + auto col_comm = + std::make_shared(communicator.comm_split(col_index, row_index)); + handle_.set_subcomm(key.col_name(), col_comm); + + row_col_subcomms_.first = row_comm; + row_col_subcomms_.second = col_comm; + } + + private: + raft::handle_t& handle_; + size_type row_size_; + 
pair_comms_t row_col_subcomms_; +}; +} // namespace partition_2d +} // namespace cugraph diff --git a/cpp/include/patterns/any_of_adj_matrix_row.cuh b/cpp/include/patterns/any_of_adj_matrix_row.cuh new file mode 100644 index 00000000000..199e7c230ef --- /dev/null +++ b/cpp/include/patterns/any_of_adj_matrix_row.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Check any of graph adjacency matrix row properties satisfy the given predicate. + * + * Returns true if @p row_op returns true for at least once (in any process in multi-GPU), returns + * false otherwise. This function is inspired by thrust::any_of(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam RowOp Type of the unary predicate operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row properties + * for the first (inclusive) row (assigned to this process in multi-GPU). 
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param row_op Unary predicate operator that takes *(@p adj_matrix_row_value_input_first + i) + * (where i = [0, @p graph_view.get_number_of_local_adj_matrix_partition_rows()) and returns either + * true or false. + * @return true If the predicate returns true at least once (in any process in multi-GPU). + * @return false If the predicate never returns true (in any process in multi-GPU). + */ +template +bool any_of_adj_matrix_row(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + RowOp row_op) +{ + // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved + auto count = thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + adj_matrix_row_value_input_first, + adj_matrix_row_value_input_first + graph_view.get_number_of_local_adj_matrix_partition_rows(), + row_op); + if (GraphViewType::is_multi_gpu) { + count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); + } + return (count > 0); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh new file mode 100644 index 00000000000..e8e11b85913 --- /dev/null +++ b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +void copy_to_matrix_major(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + MatrixMajorValueOutputIterator matrix_major_value_output_first) +{ + if (GraphViewType::is_multi_gpu) { + if (graph_view.is_hypergraph_partitioned()) { + CUGRAPH_FAIL("unimplemented."); + } else { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 
0 : displacements[i - 1] + rx_counts[i - 1]; + } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_major_value_output_first, + rx_counts, + displacements, + handle.get_stream()); + } + } else { + assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed + ? graph_view.get_number_of_adj_matrix_local_cols() + : graph_view.get_number_of_adj_matrix_local_rows()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + matrix_major_value_output_first); + } +} + +template +void copy_to_matrix_major(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + MatrixMajorValueOutputIterator matrix_major_value_output_first) +{ + using vertex_t = typename GraphViewType::vertex_type; + + if (GraphViewType::is_multi_gpu) { + if (graph_view.is_hypergraph_partitioned()) { + CUGRAPH_FAIL("unimplemented."); + } else { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + auto rx_counts = + host_scalar_allgather(row_comm, + static_cast(thrust::distance(vertex_first, vertex_last)), + handle.get_stream()); + std::vector displacements(row_comm_size, size_t{0}); + std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, displacements.begin() + 1); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < row_comm_size; ++i) { + rmm::device_uvector 
rx_vertices(rx_counts[i], handle.get_stream()); + auto rx_tmp_buffer = + allocate_comm_buffer::value_type>( + rx_counts[i], handle.get_stream()); + auto rx_value_first = get_comm_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + if (i == row_comm_rank) { + // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a + // permutation iterator (and directly gathers to the internal buffer) + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_first, + vertex_last, + vertex_value_input_first, + rx_value_first); + } + + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast( + row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast( + row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_major_offset_from_major_nocheck(v); + }); + // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and + // directly scatters from the internal buffer) + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_major_value_output_first); + } + } + } else { + assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed + ? 
graph_view.get_number_of_adj_matrix_local_cols() + : graph_view.get_number_of_adj_matrix_local_rows()); + auto val_first = thrust::make_permutation_iterator(vertex_value_input_first, vertex_first); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + val_first, + val_first + thrust::distance(vertex_first, vertex_last), + vertex_first, + matrix_major_value_output_first); + } +} + +template +void copy_to_matrix_minor(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + MatrixMinorValueOutputIterator matrix_minor_value_output_first) +{ + if (GraphViewType::is_multi_gpu) { + if (graph_view.is_hypergraph_partitioned()) { + CUGRAPH_FAIL("unimplemented."); + } else { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph + // partitioning + auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; + auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; + auto constexpr tuple_size = thrust_tuple_size_or_one< + typename std::iterator_traits::value_type>::value; + std::vector requests(2 * tuple_size); + device_isend( + comm, + vertex_value_input_first, + static_cast(graph_view.get_number_of_local_vertices()), + comm_dst_rank, + int{0} /* base_tag */, + requests.data()); + device_irecv( + comm, + matrix_minor_value_output_first + + (graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + col_comm_rank) - + 
graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size)), + static_cast(graph_view.get_vertex_partition_size(comm_src_rank)), + comm_src_rank, + int{0} /* base_tag */, + requests.data() + tuple_size); + // FIXME: this waitall can fail if MatrixMinorValueOutputIterator is a discard iterator or a + // zip iterator having one or more discard iterator + comm.waitall(requests.size(), requests.data()); + + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + for (int i = 0; i < col_comm_size; ++i) { + auto offset = graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size + i) - + graph_view.get_vertex_partition_first(row_comm_rank * col_comm_size); + auto count = graph_view.get_vertex_partition_size(row_comm_rank * col_comm_size + i); + device_bcast(col_comm, + matrix_minor_value_output_first + offset, + matrix_minor_value_output_first + offset, + count, + i, + handle.get_stream()); + } + } + } else { + assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed + ? 
graph_view.get_number_of_adj_matrix_local_rows() + : graph_view.get_number_of_adj_matrix_local_cols()); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + matrix_minor_value_output_first); + } +} + +template +void copy_to_matrix_minor(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + MatrixMinorValueOutputIterator matrix_minor_value_output_first) +{ + using vertex_t = typename GraphViewType::vertex_type; + + if (GraphViewType::is_multi_gpu) { + if (graph_view.is_hypergraph_partitioned()) { + CUGRAPH_FAIL("unimplemented."); + } else { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + // FIXME: this P2P is unnecessary if apply the same partitioning scheme regardless of + // hypergraph partitioning is applied or not + auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank; + auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size; + auto constexpr tuple_size = thrust_tuple_size_or_one< + typename std::iterator_traits::value_type>::value; + + std::vector count_requests(2); + auto tx_count = thrust::distance(vertex_first, vertex_last); + auto rx_count = tx_count; + comm.isend(&tx_count, 1, comm_dst_rank, 0 /* tag */, count_requests.data()); + comm.irecv(&rx_count, 1, comm_src_rank, 0 /* tag */, count_requests.data() + 1); + comm.waitall(count_requests.size(), 
count_requests.data()); + + auto src_tmp_buffer = + allocate_comm_buffer::value_type>( + tx_count, handle.get_stream()); + auto src_value_first = + get_comm_buffer_begin::value_type>( + src_tmp_buffer); + + rmm::device_uvector dst_vertices(rx_count, handle.get_stream()); + auto dst_tmp_buffer = + allocate_comm_buffer::value_type>( + rx_count, handle.get_stream()); + auto dst_value_first = + get_comm_buffer_begin::value_type>( + dst_tmp_buffer); + + thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_first, + vertex_last, + vertex_value_input_first, + src_value_first); + + std::vector value_requests(2 * (1 + tuple_size)); + device_isend( + comm, vertex_first, tx_count, comm_dst_rank, int{0} /* base_tag */, value_requests.data()); + device_isend(comm, + src_value_first, + tx_count, + comm_dst_rank, + int{1} /* base_tag */, + value_requests.data() + 1); + device_irecv( + comm, + dst_vertices.begin(), + rx_count, + comm_src_rank, + int{0} /* base_tag */, + value_requests.data() + (1 + tuple_size)); + device_irecv( + comm, + dst_value_first, + rx_count, + comm_src_rank, + int{0} /* base_tag */, + value_requests.data() + ((1 + tuple_size) + 1)); + // FIXME: this waitall can fail if MatrixMinorValueOutputIterator is a discard iterator or a + // zip iterator having one or more discard iterator + comm.waitall(value_requests.size(), value_requests.data()); + + // FIXME: now we can clear tx_tmp_buffer + + auto rx_counts = host_scalar_allgather(col_comm, rx_count, handle.get_stream()); + std::vector displacements(col_comm_size, size_t{0}); + std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, displacements.begin() + 1); + + matrix_partition_device_t matrix_partition(graph_view, 0); + for (int i = 0; i < col_comm_size; ++i) { + rmm::device_uvector rx_vertices(rx_counts[i], handle.get_stream()); + auto rx_tmp_buffer = + allocate_comm_buffer::value_type>( + rx_counts[i], handle.get_stream()); + auto rx_value_first = get_comm_buffer_begin< 
+ typename std::iterator_traits::value_type>(rx_tmp_buffer); + + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + device_bcast(col_comm, + dst_vertices.begin(), + rx_vertices.begin(), + rx_counts[i], + i, + handle.get_stream()); + device_bcast( + col_comm, dst_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); + + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), [matrix_partition] __device__(auto v) { + return matrix_partition.get_minor_offset_from_minor_nocheck(v); + }); + + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + rx_value_first, + rx_value_first + rx_counts[i], + map_first, + matrix_minor_value_output_first); + } + } + } else { + assert(graph_view.get_number_of_local_vertices() == + graph_view.get_number_of_adj_matrix_local_rows()); + auto val_first = thrust::make_permutation_iterator(vertex_value_input_first, vertex_first); + thrust::scatter(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + val_first, + val_first + thrust::distance(vertex_first, vertex_last), + vertex_first, + matrix_minor_value_output_first); + } +} + +} // namespace detail + +/** + * @brief Copy vertex property values to the corresponding graph adjacency matrix row property + * variables. + * + * This version fills the entire set of graph adjacency matrix row property values. This function is + * inspired by thrust::copy(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row + * output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. 
+ * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output + * property variables for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_rows(). + */ +template +void copy_to_adj_matrix_row(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +{ + if (GraphViewType::is_adj_matrix_transposed) { + copy_to_matrix_minor( + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + } else { + copy_to_matrix_major( + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + } +} + +/** + * @brief Copy vertex property values to the corresponding graph adjacency matrix row property + * variables. + * + * This version fills only a subset of graph adjacency matrix row property values. [@p vertex_first, + * @p vertex_last) specifies the vertices with new values to be copied to graph adjacency matrix row + * property variables. This function is inspired by thrust::copy(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexIterator Type of the iterator for vertex identifiers. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row + * output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_first Iterator pointing to the first (inclusive) vertex with new values to be + * copied. v in [vertex_first, vertex_last) should be distinct (and should belong to this process in + * multi-GPU), otherwise undefined behavior + * @param vertex_last Iterator pointing to the last (exclusive) vertex with new values to be copied. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output + * property variables for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_rows(). + */ +template +void copy_to_adj_matrix_row(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +{ + if (GraphViewType::is_adj_matrix_transposed) { + copy_to_matrix_minor(handle, + graph_view, + vertex_first, + vertex_last, + vertex_value_input_first, + adj_matrix_row_value_output_first); + } else { + copy_to_matrix_major(handle, + graph_view, + vertex_first, + vertex_last, + vertex_value_input_first, + adj_matrix_row_value_output_first); + } +} + +/** + * @brief Copy vertex property values to the corresponding graph adjacency matrix column property + * variables. + * + * This version fills the entire set of graph adjacency matrix column property values. 
This function + * is inspired by thrust::copy(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column + * output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output + * property variables for the first (inclusive) column (assigned to this process in multi-GPU). + * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_cols(). + */ +template +void copy_to_adj_matrix_col(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +{ + if (GraphViewType::is_adj_matrix_transposed) { + copy_to_matrix_major( + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + } else { + copy_to_matrix_minor( + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + } +} + +/** + * @brief Copy vertex property values to the corresponding graph adjacency matrix column property + * variables. + * + * This version fills only a subset of graph adjacency matrix column property values. 
[@p + * vertex_first, @p vertex_last) specifies the vertices with new values to be copied to graph + * adjacency matrix column property variables. This function is inspired by thrust::copy(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexIterator Type of the iterator for vertex identifiers. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column + * output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_first Iterator pointing to the first (inclusive) vertex with new values to be + * copied. v in [vertex_first, vertex_last) should be distinct (and should belong to this process in + * multi-GPU), otherwise undefined behavior + * @param vertex_last Iterator pointing to the last (exclusive) vertex with new values to be copied. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output + * property variables for the first (inclusive) column (assigned to this process in multi-GPU). + * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_cols(). 
+ */ +template +void copy_to_adj_matrix_col(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +{ + if (GraphViewType::is_adj_matrix_transposed) { + copy_to_matrix_major(handle, + graph_view, + vertex_first, + vertex_last, + vertex_value_input_first, + adj_matrix_col_value_output_first); + } else { + copy_to_matrix_minor(handle, + graph_view, + vertex_first, + vertex_last, + vertex_value_input_first, + adj_matrix_col_value_output_first); + } +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh new file mode 100644 index 00000000000..7737a6b875c --- /dev/null +++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh @@ -0,0 +1,643 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 128; + +#if 0 +// FIXME: delete this once we verify that the thrust replace in for_all_major_for_all_nbr_low_degree is no slower than the original for loop based imoplementation +template +__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) +{ + lhs = plus_edge_op_result(lhs, rhs); +} + +template +__device__ std::enable_if_t accumulate_edge_op_result(T& lhs, T const& rhs) +{ + atomic_add(&lhs, rhs); +} +#endif + +template +__global__ void for_all_major_for_all_nbr_low_degree( + matrix_partition_device_t matrix_partition, + typename GraphViewType::vertex_type row_first, + typename GraphViewType::vertex_type row_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultValueOutputIterator result_value_output_first, + EdgeOp e_op, + T init /* relevent only if update_major == true */) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = T; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = + static_cast(row_first - matrix_partition.get_major_first()) + static_cast(tid); + + while (idx < static_cast(row_last - matrix_partition.get_major_first())) { + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = + matrix_partition.get_local_edges(static_cast(idx)); +#if 1 + auto transform_op = [&matrix_partition, + &adj_matrix_row_value_input_first, + 
&adj_matrix_col_value_input_first, + &e_op, + idx, + indices, + weights] __device__(auto i) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + return evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + }; + + if (update_major) { + *(result_value_output_first + idx) = thrust::transform_reduce( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); + } else { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, indices, &result_value_output_first, &transform_op] __device__(auto i) { + auto e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); + }); + } +#else + // FIXME: delete this once we verify that the code above is not slower than this. + e_op_result_t e_op_result_sum{init}; // relevent only if update_major == true + for (edge_t i = 0; i < local_degree; ++i) { + auto minor = indices[i]; + auto weight = weights != nullptr ? 
weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (update_major) { + accumulate_edge_op_result(e_op_result_sum, e_op_result); + } else { + accumulate_edge_op_result(*(result_value_output_first + minor_offset), + e_op_result); + } + } + if (update_major) { *(result_value_output_first + idx) = e_op_result_sum; } +#endif + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ void for_all_major_for_all_nbr_mid_degree( + matrix_partition_device_t matrix_partition, + typename GraphViewType::vertex_type row_first, + typename GraphViewType::vertex_type row_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultValueOutputIterator result_value_output_first, + EdgeOp e_op, + T init /* relevent only if update_major == true */) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = T; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(copy_v_transform_reduce_nbr_for_all_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(row_first - matrix_partition.get_major_first()) + + 
static_cast(tid / raft::warp_size()); + + while (idx < static_cast(row_last - matrix_partition.get_major_first())) { + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); + auto e_op_result_sum = + lane_id == 0 ? init : e_op_result_t{}; // relevent only if update_major == true + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? 
static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (update_major) { + e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + } else { + atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); + } + } + if (update_major) { + e_op_result_sum = warp_reduce_edge_op_result().compute(e_op_result_sum); + if (lane_id == 0) { *(result_value_output_first + idx) = e_op_result_sum; } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ void for_all_major_for_all_nbr_high_degree( + matrix_partition_device_t matrix_partition, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + ResultValueOutputIterator result_value_output_first, + EdgeOp e_op, + T init /* relevent only if update_major == true */) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = T; + + auto idx = static_cast(row_first - matrix_partition.get_major_first()) + + static_cast(blockIdx.x); + + while (idx < static_cast(row_last - matrix_partition.get_major_first())) { + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); + auto e_op_result_sum = + threadIdx.x == 0 ? init : e_op_result_t{}; // relevent only if update_major == true + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + auto minor = indices[i]; + auto weight = weights != nullptr ? 
weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (update_major) { + e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + } else { + atomic_accumulate_edge_op_result(result_value_output_first + minor_offset, e_op_result); + } + } + if (update_major) { + e_op_result_sum = + block_reduce_edge_op_result() + .compute(e_op_result_sum); + if (threadIdx.x == 0) { *(result_value_output_first + idx) = e_op_result_sum; } + } + + idx += gridDim.x; + } +} + +template +void copy_v_transform_reduce_nbr(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op, + T init, + VertexValueOutputIterator vertex_value_output_first) +{ + using vertex_t = typename GraphViewType::vertex_type; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + auto loop_count = size_t{1}; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + loop_count = graph_view.is_hypergraph_partitioned() + ? 
graph_view.get_number_of_local_adj_matrix_partitions() + : static_cast(row_comm_size); + } + + for (size_t i = 0; i < loop_count; ++i) { + matrix_partition_device_t matrix_partition( + graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + + auto tmp_buffer_size = vertex_t{0}; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + tmp_buffer_size = + in ? GraphViewType::is_adj_matrix_transposed + ? graph_view.is_hypergraph_partitioned() + ? matrix_partition.get_major_size() + : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i) + : matrix_partition.get_minor_size() + : GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_minor_size() + : graph_view.is_hypergraph_partitioned() + ? matrix_partition.get_major_size() + : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + } + auto tmp_buffer = allocate_comm_buffer(tmp_buffer_size, handle.get_stream()); + auto buffer_first = get_comm_buffer_begin(tmp_buffer); + + auto local_init = init; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + if (in == GraphViewType::is_adj_matrix_transposed) { + local_init = graph_view.is_hypergraph_partitioned() ? (col_comm_rank == 0) ? init : T{} + : (row_comm_rank == 0) ? init : T{}; + } else { + local_init = graph_view.is_hypergraph_partitioned() ? (row_comm_rank == 0) ? init : T{} + : (col_comm_rank == 0) ? 
init : T{}; + } + } + + if (in != GraphViewType::is_adj_matrix_transposed) { + if (GraphViewType::is_multi_gpu) { + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_first, + buffer_first + tmp_buffer_size, + local_init); + } else { + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_output_first, + vertex_value_output_first + graph_view.get_number_of_local_vertices(), + local_init); + } + } + + int comm_root_rank = 0; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank + : col_comm_rank * row_comm_size + i; + } + + raft::grid_1d_thread_t update_grid(graph_view.get_vertex_partition_size(comm_root_rank), + detail::copy_v_transform_reduce_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + vertex_t row_value_input_offset = + GraphViewType::is_adj_matrix_transposed + ? 0 + : graph_view.is_hypergraph_partitioned() + ? matrix_partition.get_major_value_start_offset() + : graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size); + vertex_t col_value_input_offset = + GraphViewType::is_adj_matrix_transposed + ? graph_view.is_hypergraph_partitioned() + ? 
matrix_partition.get_major_value_start_offset() + : graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size) + : 0; + + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + graph_view.get_vertex_partition_first(comm_root_rank), + graph_view.get_vertex_partition_last(comm_root_rank), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + buffer_first, + e_op, + local_init); + } else { + detail::for_all_major_for_all_nbr_low_degree + <<>>( + matrix_partition, + graph_view.get_vertex_partition_first(comm_root_rank), + graph_view.get_vertex_partition_last(comm_root_rank), + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + vertex_value_output_first, + e_op, + local_init); + } + + if (GraphViewType::is_multi_gpu) { + if (in == GraphViewType::is_adj_matrix_transposed) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + if (graph_view.is_hypergraph_partitioned()) { + device_reduce( + col_comm, + buffer_first, + vertex_value_output_first, + static_cast(graph_view.get_vertex_partition_size(i * row_comm_size + i)), + raft::comms::op_t::SUM, + i, + handle.get_stream()); + } else { + for (int j = 0; j < row_comm_size; ++j) { + auto comm_root_rank = col_comm_rank * row_comm_size + j; + device_reduce( + row_comm, + buffer_first + (graph_view.get_vertex_partition_first(comm_root_rank) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size)), + vertex_value_output_first, + static_cast(graph_view.get_vertex_partition_size(comm_root_rank)), + 
raft::comms::op_t::SUM, + j, + handle.get_stream()); + } + } + } else { + CUGRAPH_FAIL("unimplemented."); + } + } + } +} + +} // namespace detail + +/** + * @brief Iterate over the incoming edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce() (iteration over the incoming edges part) + * and thrust::copy() (update vertex properties part, take transform_reduce output as copy input). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam T Type of the initial value for reduction over the incoming edges. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_adj_matrix_local_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). 
+ * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_cols(). + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + + * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, + * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * @param init Initial value to be added to the reduced @e_op return values for each vertex. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.get_number_of_local_vertices(). + */ +template +void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op, + T init, + VertexValueOutputIterator vertex_value_output_first) +{ + detail::copy_v_transform_reduce_nbr(handle, + graph_view, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + e_op, + init, + vertex_value_output_first); +} + +/** + * @brief Iterate over the outgoing edges to update vertex properties. + * + * This function is inspired by thrust::transfrom_reduce() (iteration over the outgoing edges + * part) and thrust::copy() (update vertex properties part, take transform_reduce output as copy + * input). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. 
+ * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam T Type of the initial value for reduction over the outgoing edges. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + * + + * @p graph_view.get_number_of_adj_matrix_local_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). + * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p + * adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_adj_matrix_local_cols(). + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional + * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p + * adj_matrix_col_value_input_first + j) (where i is in [0, + * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, + * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). 
`vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.get_number_of_local_vertices(). + */ +template +void copy_v_transform_reduce_out_nbr( + raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op, + T init, + VertexValueOutputIterator vertex_value_output_first) +{ + detail::copy_v_transform_reduce_nbr(handle, + graph_view, + adj_matrix_row_value_input_first, + adj_matrix_col_value_input_first, + e_op, + init, + vertex_value_output_first); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/count_if_e.cuh b/cpp/include/patterns/count_if_e.cuh new file mode 100644 index 00000000000..04f22033f91 --- /dev/null +++ b/cpp/include/patterns/count_if_e.cuh @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr count_if_e_for_all_block_size = 128; + +// FIXME: function names conflict if included with transform_reduce_e.cuh +template +__global__ void for_all_major_for_all_nbr_low_degree( + matrix_partition_device_t matrix_partition, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + typename GraphViewType::edge_type* block_counts, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + edge_t count{0}; + while (idx < static_cast(matrix_partition.get_major_size())) { + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); +#if 1 + count += thrust::count_if( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, + &adj_matrix_row_value_input_first, + &adj_matrix_col_value_input_first, + &e_op, + idx, + indices, + weights] __device__(auto i) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : 1.0; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? 
matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + + return e_op_result; + }); +#else + // FIXME: delete this once we verify that the code above is not slower than this. + for (vertex_t i = 0; i < local_degree; ++i) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : 1.0; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (e_op_result) { count++; } + } +#endif + idx += gridDim.x * blockDim.x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + count = BlockReduce(temp_storage).Sum(count); + if (threadIdx.x == 0) { *(block_counts + blockIdx.x) = count; } +} + +} // namespace detail + +/** + * @brief Count the number of edges that satisfies the given predicate. + * + * This function is inspired by thrust::count_if(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. 
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). + * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + + * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, + * get_number_of_local_adj_matrix_partition_cols())) and returns true if this edge should be + * included in the returned count. + * @return GraphViewType::edge_type Number of times @p e_op returned true. 
+ */ +template +typename GraphViewType::edge_type count_if_e( + raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + + edge_t count{0}; + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + auto row_value_input_offset = + GraphViewType::is_adj_matrix_transposed ? 0 : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = + GraphViewType::is_adj_matrix_transposed ? matrix_partition.get_major_value_start_offset() : 0; + + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::count_if_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + rmm::device_vector block_counts(update_grid.num_blocks); + + detail::for_all_major_for_all_nbr_low_degree<<>>( + matrix_partition, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + block_counts.data().get(), + e_op); + + // FIXME: we have several options to implement this. With cooperative group support + // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within + // the previous kernel. 
Using atomics at the end of the previous kernel is another option + // (sequentialization due to atomics may not be bad as different blocks may reach the + // synchronization point in varying timings and the number of SMs is not very big) + count += thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + block_counts.begin(), + block_counts.end(), + edge_t{0}, + thrust::plus()); + } + + if (GraphViewType::is_multi_gpu) { + count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); + } + + return count; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/count_if_v.cuh b/cpp/include/patterns/count_if_v.cuh new file mode 100644 index 00000000000..c90b259cdde --- /dev/null +++ b/cpp/include/patterns/count_if_v.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Count the number of vertices that satisfies the given predicate. + * + * This version iterates over the entire set of graph vertices. This function is inspired by + * thrust::count_if(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. 
+ * @tparam VertexOp Type of the unary predicate operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param v_op Unary operator takes *(@p vertex_value_input_first + i) (where i is [0, @p + * graph_view.get_number_of_local_vertices())) and returns true if this vertex should be + * included in the returned count. + * @return GraphViewType::vertex_type Number of times @p v_op returned true. + */ +template +typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + VertexOp v_op) +{ + auto count = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + v_op); + if (GraphViewType::is_multi_gpu) { + count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); + } + return count; +} + +/** + * @brief Count the number of vertices that satisfies the given predicate. + * + * This version (conceptually) iterates over only a subset of the graph vertices. This function + * actually works as thrust::count_if() on [@p input_first, @p input_last) (followed by + * inter-process reduction in multi-GPU). @p input_last - @p input_first (or the sum of @p + * input_last - @p input_first values in multi-GPU) should not overflow GraphViewType::vertex_type. + * + * @tparam GraphViewType Type of the passed non-owning graph object. 
+ * @tparam InputIterator Type of the iterator for input values. + * @tparam VertexOp VertexOp Type of the unary predicate operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param input_first Iterator pointing to the beginning (inclusive) of the values to be passed to + * @p v_op. + * @param input_last Iterator pointing to the end (exclusive) of the values to be passed to @p v_op. + * @param v_op Unary operator takes *(@p input_first + i) (where i is [0, @p input_last - @p + * input_first)) and returns true if this vertex should be included in the returned count. + * @return GraphViewType::vertex_type Number of times @p v_op returned true. + */ +template +typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + InputIterator input_first, + InputIterator input_last, + VertexOp v_op) +{ + auto count = thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), input_first, input_last, v_op); + if (GraphViewType::is_multi_gpu) { + count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream()); + } + return count; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/edge_op_utils.cuh b/cpp/include/patterns/edge_op_utils.cuh new file mode 100644 index 00000000000..58fb31c7605 --- /dev/null +++ b/cpp/include/patterns/edge_op_utils.cuh @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +template +struct is_valid_edge_op { + static constexpr bool value = false; +}; + +template +struct is_valid_edge_op< + ResultOfEdgeOp, + typename std::conditional::type> { + static constexpr bool valid = true; +}; + +template +struct evaluate_edge_op { + using vertex_type = typename GraphViewType::vertex_type; + using weight_type = typename GraphViewType::weight_type; + using row_value_type = typename std::iterator_traits::value_type; + using col_value_type = typename std::iterator_traits::value_type; + + template + __device__ std::enable_if_t>::valid, + typename std::result_of::type> + compute(V r, V c, W w, R rv, C cv, E e) + { + return e(r, c, w, rv, cv); + } + + template + __device__ std::enable_if_t>::valid, + typename std::result_of::type> + compute(V r, V c, W w, R rv, C cv, E e) + { + return e(r, c, rv, cv); + } +}; + +template +__host__ __device__ std::enable_if_t::value, T> plus_edge_op_result( + T const& lhs, T const& rhs) +{ + return lhs + rhs; +} + +template +__host__ __device__ std::enable_if_t::value, T> plus_edge_op_result(T const& lhs, + T const& rhs) +{ + return plus_thrust_tuple()(lhs, rhs); +} + +template +__device__ std::enable_if_t::value, void> +atomic_accumulate_edge_op_result(Iterator iter, T const& value) +{ + // no-op +} + +template +__device__ + std::enable_if_t::value_type, T>::value && + std::is_arithmetic::value, + void> + 
atomic_accumulate_edge_op_result(Iterator iter, T const& value) +{ + atomicAdd(&(thrust::raw_reference_cast(*iter)), value); +} + +template +__device__ + std::enable_if_t::value_type>::value && + is_thrust_tuple::value, + void> + atomic_accumulate_edge_op_result(Iterator iter, T const& value) +{ + static_assert(thrust::tuple_size::value_type>::value == + thrust::tuple_size::value); + atomic_accumulate_thrust_tuple()(iter, value); + return; +} + +template +struct warp_reduce_edge_op_result { // only warp lane 0 has a valid result + template + __device__ std::enable_if_t::value, T> compute(T const& edge_op_result) + { + auto ret = edge_op_result; + for (auto offset = raft::warp_size() / 2; offset > 0; offset /= 2) { + ret += __shfl_down_sync(raft::warp_full_mask(), ret, offset); + } + return ret; + } + + template + __device__ std::enable_if_t::value, T> compute(T const& edge_op_result) + { + return warp_reduce_thrust_tuple()(edge_op_result); + } +}; + +template +struct block_reduce_edge_op_result { + template + __device__ std::enable_if_t::value, T> compute(T const& edge_op_result) + { + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + return BlockReduce(temp_storage).Sum(edge_op_result); + } + + template + __device__ std::enable_if_t::value, T> compute(T const& edge_op_result) + { + return block_reduce_thrust_tuple()(edge_op_result); + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/reduce_op.cuh b/cpp/include/patterns/reduce_op.cuh new file mode 100644 index 00000000000..e9011914292 --- /dev/null +++ b/cpp/include/patterns/reduce_op.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cugraph { +namespace experimental { +namespace reduce_op { + +// reducing N elements, any element can be a valid output. +template +struct any { + using type = T; + static constexpr bool pure_function = true; // this can be called in any process + + __host__ __device__ T operator()(T const& lhs, T const& rhs) const { return lhs; } +}; + +// reducing N elements (operator < should be defined between any two elements), the minimum element +// should be selected. +template +struct min { + using type = T; + static constexpr bool pure_function = true; // this can be called in any process + + __host__ __device__ T operator()(T const& lhs, T const& rhs) const + { + return lhs < rhs ? lhs : rhs; + } +}; + +} // namespace reduce_op +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/reduce_v.cuh b/cpp/include/patterns/reduce_v.cuh new file mode 100644 index 00000000000..12224dc55f4 --- /dev/null +++ b/cpp/include/patterns/reduce_v.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Reduce the vertex properties. + * + * This version iterates over the entire set of graph vertices. This function is inspired by + * thrust::reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param init Initial value to be added to the reduced input vertex properties. + * @return T Reduction of the input vertex properties. + */ +template +T reduce_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + T init) +{ + auto ret = thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + init); + if (GraphViewType::is_multi_gpu) { + ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + } + return ret; +} + +/** + * @brief Reduce the vertex properties. + * + * This version (conceptually) iterates over only a subset of the graph vertices. 
This function + * actually works as thrust::reduce() on [@p input_first, @p input_last) (followed by + * inter-process reduction in multi-GPU). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam InputIterator Type of the iterator for input values. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param input_first Iterator pointing to the beginning (inclusive) of the values to be reduced. + * @param input_last Iterator pointing to the end (exclusive) of the values to be reduced. + * @param init Initial value to be added to the reduced input vertex properties. + * @return T Reduction of the input vertex properties. + */ +template +T reduce_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + InputIterator input_first, + InputIterator input_last, + T init) +{ + auto ret = thrust::reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), input_first, input_last, init); + if (GraphViewType::is_multi_gpu) { + ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + } + return ret; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_e.cuh b/cpp/include/patterns/transform_reduce_e.cuh new file mode 100644 index 00000000000..3f334ceff00 --- /dev/null +++ b/cpp/include/patterns/transform_reduce_e.cuh @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr transform_reduce_e_for_all_block_size = 128; + +template +__global__ void for_all_major_for_all_nbr_low_degree( + matrix_partition_device_t matrix_partition, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BlockResultIterator block_result_first, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + using e_op_result_t = typename std::iterator_traits::value_type; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = static_cast(tid); + + e_op_result_t e_op_result_sum{}; + while (idx < static_cast(matrix_partition.get_major_size())) { + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_degree{}; + thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(idx); +#if 1 + auto sum = thrust::transform_reduce( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + [&matrix_partition, + &adj_matrix_row_value_input_first, + &adj_matrix_col_value_input_first, + &e_op, + idx, + indices, + weights] __device__(auto i) { + auto minor = indices[i]; + auto weight = 
weights != nullptr ? weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? static_cast(idx) : minor_offset; + return evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + }, + e_op_result_t{}, + [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); + + e_op_result_sum = plus_edge_op_result(e_op_result_sum, sum); +#else + // FIXME: delete this once we verify that the code above is not slower than this. + for (vertex_t i = 0; i < local_degree; ++i) { + auto minor = indices[i]; + auto weight = weights != nullptr ? weights[i] : weight_t{1.0}; + auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor); + auto row = GraphViewType::is_adj_matrix_transposed + ? minor + : matrix_partition.get_major_from_major_offset_nocheck(idx); + auto col = GraphViewType::is_adj_matrix_transposed + ? matrix_partition.get_major_from_major_offset_nocheck(idx) + : minor; + auto row_offset = + GraphViewType::is_adj_matrix_transposed ? minor_offset : static_cast(idx); + auto col_offset = + GraphViewType::is_adj_matrix_transposed ? 
static_cast(idx) : minor_offset; + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + e_op_result_sum = plus_edge_op_result(e_op_result_sum, e_op_result); + } +#endif + idx += gridDim.x * blockDim.x; + } + + e_op_result_sum = + block_reduce_edge_op_result().compute( + e_op_result_sum); + if (threadIdx.x == 0) { *(block_result_first + blockIdx.x) = e_op_result_sum; } +} + +} // namespace detail + +/** + * @brief Iterate over the entire set of edges and reduce @p edge_op outputs. + * + * This function is inspired by thrust::transform_reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). 
+ * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + + * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, + * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * @param init Initial value to be added to the transform-reduced input vertex properties. + * @return T Reduction of the @p edge_op outputs. + */ +template +T transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op, + T init) +{ + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + using vertex_t = typename GraphViewType::vertex_type; + + T result{}; + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_device_t matrix_partition(graph_view, i); + auto row_value_input_offset = + GraphViewType::is_adj_matrix_transposed ? 0 : matrix_partition.get_major_value_start_offset(); + auto col_value_input_offset = + GraphViewType::is_adj_matrix_transposed ? 
matrix_partition.get_major_value_start_offset() : 0; + + raft::grid_1d_thread_t update_grid(matrix_partition.get_major_size(), + detail::transform_reduce_e_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + rmm::device_vector block_results(update_grid.num_blocks); + + detail::for_all_major_for_all_nbr_low_degree<<>>( + matrix_partition, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first + col_value_input_offset, + block_results.data(), + e_op); + + // FIXME: we have several options to implement this. With cooperative group support + // (https://devblogs.nvidia.com/cooperative-groups/), we can run this synchronization within the + // previous kernel. Using atomics at the end of the previous kernel is another option + // (sequentialization due to atomics may not be bad as different blocks may reach the + // synchronization point in varying timings and the number of SMs is not very big) + auto partial_result = + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + block_results.begin(), + block_results.end(), + T(), + [] __device__(auto lhs, auto rhs) { return plus_edge_op_result(lhs, rhs); }); + + result = plus_edge_op_result(result, partial_result); + } + + if (GraphViewType::is_multi_gpu) { + result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); + } + + return plus_edge_op_result(init, result); +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_v.cuh b/cpp/include/patterns/transform_reduce_v.cuh new file mode 100644 index 00000000000..02538c36f47 --- /dev/null +++ b/cpp/include/patterns/transform_reduce_v.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Apply an operator to the vertex properties and reduce. + * + * This version iterates over the entire set of graph vertices. This function is inspired by + * thrust::transform_reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam VertexOp Type of the unary vertex operator. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param v_op Unary operator takes *(@p vertex_value_input_first + i) (where i is [0, @p + * graph_view.get_number_of_local_vertices())) and returns a transformed value to be reduced. + * @param init Initial value to be added to the transform-reduced input vertex properties. + * @return T Reduction of the @p v_op outputs. 
+ */ +template +T transform_reduce_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + VertexOp v_op, + T init) +{ + auto ret = + thrust::transform_reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_value_input_first, + vertex_value_input_first + graph_view.get_number_of_local_vertices(), + v_op, + init, + thrust::plus()); + if (GraphViewType::is_multi_gpu) { + ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + } + return ret; +} + +/** + * @brief Apply an operator to the vertex properties and reduce. + * + * This version (conceptually) iterates over only a subset of the graph vertices. This function + * actually works as thrust::transform_reduce() on [@p input_first, @p input_last) (followed by + * inter-process reduction in multi-GPU). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam InputIterator Type of the iterator for input values. + * @tparam VertexOp + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param input_first Iterator pointing to the beginning (inclusive) of the values to be passed to + * @p v_op. + * @param input_last Iterator pointing to the end (exclusive) of the values to be passed to @p v_op. + * @param v_op Unary operator takes *(@p input_first + i) (where i is [0, @p input_last - @p + * input_first)) and returns a transformed value to be reduced. + * @param init Initial value to be added to the transform-reduced input vertex properties. + * @return T Reduction of the @p v_op outputs. 
+ */ +template +T transform_reduce_v(raft::handle_t const& handle, + GraphViewType const& graph_view, + InputIterator input_first, + InputIterator input_last, + VertexOp v_op, + T init) +{ + auto ret = + thrust::transform_reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_first, + input_last, + v_op, + init, + thrust::plus()); + if (GraphViewType::is_multi_gpu) { + ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + } + return ret; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh b/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh new file mode 100644 index 00000000000..f5af03d647c --- /dev/null +++ b/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +/** + * @brief Apply an operator to the matching vertex and adjacency matrix row properties and reduce. + * + * i'th vertex matches with the i'th row in the graph adjacency matrix. @p v_op takes vertex + * properties and adjacency matrix row properties for the matching row, and @p v_op outputs are + * reduced. This function is inspired by thrust::transform_reduce(). 
+ * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam VertexOp Type of the binary vertex operator. + * @tparam T Type of the initial value. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param v_op Binary operator takes *(@p vertex_value_input_first + i) and *(@p + * adj_matrix_row_value_input_first + j) (where i and j are set for a vertex and the matching row) + * and returns a transformed value to be reduced. + * @param init Initial value to be added to the transform-reduced input vertex properties. + * @return T Reduction of the @p v_op outputs. 
+ */ +template +T transform_reduce_v_with_adj_matrix_row( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + VertexOp v_op, + T init) +{ + T ret{}; + + auto vertex_first = graph_view.get_local_vertex_first(); + auto vertex_last = graph_view.get_local_vertex_last(); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + auto row_first = graph_view.get_local_adj_matrix_partition_row_first(i); + auto row_last = graph_view.get_local_adj_matrix_partition_row_last(i); + + auto range_first = std::max(vertex_first, row_first); + auto range_last = std::min(vertex_last, row_last); + + if (range_last > range_first) { + matrix_partition_device_t matrix_partition(graph_view, i); + auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed + ? 0 + : matrix_partition.get_major_value_start_offset(); + + auto input_first = thrust::make_zip_iterator(thrust::make_tuple( + vertex_value_input_first + (range_first - vertex_first), + adj_matrix_row_value_input_first + row_value_input_offset + (range_first - row_first))); + auto v_op_wrapper = [v_op] __device__(auto v_and_row_val) { + return v_op(thrust::get<0>(v_and_row_val), thrust::get<1>(v_and_row_val)); + }; + ret += + thrust::transform_reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + input_first, + input_first + (range_last - range_first), + v_op_wrapper, + T{}, + thrust::plus()); + } + } + + if (GraphViewType::is_multi_gpu) { + ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + } + + return init + ret; +} + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh new file mode 100644 index 00000000000..a1d18e26d1c --- /dev/null +++ 
b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr update_frontier_v_push_if_out_nbr_for_all_block_size = 128; +int32_t constexpr update_frontier_v_push_if_out_nbr_update_block_size = 128; + +template +__global__ void for_all_frontier_row_for_all_nbr_low_degree( + matrix_partition_device_t matrix_partition, + RowIterator row_first, + RowIterator row_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t* buffer_idx_ptr, + EdgeOp e_op) +{ + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using weight_t = typename GraphViewType::weight_type; + + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto num_rows = static_cast(thrust::distance(row_first, 
row_last)); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = tid; + + while (idx < num_rows) { + vertex_t row = *(row_first + idx); + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + vertex_t const* indices{nullptr}; + weight_t const* weights{nullptr}; + edge_t local_out_degree{}; + thrust::tie(indices, weights, local_out_degree) = matrix_partition.get_local_edges(row_offset); + for (vertex_t i = 0; i < local_out_degree; ++i) { + auto col = indices[i]; + auto weight = weights != nullptr ? weights[i] : 1.0; + auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col); + auto e_op_result = evaluate_edge_op() + .compute(row, + col, + weight, + *(adj_matrix_row_value_input_first + row_offset), + *(adj_matrix_col_value_input_first + col_offset), + e_op); + if (thrust::get<0>(e_op_result) == true) { + // FIXME: This atomicAdd serializes execution. If we renumber vertices to ensure that rows + // within a partition are sorted by their out-degree in decreasing order, we can compute + // a tight upper bound for the maximum number of pushes per warp/block and use shared + // memory buffer to reduce the number of atomicAdd operations. 
+      static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + auto buffer_idx = atomicAdd(reinterpret_cast(buffer_idx_ptr), + static_cast(1)); + *(buffer_key_output_first + buffer_idx) = col_offset; + *(buffer_payload_output_first + buffer_idx) = + remove_first_thrust_tuple_element()(e_op_result); + } + } + + idx += gridDim.x * blockDim.x; + } +} + +template +size_t reduce_buffer_elements(raft::handle_t const& handle, + BufferKeyOutputIterator buffer_key_output_first, + BufferPayloadOutputIterator buffer_payload_output_first, + size_t num_buffer_elements, + ReduceOp reduce_op) +{ + thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + + if (std::is_same>::value) { + // FIXME: if ReduceOp is any, we may have a cheaper alternative than sort & unique (i.e. discard + // non-first elements) + auto it = thrust::unique_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first); + return static_cast(thrust::distance(buffer_key_output_first, thrust::get<0>(it))); + } else { + using key_t = typename std::iterator_traits::value_type; + using payload_t = typename std::iterator_traits::value_type; + // FIXME: better avoid temporary buffer or at least limit the maximum buffer size (if we adopt + // CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups and global sync(), we + // can use aggregate shared memory as a temporary buffer, or we can limit the buffer size, and + // split one thrust::reduce_by_key call to multiple thrust::reduce_by_key calls if the + // temporary buffer size exceeds the maximum buffer size (may be defined as percentage of the + // system HBM size or a function of the maximum number of threads in the system)) + // FIXME: actually, we can find how many unique keys are here by now. 
+ // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the + // vertex unless reduce_op is a pure function. + rmm::device_vector keys(num_buffer_elements); + rmm::device_vector values(num_buffer_elements); + auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + buffer_key_output_first, + buffer_key_output_first + num_buffer_elements, + buffer_payload_output_first, + keys.begin(), + values.begin(), + thrust::equal_to(), + reduce_op); + auto num_reduced_buffer_elements = + static_cast(thrust::distance(keys.begin(), thrust::get<0>(it))); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + keys.begin(), + keys.begin() + num_reduced_buffer_elements, + buffer_key_output_first); + thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + values.begin(), + values.begin() + num_reduced_buffer_elements, + buffer_payload_output_first); + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // this is necessary as kyes & values will become out-of-scope once + // this function returns + return num_reduced_buffer_elements; + } +} + +template +__global__ void update_frontier_and_vertex_output_values( + BufferKeyInputIterator buffer_key_input_first, + BufferPayloadInputIterator buffer_payload_input_first, + size_t num_buffer_elements, + VertexValueInputIterator vertex_value_input_first, + VertexValueOutputIterator vertex_value_output_first, + vertex_t** bucket_ptrs, + size_t* bucket_sizes_ptr, + size_t invalid_bucket_idx, + vertex_t invalid_vertex, + VertexOp v_op) +{ + static_assert(std::is_same::value_type, + vertex_t>::value); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = tid; + size_t block_idx = blockIdx.x; + // FIXME: it might be more performant to process more than one element per thread + auto num_blocks = (num_buffer_elements + blockDim.x - 1) / blockDim.x; + + using BlockScan = + cub::BlockScan; + __shared__ 
typename BlockScan::TempStorage temp_storage; + + __shared__ size_t bucket_block_start_offsets[num_buckets]; + + size_t bucket_block_local_offsets[num_buckets]; + size_t bucket_block_aggregate_sizes[num_buckets]; + + while (block_idx < num_blocks) { + for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } + + size_t selected_bucket_idx{invalid_bucket_idx}; + vertex_t key{invalid_vertex}; + + if (idx < num_buffer_elements) { + key = *(buffer_key_input_first + idx); + auto v_val = *(vertex_value_input_first + key); + auto payload = *(buffer_payload_input_first + idx); + auto v_op_result = v_op(v_val, payload); + selected_bucket_idx = thrust::get<0>(v_op_result); + if (selected_bucket_idx != invalid_bucket_idx) { + *(vertex_value_output_first + key) = + remove_first_thrust_tuple_element()(v_op_result); + bucket_block_local_offsets[selected_bucket_idx] = 1; + } + } + + for (size_t i = 0; i < num_buckets; ++i) { + BlockScan(temp_storage) + .ExclusiveSum(bucket_block_local_offsets[i], + bucket_block_local_offsets[i], + bucket_block_aggregate_sizes[i]); + } + + if (threadIdx.x == 0) { + for (size_t i = 0; i < num_buckets; ++i) { + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + bucket_block_start_offsets[i] = + atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), + static_cast(bucket_block_aggregate_sizes[i])); + } + } + + __syncthreads(); + + // FIXME: better use shared memory buffer to aggreaget global memory writes + if (selected_bucket_idx != invalid_bucket_idx) { + bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + + bucket_block_local_offsets[selected_bucket_idx]] = key; + } + + idx += gridDim.x * blockDim.x; + block_idx += gridDim.x; + } +} + +} // namespace detail + +/** + * @brief Update vertex frontier and vertex property values iterating over the outgoing edges. + * + * @tparam GraphViewType Type of the passed non-owning graph object. 
+ * @tparam VertexIterator Type of the iterator for vertex identifiers. + * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row + * input properties. + * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column + * input properties. + * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam VertexValueInputIterator Type of the iterator for vertex properties. + * @tparam VertexValueOutputIterator Type of the iterator for vertex property variables. + * @tparam VertexFrontierType Type of the vertex frontier class which abstracts vertex frontier + * managements. + * @tparam VertexOp Type of the binary vertex operator. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param vertex_first Iterator pointing to the first (inclusive) vertex in the current frontier. v + * in [vertex_first, vertex_last) should be distinct (and should belong to this process in + * multi-GPU), otherwise undefined behavior + * @param vertex_last Iterator pointing to the last (exclusive) vertex in the current frontier. + * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input + * properties for the first (inclusive) row (assigned to this process in multi-GPU). + * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + + * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input + * properties for the first (inclusive) column (assigned to this process in multi-GPU). 
+ * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first + * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + + * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, + * get_number_of_local_adj_matrix_partition_cols())) and returns a value to reduced by the @p + * reduce_op. + * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. + * @param vertex_value_input_first Iterator pointing to the vertex properties for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) + * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.get_number_of_local_vertices(). + * @param vertex_frontier vertex frontier class object for vertex frontier managements. This object + * includes multiple bucket objects. + * @param v_op Binary operator takes *(@p vertex_value_input_first + i) (where i is [0, @p + * graph_view.get_number_of_local_vertices())) and reduced value of the @p e_op outputs for + * this vertex and returns the target bucket index (for frontier update) and new verrtex property + * values (to update *(@p vertex_value_output_first + i)). 
+ */ +template +void update_frontier_v_push_if_out_nbr( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, + AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + EdgeOp e_op, + ReduceOp reduce_op, + VertexValueInputIterator vertex_value_input_first, + VertexValueOutputIterator vertex_value_output_first, + VertexFrontierType& vertex_frontier, + VertexOp v_op) +{ + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + + // 1. fill the buffer + + vertex_frontier.set_buffer_idx_value(0); + + auto loop_count = size_t{1}; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + loop_count = graph_view.is_hypergraph_partitioned() + ? graph_view.get_number_of_local_adj_matrix_partitions() + : static_cast(row_comm_size); + } + + for (size_t i = 0; i < loop_count; ++i) { + matrix_partition_device_t matrix_partition( + graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i); + + rmm::device_uvector frontier_rows( + 0, handle.get_stream()); // relevant only if GraphViewType::is_multi_gpu is true + + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + + auto sub_comm_rank = graph_view.is_hypergraph_partitioned() ? 
col_comm_rank : row_comm_rank; + auto frontier_size = (static_cast(sub_comm_rank) == i) + ? thrust::distance(vertex_first, vertex_last) + : size_t{0}; + if (graph_view.is_hypergraph_partitioned()) { + col_comm.bcast(&frontier_size, 1, i, handle.get_stream()); + } else { + row_comm.bcast(&frontier_size, 1, i, handle.get_stream()); + } + if (static_cast(sub_comm_rank) != i) { + frontier_rows.resize(frontier_size, handle.get_stream()); + } + device_bcast(graph_view.is_hypergraph_partitioned() ? col_comm : row_comm, + vertex_first, + frontier_rows.begin(), + frontier_rows.size(), + i, + handle.get_stream()); + } + + edge_t max_pushes = + frontier_rows.size() > 0 + ? thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + frontier_rows.begin(), + frontier_rows.end(), + [matrix_partition] __device__(auto row) { + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + return matrix_partition.get_local_degree(row_offset); + }, + edge_t{0}, + thrust::plus()) + : thrust::transform_reduce( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_first, + vertex_last, + [matrix_partition] __device__(auto row) { + auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row); + return matrix_partition.get_local_degree(row_offset); + }, + edge_t{0}, + thrust::plus()); + + // FIXME: This is highly pessimistic for single GPU (and multi-GPU as well if we maintain + // additional per column data for filtering in e_op). If we can pause & resume execution if + // buffer needs to be increased (and if we reserve address space to avoid expensive + // reallocation; + // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management/), we can + // start with a smaller buffer size (especially when the frontier size is large). + // for special cases when we can assure that there is no more than one push per destination + // (e.g. 
if cugraph::experimental::reduce_op::any is used), we can limit the buffer size to + // std::min(max_pushes, matrix_partition.get_minor_size()). + // For Volta+, we can limit the buffer size to std::min(max_pushes, + // matrix_partition.get_minor_size()) if the reduction operation is a pure function if we use + // locking. + // FIXME: if i != 0, this will require costly reallocation if we don't use the new CUDA feature + // to reserve address space. + vertex_frontier.resize_buffer(vertex_frontier.get_buffer_idx_value() + max_pushes); + auto buffer_first = vertex_frontier.buffer_begin(); + auto buffer_key_first = std::get<0>(buffer_first); + auto buffer_payload_first = std::get<1>(buffer_first); + + vertex_t row_value_input_offset = 0; + if (GraphViewType::is_multi_gpu) { + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + row_value_input_offset = + graph_view.is_hypergraph_partitioned() + ? matrix_partition.get_major_value_start_offset() + : graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size + i) - + graph_view.get_vertex_partition_first(col_comm_rank * row_comm_size); + } + + raft::grid_1d_thread_t for_all_low_degree_grid( + frontier_rows.size() > 0 ? frontier_rows.size() : thrust::distance(vertex_first, vertex_last), + detail::update_frontier_v_push_if_out_nbr_for_all_block_size, + handle.get_device_properties().maxGridSize[0]); + + // FIXME: This is highly inefficeint for graphs with high-degree vertices. If we renumber + // vertices to insure that rows within a partition are sorted by their out-degree in decreasing + // order, we will apply this kernel only to low out-degree vertices. 
+ if (frontier_rows.size() > 0) { + detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( + matrix_partition, + frontier_rows.begin(), + frontier_rows.begin(), + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + buffer_key_first, + buffer_payload_first, + vertex_frontier.get_buffer_idx_ptr(), + e_op); + } else { + detail::for_all_frontier_row_for_all_nbr_low_degree<<>>( + matrix_partition, + vertex_first, + vertex_last, + adj_matrix_row_value_input_first + row_value_input_offset, + adj_matrix_col_value_input_first, + buffer_key_first, + buffer_payload_first, + vertex_frontier.get_buffer_idx_ptr(), + e_op); + } + } + + // 2. reduce the buffer + + auto num_buffer_offset = edge_t{0}; + + auto buffer_first = vertex_frontier.buffer_begin(); + auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; + auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; + + auto num_buffer_elements = detail::reduce_buffer_elements(handle, + buffer_key_first, + buffer_payload_first, + vertex_frontier.get_buffer_idx_value(), + reduce_op); + + if (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + + std::vector h_vertex_lasts(graph_view.is_hypergraph_partitioned() ? row_comm_size + : col_comm_size); + for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { + h_vertex_lasts[i] = graph_view.get_vertex_partition_last( + graph_view.is_hypergraph_partitioned() ? 
col_comm_rank * row_comm_size + i + : row_comm_rank * col_comm_size + i); + } + rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); + raft::update_device( + d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); + rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), + handle.get_stream()); + thrust::upper_bound(d_vertex_lasts.begin(), + d_vertex_lasts.end(), + buffer_key_first, + buffer_key_first + num_buffer_elements, + d_tx_buffer_last_boundaries.begin()); + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + + std::vector rx_counts(graph_view.is_hypergraph_partitioned() ? row_comm_size + : col_comm_size); + std::vector count_requests(tx_counts.size() + rx_counts.size()); + for (size_t i = 0; i < tx_counts.size(); ++i) { + comm.isend(&tx_counts[i], + 1, + graph_view.is_hypergraph_partitioned() ? col_comm_rank * row_comm_size + i + : row_comm_rank * col_comm_size + i, + 0 /* tag */, + count_requests.data() + i); + } + for (size_t i = 0; i < rx_counts.size(); ++i) { + comm.irecv(&rx_counts[i], + 1, + graph_view.is_hypergraph_partitioned() ? 
col_comm_rank * row_comm_size + i + : row_comm_rank + i * row_comm_size, + 0 /* tag */, + count_requests.data() + tx_counts.size() + i); + } + comm.waitall(count_requests.size(), count_requests.data()); + + std::vector tx_offsets(tx_counts.size() + 1, edge_t{0}); + std::partial_sum(tx_counts.begin(), tx_counts.end(), tx_offsets.begin() + 1); + std::vector rx_offsets(rx_counts.size() + 1, edge_t{0}); + std::partial_sum(rx_counts.begin(), rx_counts.end(), rx_offsets.begin() + 1); + + // FIXME: this will require costly reallocation if we don't use the new CUDA feature to reserve + // address space. + vertex_frontier.resize_buffer(num_buffer_elements + rx_offsets.back()); + + auto buffer_first = vertex_frontier.buffer_begin(); + auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; + auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; + + auto constexpr tuple_size = thrust_tuple_size_or_one< + typename std::iterator_traits::value_type>::value; + + std::vector buffer_requests((tx_counts.size() + rx_counts.size()) * + (1 + tuple_size)); + for (size_t i = 0; i < tx_counts.size(); ++i) { + auto comm_dst_rank = graph_view.is_hypergraph_partitioned() + ? col_comm_rank * row_comm_size + i + : row_comm_rank * col_comm_size + i; + comm.isend(detail::iter_to_raw_ptr(buffer_key_first + tx_offsets[i]), + static_cast(tx_counts[i]), + comm_dst_rank, + int{0} /* tag */, + buffer_requests.data() + i * (1 + tuple_size)); + device_isend( + comm, + buffer_payload_first + tx_offsets[i], + static_cast(tx_counts[i]), + comm_dst_rank, + int{1} /* base tag */, + buffer_requests.data() + (i * (1 + tuple_size) + 1)); + } + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto comm_src_rank = graph_view.is_hypergraph_partitioned() + ? 
col_comm_rank * row_comm_size + i + : row_comm_rank + i * row_comm_size; + comm.irecv(detail::iter_to_raw_ptr(buffer_key_first + num_buffer_elements + rx_offsets[i]), + static_cast(rx_counts[i]), + comm_src_rank, + int{0} /* tag */, + buffer_requests.data() + ((tx_counts.size() + i) * (1 + tuple_size))); + device_irecv( + comm, + buffer_payload_first + num_buffer_elements + rx_offsets[i], + static_cast(rx_counts[i]), + comm_src_rank, + int{1} /* base tag */, + buffer_requests.data() + ((tx_counts.size() + i) * (1 + tuple_size) + 1)); + } + comm.waitall(buffer_requests.size(), buffer_requests.data()); + + // FIXME: this does not exploit the fact that each segment is sorted. Lost performance + // optimization opportunities. + // FIXME: we can use [vertex_frontier.buffer_begin(), vertex_frontier.buffer_begin() + + // num_buffer_elements) as temporary buffer inside reduce_buffer_elements(). + num_buffer_offset = num_buffer_elements; + num_buffer_elements = detail::reduce_buffer_elements(handle, + buffer_key_first + num_buffer_elements, + buffer_payload_first + num_buffer_elements, + rx_offsets.back(), + reduce_op); + } + + // 3. 
update vertex properties + + if (num_buffer_elements > 0) { + auto buffer_first = vertex_frontier.buffer_begin(); + auto buffer_key_first = std::get<0>(buffer_first) + num_buffer_offset; + auto buffer_payload_first = std::get<1>(buffer_first) + num_buffer_offset; + + raft::grid_1d_thread_t update_grid(num_buffer_elements, + detail::update_frontier_v_push_if_out_nbr_update_block_size, + handle.get_device_properties().maxGridSize[0]); + + auto constexpr invalid_vertex = invalid_vertex_id::value; + + auto bucket_and_bucket_size_device_ptrs = + vertex_frontier.get_bucket_and_bucket_size_device_pointers(); + detail::update_frontier_and_vertex_output_values + <<>>( + buffer_key_first, + buffer_payload_first, + num_buffer_elements, + vertex_value_input_first, + vertex_value_output_first, + std::get<0>(bucket_and_bucket_size_device_ptrs).get(), + std::get<1>(bucket_and_bucket_size_device_ptrs).get(), + VertexFrontierType::kInvalidBucketIdx, + invalid_vertex, + v_op); + + auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); + thrust::host_vector bucket_sizes( + bucket_sizes_device_ptr, bucket_sizes_device_ptr + VertexFrontierType::kNumBuckets); + for (size_t i = 0; i < VertexFrontierType::kNumBuckets; ++i) { + vertex_frontier.get_bucket(i).set_size(bucket_sizes[i]); + } + } +} + +/* + +FIXME: + +iterating over lower triangular (or upper triangular) : triangle counting +LRB might be necessary if the cost of processing an edge (i, j) is a function of degree(i) and +degree(j) : triangle counting +push-pull switching support (e.g. DOBFS), in this case, we need both +CSR & CSC (trade-off execution time vs memory requirement, unless graph is symmetric) +if graph is symmetric, there will be additional optimization opportunities (e.g. 
in-degree == +out-degree) For BFS, sending a bit vector (for the entire set of dest vertices per partitoin may +work better we can use thrust::set_intersection for triangle counting think about adding thrust +wrappers for reduction functions. Can I pass nullptr for dummy +instead of thrust::make_counting_iterator(0)? +*/ + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh new file mode 100644 index 00000000000..3b4b05ffb2f --- /dev/null +++ b/cpp/include/patterns/vertex_frontier.cuh @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +// FIXME: block size requires tuning +int32_t constexpr move_and_invalidate_if_block_size = 128; + +// FIXME: better move to another file for reusability +inline size_t round_up(size_t number_to_round, size_t modulus) +{ + return ((number_to_round + (modulus - 1)) / modulus) * modulus; +} + +template +auto make_buffer_zip_iterator_impl(std::vector& buffer_ptrs, + size_t offset, + std::index_sequence) +{ + auto key_ptr = reinterpret_cast(buffer_ptrs[0]) + offset; + auto payload_it = thrust::make_zip_iterator( + thrust::make_tuple(reinterpret_cast::type*>( + buffer_ptrs[1 + Is])...)); + return std::make_tuple(key_ptr, payload_it); +} + +template +auto make_buffer_zip_iterator(std::vector& buffer_ptrs, size_t offset) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return make_buffer_zip_iterator_impl( + buffer_ptrs, offset, std::make_index_sequence()); +} + +template +__global__ void move_and_invalidate_if(RowIterator row_first, + RowIterator row_last, + vertex_t** bucket_ptrs, + size_t* bucket_sizes_ptr, + size_t this_bucket_idx, + size_t invalid_bucket_idx, + vertex_t invalid_vertex, + SplitOp split_op) +{ + static_assert( + std::is_same::value_type, vertex_t>::value); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t idx = tid; + size_t block_idx = blockIdx.x; + auto num_elements = thrust::distance(row_first, row_last); + // FIXME: it might be more performant to process more than one element per thread + auto num_blocks = (num_elements + blockDim.x - 1) / blockDim.x; + + using BlockScan = cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + __shared__ size_t bucket_block_start_offsets[num_buckets]; + + size_t bucket_block_local_offsets[num_buckets]; + size_t 
bucket_block_aggregate_sizes[num_buckets]; + + while (block_idx < num_blocks) { + for (size_t i = 0; i < num_buckets; ++i) { bucket_block_local_offsets[i] = 0; } + + size_t selected_bucket_idx{invalid_bucket_idx}; + vertex_t key{invalid_vertex}; + + if (idx < num_elements) { + key = *(row_first + idx); + selected_bucket_idx = split_op(key); + if (selected_bucket_idx != this_bucket_idx) { + *(row_first + idx) = invalid_vertex; + if (selected_bucket_idx != invalid_bucket_idx) { + bucket_block_local_offsets[selected_bucket_idx] = 1; + } + } + } + + for (size_t i = 0; i < num_buckets; ++i) { + BlockScan(temp_storage) + .ExclusiveSum(bucket_block_local_offsets[i], + bucket_block_local_offsets[i], + bucket_block_aggregate_sizes[i]); + } + + if (threadIdx.x == 0) { + for (size_t i = 0; i < num_buckets; ++i) { + static_assert(sizeof(unsigned long long int) == sizeof(size_t)); + bucket_block_start_offsets[i] = + atomicAdd(reinterpret_cast(bucket_sizes_ptr + i), + static_cast(bucket_block_aggregate_sizes[i])); + } + } + + __syncthreads(); + + // FIXME: better use shared memory buffer to aggreaget global memory writes + if ((selected_bucket_idx != this_bucket_idx) && (selected_bucket_idx != invalid_bucket_idx)) { + bucket_ptrs[selected_bucket_idx][bucket_block_start_offsets[selected_bucket_idx] + + bucket_block_local_offsets[selected_bucket_idx]] = key; + } + + idx += gridDim.x * blockDim.x; + block_idx += gridDim.x; + } +} + +} // namespace detail + +template +class Bucket { + public: + Bucket(raft::handle_t const& handle, size_t capacity) + : handle_ptr_(&handle), elements_(capacity, invalid_vertex_id::value) + { + } + + void insert(vertex_t v) + { + elements_[size_] = v; + ++size_; + } + + size_t size() const { return size_; } + + void set_size(size_t size) { size_ = size; } + + template + std::enable_if_t aggregate_size() const + { + return host_scalar_allreduce(handle_ptr_->get_comms(), size_, handle_ptr_->get_stream()); + } + + template + std::enable_if_t 
aggregate_size() const + { + return size_; + } + + void clear() { size_ = 0; } + + size_t capacity() const { return elements_.size(); } + + auto const data() const { return elements_.data().get(); } + + auto data() { return elements_.data().get(); } + + auto const begin() const { return elements_.begin(); } + + auto begin() { return elements_.begin(); } + + auto const end() const { return elements_.begin() + size_; } + + auto end() { return elements_.begin() + size_; } + + private: + raft::handle_t const* handle_ptr_{nullptr}; + rmm::device_vector elements_{}; + size_t size_{0}; +}; + +template +class VertexFrontier { + public: + static size_t constexpr kNumBuckets = num_buckets; + static size_t constexpr kInvalidBucketIdx{std::numeric_limits::max()}; + + VertexFrontier(raft::handle_t const& handle, std::vector bucket_capacities) + : handle_ptr_(&handle), + tmp_bucket_ptrs_(num_buckets, nullptr), + tmp_bucket_sizes_(num_buckets, 0), + buffer_ptrs_(kReduceInputTupleSize + 1 /* to store destination column number */, nullptr), + buffer_idx_(0, handle_ptr_->get_stream()) + { + CUGRAPH_EXPECTS(bucket_capacities.size() == num_buckets, + "invalid input argument bucket_capacities (size mismatch)"); + for (size_t i = 0; i < num_buckets; ++i) { + buckets_.emplace_back(handle, bucket_capacities[i]); + } + buffer_.set_stream(handle_ptr_->get_stream()); + } + + Bucket& get_bucket(size_t bucket_idx) { return buckets_[bucket_idx]; } + + Bucket const& get_bucket(size_t bucket_idx) const + { + return buckets_[bucket_idx]; + } + + void swap_buckets(size_t bucket_idx0, size_t bucket_idx1) + { + std::swap(buckets_[bucket_idx0], buckets_[bucket_idx1]); + } + + template + void split_bucket(size_t bucket_idx, SplitOp split_op) + { + auto constexpr invalid_vertex = invalid_vertex_id::value; + + auto bucket_and_bucket_size_device_ptrs = get_bucket_and_bucket_size_device_pointers(); + + auto& this_bucket = get_bucket(bucket_idx); + raft::grid_1d_thread_t move_and_invalidate_if_grid( + 
this_bucket.size(), + detail::move_and_invalidate_if_block_size, + handle_ptr_->get_device_properties().maxGridSize[0]); + + detail::move_and_invalidate_if + <<get_stream()>>>(this_bucket.begin(), + this_bucket.end(), + std::get<0>(bucket_and_bucket_size_device_ptrs).get(), + std::get<1>(bucket_and_bucket_size_device_ptrs).get(), + bucket_idx, + kInvalidBucketIdx, + invalid_vertex, + split_op); + + // FIXME: if we adopt CUDA cooperative group https://devblogs.nvidia.com/cooperative-groups + // and global sync(), we can merge this step with the above kernel (and rename the above kernel + // to move_if) + auto it = + thrust::remove_if(rmm::exec_policy(handle_ptr_->get_stream())->on(handle_ptr_->get_stream()), + get_bucket(bucket_idx).begin(), + get_bucket(bucket_idx).end(), + [] __device__(auto value) { return value == invalid_vertex; }); + + auto bucket_sizes_device_ptr = std::get<1>(bucket_and_bucket_size_device_ptrs); + thrust::host_vector bucket_sizes(bucket_sizes_device_ptr, + bucket_sizes_device_ptr + kNumBuckets); + for (size_t i = 0; i < kNumBuckets; ++i) { + if (i != bucket_idx) { get_bucket(i).set_size(bucket_sizes[i]); } + } + + auto size = thrust::distance(get_bucket(bucket_idx).begin(), it); + get_bucket(bucket_idx).set_size(size); + + return; + } + + auto get_bucket_and_bucket_size_device_pointers() + { + thrust::host_vector tmp_ptrs(buckets_.size(), nullptr); + thrust::host_vector tmp_sizes(buckets_.size(), 0); + for (size_t i = 0; i < buckets_.size(); ++i) { + tmp_ptrs[i] = get_bucket(i).data(); + tmp_sizes[i] = get_bucket(i).size(); + } + tmp_bucket_ptrs_ = tmp_ptrs; + tmp_bucket_sizes_ = tmp_sizes; + return std::make_tuple(tmp_bucket_ptrs_.data(), tmp_bucket_sizes_.data()); + } + + void resize_buffer(size_t size) + { + // FIXME: rmm::device_buffer resize incurs copy if memory is reallocated, which is unnecessary + // in this case. 
+ buffer_.resize(compute_aggregate_buffer_size_in_bytes(size), handle_ptr_->get_stream()); + if (size > buffer_capacity_) { + buffer_capacity_ = size; + update_buffer_ptrs(); + } + buffer_size_ = size; + } + + void clear_buffer() { resize_buffer(0); } + + void shrink_to_fit_buffer() + { + if (buffer_size_ != buffer_capacity_) { + // FIXME: rmm::device_buffer shrink_to_fit incurs copy if memory is reallocated, which is + // unnecessary in this case. + buffer_.shrink_to_fit(handle_ptr_->get_stream()); + update_buffer_ptrs(); + buffer_capacity_ = buffer_size_; + } + } + + auto buffer_begin() + { + return detail::make_buffer_zip_iterator(buffer_ptrs_, 0); + } + + auto buffer_end() + { + return detail::make_buffer_zip_iterator(buffer_ptrs_, + buffer_size_); + } + + auto get_buffer_idx_ptr() { return buffer_idx_.data(); } + + size_t get_buffer_idx_value() { return buffer_idx_.value(handle_ptr_->get_stream()); } + + void set_buffer_idx_value(size_t value) + { + buffer_idx_.set_value(value, handle_ptr_->get_stream()); + } + + private: + static size_t constexpr kReduceInputTupleSize = thrust::tuple_size::value; + static size_t constexpr kBufferAlignment = 128; + + raft::handle_t const* handle_ptr_{nullptr}; + std::vector> buckets_{}; + rmm::device_vector tmp_bucket_ptrs_{}; + rmm::device_vector tmp_bucket_sizes_{}; + + std::array tuple_element_sizes_ = + compute_thrust_tuple_element_sizes()(); + std::vector buffer_ptrs_{}; + rmm::device_buffer buffer_{}; + size_t buffer_size_{0}; + size_t buffer_capacity_{0}; + rmm::device_scalar buffer_idx_{}; + + // FIXME: better pick between this apporach or the approach used in allocate_comm_buffer + size_t compute_aggregate_buffer_size_in_bytes(size_t size) + { + size_t aggregate_buffer_size_in_bytes = + detail::round_up(sizeof(vertex_t) * size, kBufferAlignment); + for (size_t i = 0; i < kReduceInputTupleSize; ++i) { + aggregate_buffer_size_in_bytes += + detail::round_up(tuple_element_sizes_[i] * size, kBufferAlignment); + } + return 
aggregate_buffer_size_in_bytes; + } + + void update_buffer_ptrs() + { + uintptr_t ptr = reinterpret_cast(buffer_.data()); + buffer_ptrs_[0] = reinterpret_cast(ptr); + ptr += detail::round_up(sizeof(vertex_t) * buffer_capacity_, kBufferAlignment); + for (size_t i = 0; i < kReduceInputTupleSize; ++i) { + buffer_ptrs_[1 + i] = reinterpret_cast(ptr); + ptr += detail::round_up(tuple_element_sizes_[i] * buffer_capacity_, kBufferAlignment); + } + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/utilities/comm_utils.cuh b/cpp/include/utilities/comm_utils.cuh new file mode 100644 index 00000000000..6cd6e62bc3a --- /dev/null +++ b/cpp/include/utilities/comm_utils.cuh @@ -0,0 +1,788 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +struct update_vector_of_tuple_scalar_elements_from_tuple_impl { + void update(std::vector& tuple_scalar_elements, TupleType const& tuple) const + { + using element_t = typename thrust::tuple_element::type; + static_assert(sizeof(element_t) <= sizeof(int64_t)); + auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); + *ptr = thrust::get(tuple); + update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + tuple_scalar_elements, tuple); + } +}; + +template +struct update_vector_of_tuple_scalar_elements_from_tuple_impl { + void update(std::vector& tuple_scalar_elements, TupleType const& tuple) const { return; } +}; + +template +struct update_tuple_from_vector_of_tuple_scalar_elements_impl { + void update(TupleType& tuple, std::vector const& tuple_scalar_elements) const + { + using element_t = typename thrust::tuple_element::type; + static_assert(sizeof(element_t) <= sizeof(int64_t)); + auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); + thrust::get(tuple) = *ptr; + update_tuple_from_vector_of_tuple_scalar_elements_impl().update( + tuple, tuple_scalar_elements); + } +}; + +template +struct update_tuple_from_vector_of_tuple_scalar_elements_impl { + void update(TupleType& tuple, std::vector const& tuple_scalar_elements) const { return; } +}; + +template +struct host_allreduce_tuple_scalar_element_impl { + void run(raft::comms::comms_t const& comm, + rmm::device_uvector& tuple_scalar_elements, + cudaStream_t stream) const + { + using element_t = typename thrust::tuple_element::type; + static_assert(sizeof(element_t) <= sizeof(int64_t)); + auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); + comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream); + host_allreduce_tuple_scalar_element_impl().run( + comm, tuple_scalar_elements, stream); + } 
+}; + +template +struct host_allreduce_tuple_scalar_element_impl { + void run(raft::comms::comms_t const& comm, + rmm::device_uvector& tuple_scalar_elements, + cudaStream_t stream) const + { + } +}; + +template +T* iter_to_raw_ptr(T* ptr) +{ + return ptr; +} + +template +T* iter_to_raw_ptr(thrust::device_ptr ptr) +{ + return thrust::raw_pointer_cast(ptr); +} + +template +auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter) +{ + return thrust::raw_pointer_cast(iter.base()); +} + +template +std::enable_if_t::value, void> +device_isend_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int tag, + raft::comms::request_t* request) +{ + // no-op +} + +template +std::enable_if_t::value, void> device_isend_impl( + raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int tag, + raft::comms::request_t* request) +{ + static_assert( + std::is_same::value_type, OutputValueType>::value); + comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); +} + +template +struct device_isend_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int base_tag, + raft::comms::request_t* requests) const + { + using output_value_t = typename thrust:: + tuple_element::value_type>::type; + auto tuple_element_input_first = thrust::get(input_first.get_iterator_tuple()); + device_isend_impl( + comm, tuple_element_input_first, count, dst, static_cast(base_tag + I), requests + I); + device_isend_tuple_iterator_element_impl().run( + comm, input_first, count, dst, base_tag, requests); + } +}; + +template +struct device_isend_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int base_tag, + raft::comms::request_t* requests) const + { + } +}; + +template +std::enable_if_t::value, void> +device_irecv_impl(raft::comms::comms_t const& comm, + 
OutputIterator output_first, + size_t count, + int src, + int tag, + raft::comms::request_t* request) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_irecv_impl(raft::comms::comms_t const& comm, + OutputIterator output_first, + size_t count, + int src, + int tag, + raft::comms::request_t* request) +{ + static_assert( + + std::is_same::value_type>::value); + comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); +} + +template +struct device_irecv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + OutputIterator output_first, + size_t count, + int src, + int base_tag, + raft::comms::request_t* requests) const + { + using input_value_t = typename thrust:: + tuple_element::value_type>::type; + auto tuple_element_output_first = thrust::get(output_first.get_iterator_tuple()); + device_irecv_impl( + comm, tuple_element_output_first, count, src, static_cast(base_tag + I), requests + I); + device_irecv_tuple_iterator_element_impl().run( + comm, output_first, count, src, base_tag, requests); + } +}; + +template +struct device_irecv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + OutputIterator output_first, + size_t count, + int src, + int base_tag, + raft::comms::request_t* requests) const + { + } +}; + +template +std::enable_if_t::value, void> +device_bcast_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_bcast_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + if (comm.get_rank() == root) { + comm.bcast(iter_to_raw_ptr(input_first), count, root, 
stream); + } else { + comm.bcast(iter_to_raw_ptr(output_first), count, root, stream); + } +} + +template +struct device_bcast_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) const + { + device_bcast_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + root, + stream); + device_bcast_tuple_iterator_element_impl( + comm, input_first, output_first, count, root, stream); + } +}; + +template +struct device_bcast_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) const + { + } +}; + +template +std::enable_if_t::value, void> +device_reduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_reduce_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, root, stream); +} + +template +struct device_reduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) const + { + device_reduce_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + count, + op, + root, + stream); + 
device_reduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, root, stream); + } +}; + +template +struct device_reduce_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) const + { + } +}; + +template +std::enable_if_t::value, void> +device_allgatherv_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + cudaStream_t stream) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgatherv_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + cudaStream_t stream) +{ + static_assert(std::is_same::value_type, + typename std::iterator_traits::value_type>::value); + comm.allgatherv(iter_to_raw_ptr(input_first), + iter_to_raw_ptr(output_first), + recvcounts.data(), + displacements.data(), + stream); +} + +template +struct device_allgatherv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + cudaStream_t stream) const + { + device_allgatherv_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + recvcounts, + displacements, + stream); + device_allgatherv_tuple_iterator_element_impl().run( + comm, input_first, output_first, recvcounts, displacements, stream); + } +}; + +template +struct device_allgatherv_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + 
cudaStream_t stream) const + { + } +}; + +template +auto allocate_comm_buffer_tuple_element_impl(size_t buffer_size, cudaStream_t stream) +{ + using element_t = typename thrust::tuple_element::type; + return rmm::device_uvector(buffer_size, stream); +} + +template +auto allocate_comm_buffer_tuple_impl(std::index_sequence, + size_t buffer_size, + cudaStream_t stream) +{ + return thrust::make_tuple( + allocate_comm_buffer_tuple_element_impl(buffer_size, stream)...); +} + +template +auto get_comm_buffer_begin_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return thrust::get(buffer).begin(); +} + +template +auto get_comm_buffer_begin_tuple_impl(std::index_sequence, BufferType& buffer) +{ + return thrust::make_tuple(get_comm_buffer_begin_tuple_element_impl(buffer)...); +} + +} // namespace detail + +template +std::enable_if_t::value, T> host_scalar_allreduce( + raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + rmm::device_uvector d_input(1, stream); + raft::update_device(d_input.data(), &input, 1, stream); + comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + T h_input{}; + raft::update_host(&h_input, d_input.data(), 1, stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_input; +} + +template +std::enable_if_t::value, T> +host_scalar_allreduce(raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_tuple_scalar_elements(tuple_size, stream); + T ret{}; + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + raft::update_device( + d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); + detail::host_allreduce_tuple_scalar_element_impl().run( + 
comm, d_tuple_scalar_elements, stream); + raft::update_host( + h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl().update( + ret, h_tuple_scalar_elements); + + return ret; +} + +template +std::enable_if_t::value, std::vector> host_scalar_allgather( + raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + std::vector rx_counts(comm.get_size(), size_t{1}); + std::vector displacements(rx_counts.size(), size_t{0}); + std::iota(displacements.begin(), displacements.end(), size_t{0}); + rmm::device_uvector d_outputs(rx_counts.size(), stream); + raft::update_device(d_outputs.data() + comm.get_rank(), &input, 1, stream); + comm.allgatherv(d_outputs.data() + comm.get_rank(), + d_outputs.data(), + rx_counts.data(), + displacements.data(), + stream); + std::vector h_outputs(rx_counts.size(), size_t{0}); + raft::update_host(h_outputs.data(), d_outputs.data(), rx_counts.size(), stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + return h_outputs; +} + +template +std::enable_if_t::value, std::vector> +host_scalar_allgather(raft::comms::comms_t const& comm, T input, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + std::vector rx_counts(comm.get_size(), tuple_size); + std::vector displacements(rx_counts.size(), size_t{0}); + for (size_t i = 0; i < displacements.size(); ++i) { displacements[i] = i * tuple_size; } + std::vector h_tuple_scalar_elements(tuple_size); + rmm::device_uvector d_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size, + stream); + + detail::update_vector_of_tuple_scalar_elements_from_tuple_impl().update( + h_tuple_scalar_elements, input); + 
raft::update_device(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, + h_tuple_scalar_elements.data(), + tuple_size, + stream); + comm.allgatherv(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size, + d_allgathered_tuple_scalar_elements.data(), + rx_counts.data(), + displacements.data(), + stream); + std::vector h_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size); + raft::update_host(h_allgathered_tuple_scalar_elements.data(), + d_allgathered_tuple_scalar_elements.data(), + comm.get_size() * tuple_size, + stream); + auto status = comm.sync_stream(stream); + CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure."); + + std::vector ret(comm.get_size()); + for (size_t i = 0; i < ret.size(); ++i) { + std::vector h_tuple_scalar_elements( + h_allgathered_tuple_scalar_elements.data() + i * tuple_size, + h_allgathered_tuple_scalar_elements.data() + (i + 1) * tuple_size); + detail::update_tuple_from_vector_of_tuple_scalar_elements_impl() + .update(ret[i], h_tuple_scalar_elements); + } + + return ret; +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_isend(raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int base_tag /* actual tag = base tag */, + raft::comms::request_t* requests) +{ + detail::device_isend_impl::value_type>( + comm, input_first, count, dst, base_tag, requests); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_isend(raft::comms::comms_t const& comm, + InputIterator input_first, + size_t count, + int dst, + int base_tag /* actual tag = base_tag + tuple index */, + raft::comms::request_t* requests) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail:: + 
device_isend_tuple_iterator_element_impl() + .run(comm, input_first, count, dst, base_tag, requests); +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_irecv(raft::comms::comms_t const& comm, + OutputIterator output_first, + size_t count, + int src, + int base_tag /* actual tag = base tag */, + raft::comms::request_t* requests) +{ + detail::device_irecv_impl::value_type, + OutputIterator>(comm, output_first, count, src, base_tag, requests); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_irecv(raft::comms::comms_t const& comm, + OutputIterator output_first, + size_t count, + int src, + int base_tag /* actual tag = base_tag + tuple index */, + raft::comms::request_t* requests) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail:: + device_irecv_tuple_iterator_element_impl() + .run(comm, output_first, count, src, base_tag, requests); +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_bcast(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) +{ + detail::device_bcast_impl(comm, input_first, output_first, count, root, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_bcast(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + int root, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail:: + device_bcast_tuple_iterator_element_impl( + comm, 
input_first, output_first, count, root, stream); +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_reduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) +{ + detail::device_reduce_impl(comm, input_first, output_first, count, op, root, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_reduce(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t count, + raft::comms::op_t op, + int root, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail:: + device_reduce_tuple_iterator_element_impl( + comm, input_first, output_first, count, op, root, stream); +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgatherv(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + cudaStream_t stream) +{ + detail::device_allgatherv_impl( + comm, input_first, output_first, recvcounts, displacements, stream); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgatherv(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + std::vector const& recvcounts, + std::vector const& displacements, + cudaStream_t stream) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + 
detail::device_allgatherv_tuple_iterator_element_impl() + .run(comm, input_first, output_first, recvcounts, displacements, stream); +} + +template ::value>* = nullptr> +auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream) +{ + return rmm::device_uvector(buffer_size, stream); +} + +template ::value>* = nullptr> +auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return detail::allocate_comm_buffer_tuple_impl( + std::make_index_sequence(), buffer_size, stream); +} + +template ::value>* = nullptr> +auto get_comm_buffer_begin(BufferType& buffer) +{ + return buffer.begin(); +} + +template ::value>* = nullptr> +auto get_comm_buffer_begin(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator( + detail::get_comm_buffer_begin_tuple_impl(std::make_index_sequence(), buffer)); +} + +} // namespace experimental +} // namespace cugraph \ No newline at end of file diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp new file mode 100644 index 00000000000..cf7428177d6 --- /dev/null +++ b/cpp/include/utilities/cython.hpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace cython { + +enum class numberTypeEnum : int { int32Type, int64Type, floatType, doubleType }; + +enum class graphTypeEnum : int { + // represents unintiialized or NULL ptr + null, + // represents some legacy Cxx type. This and other LegacyCxx values are not + // used for the unique_ptr in a graph_container_t, but instead for when this + // enum is used for determining high-level code paths to take to prevent + // needing to expose each legacy enum value to cython. + LegacyCSR, + LegacyCSC, + LegacyCOO, + // represents that a GraphCxxView* unique_ptr type is present in a + // graph_container_t. + GraphCSRViewFloat, + GraphCSRViewDouble, + GraphCSCViewFloat, + GraphCSCViewDouble, + GraphCOOViewFloat, + GraphCOOViewDouble, + // represents values present in the graph_container_t to construct a graph_t, + // but unlike legacy classes does not mean a graph_t unique_ptr is present in + // the container. + graph_t, +}; + +// "container" for a graph type instance which insulates the owner from the +// specifics of the actual graph type. This is intended to be used in Cython +// code that only needs to pass a graph object to another wrapped C++ API. This +// greatly simplifies the Cython code since the Cython definition only needs to +// define the container and not the various individual graph types in Cython. +struct graph_container_t { + // FIXME: This union is in place only to support legacy calls, remove when + // migration to graph_t types is complete, or when legacy graph objects are + // constructed in the call_< wrappers instead of the + // populate_graph_container_legacy() function. 
+ union graphPtrUnion { + ~graphPtrUnion() {} + + void* null; + std::unique_ptr> GraphCSRViewFloatPtr; + std::unique_ptr> GraphCSRViewDoublePtr; + std::unique_ptr> GraphCSCViewFloatPtr; + std::unique_ptr> GraphCSCViewDoublePtr; + std::unique_ptr> GraphCOOViewFloatPtr; + std::unique_ptr> GraphCOOViewDoublePtr; + }; + + graph_container_t() : graph_ptr_union{nullptr}, graph_type{graphTypeEnum::null} {} + ~graph_container_t() {} + + // The expected usage of a graph_container_t is for it to be created as part + // of a cython wrapper simply for passing a templated instantiation of a + // particular graph class from one call to another, and not to exist outside + // of the individual wrapper function (deleted when the instance goes out of + // scope once the wrapper function returns). Therefore, copys and assignments + // to an instance are not supported and these methods are deleted. + graph_container_t(const graph_container_t&) = delete; + graph_container_t& operator=(const graph_container_t&) = delete; + + graphPtrUnion graph_ptr_union; + graphTypeEnum graph_type; + + // primitive data used for constructing graph_t instances. + void* src_vertices; + void* dst_vertices; + void* weights; + void* vertex_partition_offsets; + + size_t num_partition_edges; + size_t num_global_vertices; + size_t num_global_edges; + numberTypeEnum vertexType; + numberTypeEnum edgeType; + numberTypeEnum weightType; + bool transposed; + bool is_multi_gpu; + bool sorted_by_degree; + bool do_expensive_check; + bool hypergraph_partitioned; + int row_comm_size; + int col_comm_size; + int row_comm_rank; + int col_comm_rank; + experimental::graph_properties_t graph_props; +}; + +// FIXME: finish description for vertex_partition_offsets +// +// Factory function for populating an empty graph container with a new graph +// object from basic types, and sets the corresponding meta-data. Args are: +// +// graph_container_t& graph_container +// Reference to the graph_container_t instance to +// populate. 
populate_graph_container() can only be called on an "empty" +// container (ie. a container that has not been previously populated by +// populate_graph_container()) +// +// graphTypeEnum legacyType +// Specifies the type of graph when instantiating a legacy graph type +// (GraphCSRViewFloat, etc.). +// NOTE: this parameter will be removed when the transition to exclusinve use +// of the new 2D graph classes is complete. +// +// raft::handle_t const& handle +// Raft handle to be set on the new graph instance in the container +// +// void* src_vertices, dst_vertices, weights +// Pointer to an array of values representing source and destination vertices, +// and edge weights respectively. The value types of the array are specified +// using numberTypeEnum values separately (see below). offsets should be size +// num_vertices+1, indices should be size num_edges, weights should also be +// size num_edges +// +// void* vertex_partition_offsets +// Pointer to an array of vertexType values representing offsets into the +// individual partitions for a multi-GPU paritioned graph. The offsets are used for ... +// +// numberTypeEnum vertexType, edgeType, weightType +// numberTypeEnum enum value describing the data type for the vertices, +// offsets, and weights arrays respectively. These enum values are used to +// instantiate the proper templated graph type and for casting the arrays +// accordingly. +// +// int num_vertices, num_edges +// The number of vertices and edges respectively in the graph represented by +// the above arrays. 
+// +// bool transposed +// true if the resulting graph object should store a transposed adjacency +// matrix +// +// bool multi_gpu +// true if the resulting graph object is to be used for a multi-gpu +// application +void populate_graph_container(graph_container_t& graph_container, + raft::handle_t& handle, + void* src_vertices, + void* dst_vertices, + void* weights, + void* vertex_partition_offsets, + numberTypeEnum vertexType, + numberTypeEnum edgeType, + numberTypeEnum weightType, + size_t num_partition_edges, + size_t num_global_vertices, + size_t num_global_edges, + size_t row_comm_size, // pcols + size_t col_comm_size, // prows + bool sorted_by_degree, + bool transposed, + bool multi_gpu); + +// FIXME: comment this function +// FIXME: Should local_* values be void* as well? +void populate_graph_container_legacy(graph_container_t& graph_container, + graphTypeEnum legacyType, + raft::handle_t const& handle, + void* offsets, + void* indices, + void* weights, + numberTypeEnum offsetType, + numberTypeEnum indexType, + numberTypeEnum weightType, + size_t num_global_vertices, + size_t num_global_edges, + int* local_vertices, + int* local_edges, + int* local_offsets); + +// Wrapper for calling Louvain using a graph container +template +std::pair call_louvain(raft::handle_t const& handle, + graph_container_t const& graph_container, + void* identifiers, + void* parts, + size_t max_level, + weight_t resolution); + +} // namespace cython +} // namespace cugraph diff --git a/cpp/include/utilities/thrust_tuple_utils.cuh b/cpp/include/utilities/thrust_tuple_utils.cuh new file mode 100644 index 00000000000..0ad71ba5e05 --- /dev/null +++ b/cpp/include/utilities/thrust_tuple_utils.cuh @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { + +namespace detail { + +template +struct is_thrust_tuple_of_arithemetic_impl { + constexpr bool evaluate() const + { + if (!std::is_arithmetic::type>::value) { + return false; + } else { + return is_thrust_tuple_of_arithemetic_impl().evaluate(); + } + } +}; + +template +struct is_thrust_tuple_of_arithemetic_impl { + constexpr bool evaluate() const { return true; } +}; + +template +struct compute_thrust_tuple_element_sizes_impl { + void compute(std::array::value>& arr) const + { + arr[I] = sizeof(typename thrust::tuple_element::type); + compute_thrust_tuple_element_sizes_impl().compute(arr); + } +}; + +template +struct compute_thrust_tuple_element_sizes_impl { + void compute(std::array::value>& arr) const {} +}; + +template +__device__ constexpr auto remove_first_thrust_tuple_element_impl(TupleType const& tuple, + std::index_sequence) +{ + return thrust::make_tuple(thrust::get<1 + Is>(tuple)...); +} + +template +struct plus_thrust_tuple_impl { + __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const + { + thrust::get(lhs) += thrust::get(rhs); + plus_thrust_tuple_impl().compute(lhs, rhs); + } +}; + +template +struct plus_thrust_tuple_impl { + __host__ __device__ constexpr void compute(TupleType& lhs, TupleType const& rhs) const {} +}; + +template +__device__ std::enable_if_t::value, void> atomic_accumulate_impl( + thrust::detail::any_assign& /* dereferencing thrust::discard_iterator 
results in this type */ lhs, + T const& rhs) +{ + // no-op +} + +template +__device__ std::enable_if_t::value, void> atomic_accumulate_impl(T& lhs, + T const& rhs) +{ + atomicAdd(&lhs, rhs); +} + +template +struct atomic_accumulate_thrust_tuple_impl { + __device__ constexpr void compute(Iterator iter, TupleType const& value) const + { + atomic_accumulate_impl(thrust::raw_reference_cast(thrust::get(*iter)), + thrust::get(value)); + atomic_accumulate_thrust_tuple_impl().compute(iter, value); + } +}; + +template +struct atomic_accumulate_thrust_tuple_impl { + __device__ constexpr void compute(Iterator iter, TupleType const& value) const {} +}; + +template +struct warp_reduce_thrust_tuple_impl { + __device__ void compute(TupleType& tuple) const + { + auto& val = thrust::get(tuple); + for (auto offset = raft::warp_size() / 2; offset > 0; offset /= 2) { + val += __shfl_down_sync(raft::warp_full_mask(), val, offset); + } + } +}; + +template +struct warp_reduce_thrust_tuple_impl { + __device__ void compute(TupleType& tuple) const {} +}; + +template +struct block_reduce_thrust_tuple_impl { + __device__ void compute(TupleType& tuple) const + { + using T = typename thrust::tuple_element::type; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + thrust::get(tuple) = BlockReduce(temp_storage).Sum(thrust::get(tuple)); + } +}; + +template +struct block_reduce_thrust_tuple_impl { + __device__ void compute(TupleType& tuple) const {} +}; + +} // namespace detail + +template +struct is_thrust_tuple : std::false_type { +}; + +template +struct is_thrust_tuple> : std::true_type { +}; + +template +struct is_thrust_tuple_of_arithmetic : std::false_type { +}; + +template +struct is_thrust_tuple_of_arithmetic::value>> { + static constexpr bool value = + detail::is_thrust_tuple_of_arithemetic_impl( + thrust::tuple_size::value)>() + .evaluate(); +}; + +template +struct is_arithmetic_or_thrust_tuple_of_arithmetic + : 
std::integral_constant::value> { +}; + +template +struct is_arithmetic_or_thrust_tuple_of_arithmetic> + : std::integral_constant>::value> { +}; + +template +struct thrust_tuple_size_or_one : std::integral_constant { +}; + +template +struct thrust_tuple_size_or_one> + : std::integral_constant>::value> { +}; + +template +struct compute_thrust_tuple_element_sizes { + auto operator()() const + { + size_t constexpr tuple_size = thrust::tuple_size::value; + std::array ret; + detail::compute_thrust_tuple_element_sizes_impl().compute( + ret); + return ret; + } +}; + +template +struct remove_first_thrust_tuple_element { + __device__ constexpr auto operator()(TupleType const& tuple) const + { + size_t constexpr tuple_size = thrust::tuple_size::value; + return detail::remove_first_thrust_tuple_element_impl( + tuple, std::make_index_sequence()); + } +}; + +template +struct plus_thrust_tuple { + __host__ __device__ constexpr TupleType operator()(TupleType const& lhs, + TupleType const& rhs) const + { + size_t constexpr tuple_size = thrust::tuple_size::value; + auto ret = lhs; + detail::plus_thrust_tuple_impl().compute(ret, rhs); + return ret; + } +}; + +template +struct atomic_accumulate_thrust_tuple { + __device__ constexpr void operator()(Iterator iter, TupleType const& value) const + { + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value); + size_t constexpr tuple_size = thrust::tuple_size::value; + detail::atomic_accumulate_thrust_tuple_impl() + .compute(iter, value); + } +}; + +template +struct warp_reduce_thrust_tuple { // only warp lane 0 has a valid result + __device__ TupleType operator()(TupleType const& tuple) const + { + size_t constexpr tuple_size = thrust::tuple_size::value; + auto ret = tuple; + detail::warp_reduce_thrust_tuple_impl().compute(ret); + return ret; + } +}; + +template +struct block_reduce_thrust_tuple { + __device__ TupleType operator()(TupleType const& tuple) const + { + size_t constexpr tuple_size = 
thrust::tuple_size::value; + auto ret = tuple; + detail::block_reduce_thrust_tuple_impl().compute( + ret); + return ret; + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/include/vertex_partition_device.cuh b/cpp/include/vertex_partition_device.cuh new file mode 100644 index 00000000000..a6a78ad3878 --- /dev/null +++ b/cpp/include/vertex_partition_device.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include + +namespace cugraph { +namespace experimental { + +template +class vertex_partition_device_base_t { + public: + vertex_partition_device_base_t(vertex_t number_of_vertices) + : number_of_vertices_(number_of_vertices) + { + } + + template + __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_type v) const noexcept + { + return ((v >= 0) && (v < number_of_vertices_)); + } + + template + __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( + vertex_type v) const noexcept + { + return (v < number_of_vertices_); + } + + private: + // should be trivially copyable to device + vertex_t number_of_vertices_{0}; +}; + +template +class vertex_partition_device_t; + +// multi-GPU version +template +class vertex_partition_device_t> + : public vertex_partition_device_base_t { + public: + vertex_partition_device_t(GraphViewType const& graph_view) + : vertex_partition_device_base_t( + graph_view.get_number_of_vertices()), + first_(graph_view.get_local_vertex_first()), + last_(graph_view.get_local_vertex_last()) + { + } + + __host__ __device__ bool is_local_vertex_nocheck(typename GraphViewType::vertex_type v) const + noexcept + { + return (v >= first_) && (v < last_); + } + + __host__ __device__ typename GraphViewType::vertex_type + get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept + { + return v - first_; + } + + private: + // should be trivially copyable to device + typename GraphViewType::vertex_type first_{0}; + typename GraphViewType::vertex_type last_{0}; +}; + +// single-GPU version +template +class vertex_partition_device_t> + : public vertex_partition_device_base_t { + public: + vertex_partition_device_t(GraphViewType const& graph_view) + : vertex_partition_device_base_t( + graph_view.get_number_of_vertices()) + { + } + + __host__ __device__ constexpr bool is_local_vertex_nocheck( + typename GraphViewType::vertex_type v) const noexcept 
+ { + return true; + } + + __host__ __device__ constexpr typename GraphViewType::vertex_type + get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept + { + return v; + } +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/community/ECG.cu b/cpp/src/community/ECG.cu index 47a80fa48d6..ce7e9dd1ad2 100644 --- a/cpp/src/community/ECG.cu +++ b/cpp/src/community/ECG.cu @@ -108,13 +108,14 @@ void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream) namespace cugraph { template -void ecg(GraphCSRView const &graph, +void ecg(raft::handle_t const &handle, + GraphCSRView const &graph, weight_t min_weight, vertex_t ensemble_size, - vertex_t *ecg_parts) + vertex_t *clustering) { CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph"); - CUGRAPH_EXPECTS(ecg_parts != nullptr, "Invalid API parameter: ecg_parts is NULL"); + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is NULL"); cudaStream_t stream{0}; @@ -142,10 +143,7 @@ void ecg(GraphCSRView const &graph, rmm::device_vector parts_v(size); vertex_t *d_parts = parts_v.data().get(); - weight_t final_modularity; - vertex_t num_level; - - cugraph::louvain(permuted_graph->view(), &final_modularity, &num_level, d_parts, 1); + cugraph::louvain(handle, permuted_graph->view(), d_parts, size_t{1}); // For each edge in the graph determine whether the endpoints are in the same partition // Keep a sum for each edge of the total number of times its endpoints are in the same partition @@ -178,18 +176,18 @@ void ecg(GraphCSRView const &graph, louvain_graph.number_of_vertices = graph.number_of_vertices; louvain_graph.number_of_edges = graph.number_of_edges; - weight_t final_modularity; - vertex_t num_level; - cugraph::louvain(louvain_graph, &final_modularity, &num_level, ecg_parts, 100); + cugraph::louvain(handle, louvain_graph, clustering, size_t{100}); } // Explicit template 
instantiations. -template void ecg(GraphCSRView const &graph, +template void ecg(raft::handle_t const &, + GraphCSRView const &graph, float min_weight, int32_t ensemble_size, - int32_t *ecg_parts); -template void ecg(GraphCSRView const &graph, + int32_t *clustering); +template void ecg(raft::handle_t const &, + GraphCSRView const &graph, double min_weight, int32_t ensemble_size, - int32_t *ecg_parts); + int32_t *clustering); } // namespace cugraph diff --git a/cpp/src/community/leiden.cpp b/cpp/src/community/leiden.cpp deleted file mode 100644 index 9e7a49db1f1..00000000000 --- a/cpp/src/community/leiden.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include - -#include - -#include - -#include "utilities/error.hpp" - -namespace cugraph { - -template -void leiden(GraphCSRView const &graph, - weight_t &final_modularity, - int &num_level, - vertex_t *leiden_parts, - int max_level, - weight_t resolution) -{ - CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, leiden expects a weighted graph"); - CUGRAPH_EXPECTS(leiden_parts != nullptr, "API error, leiden_parts is null"); - - detail::leiden( - graph, final_modularity, num_level, leiden_parts, max_level, resolution); -} - -template void leiden( - GraphCSRView const &, float &, int &, int32_t *, int, float); -template void leiden( - GraphCSRView const &, double &, int &, int32_t *, int, double); - -} // namespace cugraph diff --git a/cpp/src/community/leiden.cu b/cpp/src/community/leiden.cu new file mode 100644 index 00000000000..9e5a847cdf0 --- /dev/null +++ b/cpp/src/community/leiden.cu @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace cugraph { + +template +std::pair leiden(raft::handle_t const &handle, + GraphCSRView const &graph, + vertex_t *clustering, + size_t max_level, + weight_t resolution) +{ + CUGRAPH_EXPECTS(graph.edge_data != nullptr, + "Invalid input argument: leiden expects a weighted graph"); + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + + Leiden> runner(handle, graph); + + return runner(clustering, max_level, resolution); +} + +// Explicit template instantations +template std::pair leiden( + raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); + +template std::pair leiden(raft::handle_t const &, + GraphCSRView const &, + int32_t *, + size_t, + double); + +} // namespace cugraph diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh new file mode 100644 index 00000000000..f2f84433284 --- /dev/null +++ b/cpp/src/community/leiden.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cugraph { + +template +class Leiden : public Louvain { + public: + using graph_t = graph_type; + using vertex_t = typename graph_type::vertex_type; + using edge_t = typename graph_type::edge_type; + using weight_t = typename graph_type::weight_type; + + Leiden(raft::handle_t const &handle, graph_type const &graph) + : Louvain(handle, graph), constraint_v_(graph.number_of_vertices) + { + } + + weight_t update_clustering_constrained(weight_t total_edge_weight, + weight_t resolution, + graph_type const &graph) + { + this->timer_start("update_clustering_constrained"); + + rmm::device_vector next_cluster_v(this->cluster_v_); + rmm::device_vector delta_Q_v(graph.number_of_edges); + rmm::device_vector cluster_hash_v(graph.number_of_edges); + rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); + + vertex_t const *d_src_indices = this->src_indices_v_.data().get(); + vertex_t const *d_dst_indices = graph.indices; + vertex_t *d_cluster_hash = cluster_hash_v.data().get(); + vertex_t *d_cluster = this->cluster_v_.data().get(); + weight_t const *d_vertex_weights = this->vertex_weights_v_.data().get(); + weight_t *d_cluster_weights = this->cluster_weights_v_.data().get(); + weight_t *d_delta_Q = delta_Q_v.data().get(); + vertex_t *d_constraint = constraint_v_.data().get(); + + weight_t new_Q = + this->modularity(total_edge_weight, resolution, graph, this->cluster_v_.data().get()); + + weight_t cur_Q = new_Q - 1; + + // To avoid the potential of having two vertices swap clusters + // we will only allow vertices to move up (true) or down (false) + // during each iteration of the loop + bool up_down = true; + + while (new_Q > (cur_Q + 0.0001)) { + cur_Q = new_Q; + + this->compute_delta_modularity( + total_edge_weight, resolution, graph, cluster_hash_v, old_cluster_sum_v, delta_Q_v); + + // Filter out positive delta_Q values for nodes not in the same constraint group + thrust::for_each( + 
rmm::exec_policy(this->stream_)->on(this->stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) { + vertex_t start_cluster = d_constraint[d_src_indices[i]]; + vertex_t end_cluster = d_constraint[d_dst_indices[i]]; + if (start_cluster != end_cluster) d_delta_Q[i] = weight_t{0.0}; + }); + + this->assign_nodes(graph, cluster_hash_v, next_cluster_v, delta_Q_v, up_down); + + up_down = !up_down; + + new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + + if (new_Q > cur_Q) { + thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + next_cluster_v.begin(), + next_cluster_v.end(), + this->cluster_v_.begin()); + } + } + + this->timer_stop(this->stream_); + return cur_Q; + } + + std::pair operator()(vertex_t *d_cluster_vec, + size_t max_level, + weight_t resolution) + { + size_t num_level{0}; + + weight_t total_edge_weight = thrust::reduce(rmm::exec_policy(this->stream_)->on(this->stream_), + this->weights_v_.begin(), + this->weights_v_.end()); + + weight_t best_modularity = weight_t{-1}; + + // + // Initialize every cluster to reference each vertex to itself + // + thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), + this->cluster_v_.begin(), + this->cluster_v_.end()); + thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + this->cluster_v_.begin(), + this->cluster_v_.end(), + d_cluster_vec); + + // + // Our copy of the graph. Each iteration of the outer loop will + // shrink this copy of the graph. 
+ // + GraphCSRView current_graph(this->offsets_v_.data().get(), + this->indices_v_.data().get(), + this->weights_v_.data().get(), + this->number_of_vertices_, + this->number_of_edges_); + + current_graph.get_source_indices(this->src_indices_v_.data().get()); + + while (num_level < max_level) { + this->compute_vertex_and_cluster_weights(current_graph); + + weight_t new_Q = this->update_clustering(total_edge_weight, resolution, current_graph); + + thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_), + this->cluster_v_.begin(), + this->cluster_v_.end(), + constraint_v_.begin()); + + new_Q = update_clustering_constrained(total_edge_weight, resolution, current_graph); + + if (new_Q <= best_modularity) { break; } + + best_modularity = new_Q; + + this->shrink_graph(current_graph, d_cluster_vec); + + num_level++; + } + + this->timer_display(std::cout); + + return std::make_pair(num_level, best_modularity); + } + + private: + rmm::device_vector constraint_v_; +}; + +} // namespace cugraph diff --git a/cpp/src/community/leiden_kernels.cu b/cpp/src/community/leiden_kernels.cu deleted file mode 100644 index 5eb4219d1ac..00000000000 --- a/cpp/src/community/leiden_kernels.cu +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include - -#include -#include - -//#define TIMING - -#ifdef TIMING -#include -#endif - -#include - -namespace cugraph { -namespace detail { - -template -weight_t update_clustering_by_delta_modularity_constrained( - weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - rmm::device_vector const &src_indices, - rmm::device_vector const &vertex_weights, - rmm::device_vector &cluster_weights, - rmm::device_vector &cluster, - rmm::device_vector &constraint, - cudaStream_t stream) -{ - rmm::device_vector next_cluster(cluster); - rmm::device_vector delta_Q(graph.number_of_edges); - rmm::device_vector cluster_hash(graph.number_of_edges); - rmm::device_vector old_cluster_sum(graph.number_of_vertices); - - weight_t *d_delta_Q = delta_Q.data().get(); - vertex_t *d_constraint = constraint.data().get(); - vertex_t const *d_src_indices = src_indices.data().get(); - vertex_t const *d_dst_indices = graph.indices; - - weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster.data().get(), stream); - - weight_t cur_Q = new_Q - 1; - - // To avoid the potential of having two vertices swap clusters - // we will only allow vertices to move up (true) or down (false) - // during each iteration of the loop - bool up_down = true; - - while (new_Q > (cur_Q + 0.0001)) { - cur_Q = new_Q; - - compute_delta_modularity(total_edge_weight, - resolution, - graph, - src_indices, - vertex_weights, - cluster_weights, - cluster, - cluster_hash, - delta_Q, - old_cluster_sum, - stream); - - // Filter out positive delta_Q values for nodes not in the same constraint group - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_edges), - [d_src_indices, d_dst_indices, d_constraint, d_delta_Q] __device__(vertex_t i) { - vertex_t start_cluster = d_constraint[d_src_indices[i]]; - vertex_t end_cluster = d_constraint[d_dst_indices[i]]; - if (start_cluster != 
end_cluster) d_delta_Q[i] = weight_t{0.0}; - }); - - assign_nodes(graph, - delta_Q, - cluster_hash, - src_indices, - next_cluster, - vertex_weights, - cluster_weights, - up_down, - stream); - - up_down = !up_down; - - new_Q = modularity(total_edge_weight, resolution, graph, next_cluster.data().get(), stream); - - if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream)->on(stream), - next_cluster.begin(), - next_cluster.end(), - cluster.begin()); - } - } - - return cur_Q; -} - -template float update_clustering_by_delta_modularity_constrained( - float, - float, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template double update_clustering_by_delta_modularity_constrained( - double, - double, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template -void leiden(GraphCSRView const &graph, - weight_t &final_modularity, - int &num_level, - vertex_t *cluster_vec, - int max_level, - weight_t resolution, - cudaStream_t stream) -{ -#ifdef TIMING - HighResTimer hr_timer; -#endif - - num_level = 0; - - // - // Vectors to create a copy of the graph - // - rmm::device_vector offsets_v(graph.offsets, graph.offsets + graph.number_of_vertices + 1); - rmm::device_vector indices_v(graph.indices, graph.indices + graph.number_of_edges); - rmm::device_vector weights_v(graph.edge_data, graph.edge_data + graph.number_of_edges); - rmm::device_vector src_indices_v(graph.number_of_edges); - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v(graph.number_of_vertices); - rmm::device_vector cluster_weights_v(graph.number_of_vertices); - rmm::device_vector cluster_v(graph.number_of_vertices); - - // - // Temporaries used within kernels. 
Each iteration uses less - // of this memory - // - rmm::device_vector tmp_arr_v(graph.number_of_vertices); - rmm::device_vector cluster_inverse_v(graph.number_of_vertices); - - weight_t total_edge_weight = - thrust::reduce(rmm::exec_policy(stream)->on(stream), weights_v.begin(), weights_v.end()); - weight_t best_modularity = -1; - - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); - thrust::copy( - rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), cluster_vec); - - // - // Our copy of the graph. Each iteration of the outer loop will - // shrink this copy of the graph. - // - GraphCSRView current_graph(offsets_v.data().get(), - indices_v.data().get(), - weights_v.data().get(), - graph.number_of_vertices, - graph.number_of_edges); - - current_graph.get_source_indices(src_indices_v.data().get()); - - while (num_level < max_level) { - // - // Sum the weights of all edges departing a vertex. This is - // loop invariant, so we'll compute it here. - // - // Cluster weights are equivalent to vertex weights with this initial - // graph - // -#ifdef TIMING - hr_timer.start("init"); -#endif - - cugraph::detail::compute_vertex_sums(current_graph, vertex_weights_v, stream); - thrust::copy(rmm::exec_policy(stream)->on(stream), - vertex_weights_v.begin(), - vertex_weights_v.end(), - cluster_weights_v.begin()); - -#ifdef TIMING - hr_timer.stop(); - - hr_timer.start("update_clustering"); -#endif - - weight_t new_Q = update_clustering_by_delta_modularity(total_edge_weight, - resolution, - current_graph, - src_indices_v, - vertex_weights_v, - cluster_weights_v, - cluster_v, - stream); - - // After finding the initial unconstrained partition we use that partitioning as the constraint - // for the second round. 
- rmm::device_vector constraint(graph.number_of_vertices); - thrust::copy( - rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), constraint.begin()); - new_Q = update_clustering_by_delta_modularity_constrained(total_edge_weight, - resolution, - current_graph, - src_indices_v, - vertex_weights_v, - cluster_weights_v, - cluster_v, - constraint, - stream); - -#ifdef TIMING - hr_timer.stop(); -#endif - - if (new_Q <= best_modularity) { break; } - - best_modularity = new_Q; - -#ifdef TIMING - hr_timer.start("shrinking graph"); -#endif - - // renumber the clusters to the range 0..(num_clusters-1) - vertex_t num_clusters = renumber_clusters( - graph.number_of_vertices, cluster_v, tmp_arr_v, cluster_inverse_v, cluster_vec, stream); - cluster_weights_v.resize(num_clusters); - - // shrink our graph to represent the graph of supervertices - generate_superverticies_graph(current_graph, src_indices_v, num_clusters, cluster_v, stream); - - // assign each new vertex to its own cluster - thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); - -#ifdef TIMING - hr_timer.stop(); -#endif - - num_level++; - } - -#ifdef TIMING - hr_timer.display(std::cout); -#endif - - final_modularity = best_modularity; -} - -template void leiden(GraphCSRView const &, - float &, - int &, - int32_t *, - int, - float, - cudaStream_t); -template void leiden(GraphCSRView const &, - double &, - int &, - int32_t *, - int, - double, - cudaStream_t); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/community/leiden_kernels.hpp b/cpp/src/community/leiden_kernels.hpp deleted file mode 100644 index cbe93c04f52..00000000000 --- a/cpp/src/community/leiden_kernels.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cugraph { -namespace detail { - -template -void leiden(GraphCSRView const& graph, - weight_t& final_modularity, - int& num_level, - vertex_t* cluster_vec, - int max_level, - weight_t resolution, - cudaStream_t stream = 0); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/community/louvain.cpp b/cpp/src/community/louvain.cpp deleted file mode 100644 index 0e3f6ac51fd..00000000000 --- a/cpp/src/community/louvain.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include - -#include - -#include - -#include - -#include "utilities/error.hpp" - -namespace cugraph { - -template -void louvain(GraphCSRView const &graph, - weight_t *final_modularity, - int *num_level, - vertex_t *louvain_parts, - int max_level, - weight_t resolution) -{ - CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph"); - CUGRAPH_EXPECTS(final_modularity != nullptr, "API error, final_modularity is null"); - CUGRAPH_EXPECTS(num_level != nullptr, "API error, num_level is null"); - CUGRAPH_EXPECTS(louvain_parts != nullptr, "API error, louvain_parts is null"); - - detail::louvain( - graph, final_modularity, num_level, louvain_parts, max_level, resolution); -} - -template void louvain( - GraphCSRView const &, float *, int *, int32_t *, int, float); -template void louvain( - GraphCSRView const &, double *, int *, int32_t *, int, double); - -} // namespace cugraph diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu new file mode 100644 index 00000000000..559bb70d098 --- /dev/null +++ b/cpp/src/community/louvain.cu @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cugraph { + +namespace detail { + +template +std::pair louvain(raft::handle_t const &handle, + GraphCSRView const &graph_view, + vertex_t *clustering, + size_t max_level, + weight_t resolution) +{ + CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, + "Invalid input argument: louvain expects a weighted graph"); + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + + Louvain> runner(handle, graph_view); + return runner(clustering, max_level, resolution); +} + +template +std::pair louvain( + raft::handle_t const &handle, + experimental::graph_view_t const &graph_view, + vertex_t *clustering, + size_t max_level, + weight_t resolution) +{ + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + + experimental::Louvain> + runner(handle, graph_view); + return runner(clustering, max_level, resolution); +} + +} // namespace detail + +template +std::pair louvain(raft::handle_t const &handle, + graph_t const &graph, + typename graph_t::vertex_type *clustering, + size_t max_level, + typename graph_t::weight_type resolution) +{ + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); + + return detail::louvain(handle, graph, clustering, max_level, resolution); +} + +// Explicit template instantations +template std::pair louvain( + raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); +template std::pair louvain(raft::handle_t const &, + GraphCSRView const &, + int32_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + 
experimental::graph_view_t const &, + int32_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + double); + +// instantations with multi_gpu = true +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int32_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + double); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + float); +template std::pair louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + int64_t *, + size_t, + double); + +} // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh new file mode 100644 index 00000000000..8cec3eccfe6 --- /dev/null +++ b/cpp/src/community/louvain.cuh @@ -0,0 +1,638 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include +#include + +//#define TIMING + +#ifdef TIMING +#include +#endif + +namespace cugraph { + +template +class Louvain { + public: + using graph_t = graph_type; + using vertex_t = typename graph_type::vertex_type; + using edge_t = typename graph_type::edge_type; + using weight_t = typename graph_type::weight_type; + + Louvain(raft::handle_t const &handle, graph_type const &graph) + : +#ifdef TIMING + hr_timer_(), +#endif + handle_(handle), + + // FIXME: Don't really need to copy here but would need + // to change the logic to populate this properly + // in generate_superverticies_graph. 
+ // + offsets_v_(graph.offsets, graph.offsets + graph.number_of_vertices + 1), + indices_v_(graph.indices, graph.indices + graph.number_of_edges), + weights_v_(graph.edge_data, graph.edge_data + graph.number_of_edges), + src_indices_v_(graph.number_of_edges), + vertex_weights_v_(graph.number_of_vertices), + cluster_weights_v_(graph.number_of_vertices), + cluster_v_(graph.number_of_vertices), + tmp_arr_v_(graph.number_of_vertices), + cluster_inverse_v_(graph.number_of_vertices), + number_of_vertices_(graph.number_of_vertices), + number_of_edges_(graph.number_of_edges), + stream_(handle.get_stream()) + { + } + + weight_t modularity(weight_t total_edge_weight, + weight_t resolution, + graph_t const &graph, + vertex_t const *d_cluster) + { + vertex_t n_verts = graph.number_of_vertices; + + rmm::device_vector inc(n_verts, weight_t{0.0}); + rmm::device_vector deg(n_verts, weight_t{0.0}); + + edge_t const *d_offsets = graph.offsets; + vertex_t const *d_indices = graph.indices; + weight_t const *d_weights = graph.edge_data; + weight_t *d_inc = inc.data().get(); + weight_t *d_deg = deg.data().get(); + + // FIXME: Already have weighted degree computed in main loop, + // could pass that in rather than computing d_deg... 
which + // would save an atomicAdd (synchronization) + // + thrust::for_each( + rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_vertices), + [d_inc, d_deg, d_offsets, d_indices, d_weights, d_cluster] __device__(vertex_t v) { + vertex_t community = d_cluster[v]; + weight_t increase{0.0}; + weight_t degree{0.0}; + + for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { + vertex_t neighbor = d_indices[loc]; + degree += d_weights[loc]; + if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } + } + + if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); + if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); + }); + + weight_t Q = thrust::transform_reduce( + rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_vertices), + [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) { + return ((d_inc[community] / total_edge_weight) - resolution * + (d_deg[community] * d_deg[community]) / + (total_edge_weight * total_edge_weight)); + }, + weight_t{0.0}, + thrust::plus()); + + return Q; + } + + virtual std::pair operator()(vertex_t *d_cluster_vec, + size_t max_level, + weight_t resolution) + { + size_t num_level{0}; + + weight_t total_edge_weight = + thrust::reduce(rmm::exec_policy(stream_)->on(stream_), weights_v_.begin(), weights_v_.end()); + + weight_t best_modularity = weight_t{-1}; + + // + // Initialize every cluster to reference each vertex to itself + // + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); + thrust::copy( + rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec); + + // + // Our copy of the graph. Each iteration of the outer loop will + // shrink this copy of the graph. 
+ // + GraphCSRView current_graph(offsets_v_.data().get(), + indices_v_.data().get(), + weights_v_.data().get(), + number_of_vertices_, + number_of_edges_); + + current_graph.get_source_indices(src_indices_v_.data().get()); + + while (num_level < max_level) { + compute_vertex_and_cluster_weights(current_graph); + + weight_t new_Q = update_clustering(total_edge_weight, resolution, current_graph); + + if (new_Q <= best_modularity) { break; } + + best_modularity = new_Q; + + shrink_graph(current_graph, d_cluster_vec); + + num_level++; + } + + timer_display(std::cout); + + return std::make_pair(num_level, best_modularity); + } + + protected: + void timer_start(std::string const ®ion) + { +#ifdef TIMING + hr_timer_.start(region); +#endif + } + + void timer_stop(cudaStream_t stream) + { +#ifdef TIMING + CUDA_TRY(cudaStreamSynchronize(stream)); + hr_timer_.stop(); +#endif + } + + void timer_display(std::ostream &os) + { +#ifdef TIMING + hr_timer_.display(os); +#endif + } + + public: + void compute_vertex_and_cluster_weights(graph_type const &graph) + { + timer_start("compute_vertex_and_cluster_weights"); + + edge_t const *d_offsets = graph.offsets; + vertex_t const *d_indices = graph.indices; + weight_t const *d_weights = graph.edge_data; + weight_t *d_vertex_weights = vertex_weights_v_.data().get(); + weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + + // + // MNMG: copy_v_transform_reduce_out_nbr, then copy + // + thrust::for_each( + rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_vertices), + [d_offsets, d_indices, d_weights, d_vertex_weights, d_cluster_weights] __device__( + vertex_t src) { + weight_t sum = + thrust::reduce(thrust::seq, d_weights + d_offsets[src], d_weights + d_offsets[src + 1]); + + d_vertex_weights[src] = sum; + d_cluster_weights[src] = sum; + }); + + timer_stop(stream_); + } + + virtual weight_t update_clustering(weight_t total_edge_weight, + weight_t 
resolution, + graph_type const &graph) + { + timer_start("update_clustering"); + + // + // MNMG: This is the hard one, see writeup + // + rmm::device_vector next_cluster_v(cluster_v_); + rmm::device_vector delta_Q_v(graph.number_of_edges); + rmm::device_vector cluster_hash_v(graph.number_of_edges); + rmm::device_vector old_cluster_sum_v(graph.number_of_vertices); + + vertex_t *d_cluster_hash = cluster_hash_v.data().get(); + vertex_t *d_cluster = cluster_v_.data().get(); + weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); + weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + weight_t *d_delta_Q = delta_Q_v.data().get(); + + weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster_v_.data().get()); + + weight_t cur_Q = new_Q - 1; + + // To avoid the potential of having two vertices swap clusters + // we will only allow vertices to move up (true) or down (false) + // during each iteration of the loop + bool up_down = true; + + while (new_Q > (cur_Q + 0.0001)) { + cur_Q = new_Q; + + compute_delta_modularity( + total_edge_weight, resolution, graph, cluster_hash_v, old_cluster_sum_v, delta_Q_v); + + assign_nodes(graph, cluster_hash_v, next_cluster_v, delta_Q_v, up_down); + + up_down = !up_down; + + new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get()); + + if (new_Q > cur_Q) { + thrust::copy(rmm::exec_policy(stream_)->on(stream_), + next_cluster_v.begin(), + next_cluster_v.end(), + cluster_v_.begin()); + } + } + + timer_stop(stream_); + return cur_Q; + } + + void compute_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + graph_type const &graph, + rmm::device_vector &cluster_hash_v, + rmm::device_vector &old_cluster_sum_v, + rmm::device_vector &delta_Q_v) + { + vertex_t const *d_src_indices = src_indices_v_.data().get(); + vertex_t const *d_dst_indices = graph.indices; + edge_t const *d_offsets = graph.offsets; + weight_t const *d_weights = graph.edge_data; + vertex_t const 
*d_cluster = cluster_v_.data().get(); + weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); + weight_t const *d_cluster_weights = cluster_weights_v_.data().get(); + + vertex_t *d_cluster_hash = cluster_hash_v.data().get(); + weight_t *d_delta_Q = delta_Q_v.data().get(); + weight_t *d_old_cluster_sum = old_cluster_sum_v.data().get(); + weight_t *d_new_cluster_sum = d_delta_Q; + + thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); + thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); + thrust::fill(old_cluster_sum_v.begin(), old_cluster_sum_v.end(), weight_t{0.0}); + + // MNMG: New technique using reduce_by_key. Would require a segmented sort + // or a pair of sorts on each node, so probably slower than what's here. + // This might still be faster even in MNMG... + // + // + // FIXME: Eventually this should use cuCollections concurrent map + // implementation, but that won't be available for a while. + // + // For each source vertex, we're going to build a hash + // table to the destination cluster ids. We can use + // the offsets ranges to define the bounds of the hash + // table. 
+ // + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [d_src_indices, + d_dst_indices, + d_cluster, + d_offsets, + d_cluster_hash, + d_new_cluster_sum, + d_weights, + d_old_cluster_sum] __device__(edge_t loc) { + vertex_t src = d_src_indices[loc]; + vertex_t dst = d_dst_indices[loc]; + + if (src != dst) { + vertex_t old_cluster = d_cluster[src]; + vertex_t new_cluster = d_cluster[dst]; + edge_t hash_base = d_offsets[src]; + edge_t n_edges = d_offsets[src + 1] - hash_base; + + int h = (new_cluster % n_edges); + edge_t offset = hash_base + h; + while (d_cluster_hash[offset] != new_cluster) { + if (d_cluster_hash[offset] == -1) { + atomicCAS(d_cluster_hash + offset, -1, new_cluster); + } else { + h = (h + 1) % n_edges; + offset = hash_base + h; + } + } + + atomicAdd(d_new_cluster_sum + offset, d_weights[loc]); + + if (old_cluster == new_cluster) + atomicAdd(d_old_cluster_sum + src, d_weights[loc]); + } + }); + + thrust::for_each( + rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [total_edge_weight, + resolution, + d_cluster_hash, + d_src_indices, + d_cluster, + d_vertex_weights, + d_delta_Q, + d_new_cluster_sum, + d_old_cluster_sum, + d_cluster_weights] __device__(edge_t loc) { + vertex_t new_cluster = d_cluster_hash[loc]; + if (new_cluster >= 0) { + vertex_t src = d_src_indices[loc]; + vertex_t old_cluster = d_cluster[src]; + weight_t k_k = d_vertex_weights[src]; + weight_t a_old = d_cluster_weights[old_cluster]; + weight_t a_new = d_cluster_weights[new_cluster]; + + // NOTE: d_delta_Q and d_new_cluster_sum are aliases + // for same device array to save memory + d_delta_Q[loc] = + 2 * (((d_new_cluster_sum[loc] - d_old_cluster_sum[src]) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + } else { + 
d_delta_Q[loc] = weight_t{0.0}; + } + }); + } + + void assign_nodes(graph_type const &graph, + rmm::device_vector &cluster_hash_v, + rmm::device_vector &next_cluster_v, + rmm::device_vector &delta_Q_v, + bool up_down) + { + rmm::device_vector temp_vertices_v(graph.number_of_vertices); + rmm::device_vector temp_cluster_v(graph.number_of_vertices, vertex_t{-1}); + rmm::device_vector temp_delta_Q_v(graph.number_of_vertices, weight_t{0.0}); + + weight_t *d_delta_Q = delta_Q_v.data().get(); + vertex_t *d_next_cluster = next_cluster_v.data().get(); + vertex_t *d_cluster_hash = cluster_hash_v.data().get(); + weight_t const *d_vertex_weights = vertex_weights_v_.data().get(); + weight_t *d_cluster_weights = cluster_weights_v_.data().get(); + + auto cluster_reduce_iterator = + thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q)); + + auto output_edge_iterator2 = thrust::make_zip_iterator( + thrust::make_tuple(temp_cluster_v.data().get(), temp_delta_Q_v.data().get())); + + auto cluster_reduce_end = + thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + src_indices_v_.begin(), + src_indices_v_.end(), + cluster_reduce_iterator, + temp_vertices_v.data().get(), + output_edge_iterator2, + thrust::equal_to(), + [] __device__(auto pair1, auto pair2) { + if (thrust::get<1>(pair1) > thrust::get<1>(pair2)) + return pair1; + else + return pair2; + }); + + vertex_t final_size = thrust::distance(temp_vertices_v.data().get(), cluster_reduce_end.first); + + vertex_t *d_temp_vertices = temp_vertices_v.data().get(); + vertex_t *d_temp_clusters = temp_cluster_v.data().get(); + weight_t *d_temp_delta_Q = temp_delta_Q_v.data().get(); + + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(final_size), + [d_temp_delta_Q, + up_down, + d_next_cluster, + d_temp_vertices, + d_vertex_weights, + d_temp_clusters, + d_cluster_weights] __device__(vertex_t id) { + if ((d_temp_clusters[id] >= 0) && 
(d_temp_delta_Q[id] > weight_t{0.0})) { + vertex_t new_cluster = d_temp_clusters[id]; + vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; + + if ((new_cluster > old_cluster) == up_down) { + weight_t src_weight = d_vertex_weights[d_temp_vertices[id]]; + d_next_cluster[d_temp_vertices[id]] = d_temp_clusters[id]; + + atomicAdd(d_cluster_weights + new_cluster, src_weight); + atomicAdd(d_cluster_weights + old_cluster, -src_weight); + } + } + }); + } + + void shrink_graph(graph_t &graph, vertex_t *d_cluster_vec) + { + timer_start("shrinking graph"); + + // renumber the clusters to the range 0..(num_clusters-1) + vertex_t num_clusters = renumber_clusters(d_cluster_vec); + cluster_weights_v_.resize(num_clusters); + + // shrink our graph to represent the graph of supervertices + generate_superverticies_graph(graph, num_clusters); + + // assign each new vertex to its own cluster + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end()); + + timer_stop(stream_); + } + + vertex_t renumber_clusters(vertex_t *d_cluster_vec) + { + vertex_t *d_tmp_array = tmp_arr_v_.data().get(); + vertex_t *d_cluster_inverse = cluster_inverse_v_.data().get(); + vertex_t *d_cluster = cluster_v_.data().get(); + + vertex_t old_num_clusters = cluster_v_.size(); + + // + // New technique. 
Initialize cluster_inverse_v_ to 0 + // + thrust::fill(cluster_inverse_v_.begin(), cluster_inverse_v_.end(), vertex_t{0}); + + // + // Iterate over every element c in cluster_v_ and set cluster_inverse_v to 1 + // + auto first_1 = thrust::make_constant_iterator(1); + auto last_1 = first_1 + old_num_clusters; + + thrust::scatter(rmm::exec_policy(stream_)->on(stream_), + first_1, + last_1, + cluster_v_.begin(), + cluster_inverse_v_.begin()); + + // + // Now we'll copy all of the clusters that have a value of 1 into a temporary array + // + auto copy_end = thrust::copy_if( + rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(old_num_clusters), + tmp_arr_v_.begin(), + [d_cluster_inverse] __device__(const vertex_t idx) { return d_cluster_inverse[idx] == 1; }); + + vertex_t new_num_clusters = thrust::distance(tmp_arr_v_.begin(), copy_end); + tmp_arr_v_.resize(new_num_clusters); + + // + // Now we can set each value in cluster_inverse of a cluster to its index + // + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(new_num_clusters), + [d_cluster_inverse, d_tmp_array] __device__(const vertex_t idx) { + d_cluster_inverse[d_tmp_array[idx]] = idx; + }); + + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(old_num_clusters), + [d_cluster, d_cluster_inverse] __device__(vertex_t i) { + d_cluster[i] = d_cluster_inverse[d_cluster[i]]; + }); + + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(number_of_vertices_), + [d_cluster_vec, d_cluster] __device__(vertex_t i) { + d_cluster_vec[i] = d_cluster[d_cluster_vec[i]]; + }); + + cluster_inverse_v_.resize(new_num_clusters); + cluster_v_.resize(new_num_clusters); + + return new_num_clusters; + } + + void generate_superverticies_graph(graph_t 
&graph, vertex_t num_clusters) + { + rmm::device_vector new_src_v(graph.number_of_edges); + rmm::device_vector new_dst_v(graph.number_of_edges); + rmm::device_vector new_weight_v(graph.number_of_edges); + + vertex_t *d_old_src = src_indices_v_.data().get(); + vertex_t *d_old_dst = graph.indices; + weight_t *d_old_weight = graph.edge_data; + vertex_t *d_new_src = new_src_v.data().get(); + vertex_t *d_new_dst = new_dst_v.data().get(); + vertex_t *d_clusters = cluster_v_.data().get(); + weight_t *d_new_weight = new_weight_v.data().get(); + + // + // Renumber the COO + // + thrust::for_each(rmm::exec_policy(stream_)->on(stream_), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(graph.number_of_edges), + [d_old_src, + d_old_dst, + d_old_weight, + d_new_src, + d_new_dst, + d_new_weight, + d_clusters] __device__(edge_t e) { + d_new_src[e] = d_clusters[d_old_src[e]]; + d_new_dst[e] = d_clusters[d_old_dst[e]]; + d_new_weight[e] = d_old_weight[e]; + }); + + thrust::stable_sort_by_key( + rmm::exec_policy(stream_)->on(stream_), + d_new_dst, + d_new_dst + graph.number_of_edges, + thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_weight))); + thrust::stable_sort_by_key( + rmm::exec_policy(stream_)->on(stream_), + d_new_src, + d_new_src + graph.number_of_edges, + thrust::make_zip_iterator(thrust::make_tuple(d_new_dst, d_new_weight))); + + // + // Now we reduce by key to combine the weights of duplicate + // edges. 
+ // + auto start = thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_dst)); + auto new_start = thrust::make_zip_iterator(thrust::make_tuple(d_old_src, d_old_dst)); + auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_), + start, + start + graph.number_of_edges, + d_new_weight, + new_start, + d_old_weight, + thrust::equal_to>(), + thrust::plus()); + + graph.number_of_edges = thrust::distance(new_start, new_end.first); + graph.number_of_vertices = num_clusters; + + detail::fill_offset(d_old_src, graph.offsets, num_clusters, graph.number_of_edges, stream_); + CHECK_CUDA(stream_); + + src_indices_v_.resize(graph.number_of_edges); + } + + protected: + raft::handle_t const &handle_; + vertex_t number_of_vertices_; + edge_t number_of_edges_; + cudaStream_t stream_; + + // + // Copy of graph + // + rmm::device_vector offsets_v_; + rmm::device_vector indices_v_; + rmm::device_vector weights_v_; + rmm::device_vector src_indices_v_; + + // + // Weights and clustering across iterations of algorithm + // + rmm::device_vector vertex_weights_v_; + rmm::device_vector cluster_weights_v_; + rmm::device_vector cluster_v_; + + // + // Temporaries used within kernels. Each iteration uses less + // of this memory + // + rmm::device_vector tmp_arr_v_; + rmm::device_vector cluster_inverse_v_; + +#ifdef TIMING + HighResTimer hr_timer_; +#endif +}; + +} // namespace cugraph diff --git a/cpp/src/community/louvain_kernels.cu b/cpp/src/community/louvain_kernels.cu deleted file mode 100644 index c93e2d82fdf..00000000000 --- a/cpp/src/community/louvain_kernels.cu +++ /dev/null @@ -1,746 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include - -#include - -//#define TIMING - -#ifdef TIMING -#include -#endif - -#include - -namespace cugraph { -namespace detail { - -namespace { // anonym. -constexpr int BLOCK_SIZE_1D = 64; -} - -template -__global__ // - void - compute_vertex_sums(vertex_t n_vertex, - edge_t const *offsets, - weight_t const *weights, - weight_t *output) -{ - int src = blockDim.x * blockIdx.x + threadIdx.x; - - if ((src < n_vertex)) { - weight_t sum{0.0}; - - for (int i = offsets[src]; i < offsets[src + 1]; ++i) { sum += weights[i]; } - - output[src] = sum; - } -} - -template -weight_t modularity(weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - vertex_t const *d_cluster, - cudaStream_t stream) -{ - vertex_t n_verts = graph.number_of_vertices; - - rmm::device_vector inc(n_verts, weight_t{0.0}); - rmm::device_vector deg(n_verts, weight_t{0.0}); - - edge_t const *d_offsets = graph.offsets; - vertex_t const *d_indices = graph.indices; - weight_t const *d_weights = graph.edge_data; - weight_t *d_inc = inc.data().get(); - weight_t *d_deg = deg.data().get(); - - // FIXME: Already have weighted degree computed in main loop, - // could pass that in rather than computing d_deg... 
which - // would save an atomicAdd (synchronization) - // - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_vertices), - [d_inc, d_deg, d_offsets, d_indices, d_weights, d_cluster] __device__(vertex_t v) { - vertex_t community = d_cluster[v]; - weight_t increase{0.0}; - weight_t degree{0.0}; - - for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) { - vertex_t neighbor = d_indices[loc]; - degree += d_weights[loc]; - if (d_cluster[neighbor] == community) { increase += d_weights[loc]; } - } - - if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree); - if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase); - }); - - weight_t Q = thrust::transform_reduce( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_vertices), - [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) { - return ((d_inc[community] / total_edge_weight) - resolution * - (d_deg[community] * d_deg[community]) / - (total_edge_weight * total_edge_weight)); - }, - weight_t{0.0}, - thrust::plus()); - return Q; -} - -template float modularity( - float, float, GraphCSRView const &, int32_t const *, cudaStream_t); - -template double modularity( - double, double, GraphCSRView const &, int32_t const *, cudaStream_t); - -template -void generate_superverticies_graph(cugraph::GraphCSRView ¤t_graph, - rmm::device_vector &src_indices_v, - vertex_t new_number_of_vertices, - rmm::device_vector &cluster_v, - cudaStream_t stream) -{ - rmm::device_vector new_src_v(current_graph.number_of_edges); - rmm::device_vector new_dst_v(current_graph.number_of_edges); - rmm::device_vector new_weight_v(current_graph.number_of_edges); - - vertex_t *d_old_src = src_indices_v.data().get(); - vertex_t *d_old_dst = current_graph.indices; - weight_t *d_old_weight = current_graph.edge_data; - vertex_t *d_new_src = 
new_src_v.data().get(); - vertex_t *d_new_dst = new_dst_v.data().get(); - vertex_t *d_clusters = cluster_v.data().get(); - weight_t *d_new_weight = new_weight_v.data().get(); - - // - // Renumber the COO - // - thrust::for_each( - rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(current_graph.number_of_edges), - [d_old_src, d_old_dst, d_new_src, d_new_dst, d_clusters, d_new_weight, d_old_weight] __device__( - edge_t e) { - d_new_src[e] = d_clusters[d_old_src[e]]; - d_new_dst[e] = d_clusters[d_old_dst[e]]; - d_new_weight[e] = d_old_weight[e]; - }); - - thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), - d_new_dst, - d_new_dst + current_graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_weight))); - thrust::stable_sort_by_key( - rmm::exec_policy(stream)->on(stream), - d_new_src, - d_new_src + current_graph.number_of_edges, - thrust::make_zip_iterator(thrust::make_tuple(d_new_dst, d_new_weight))); - - // - // Now we reduce by key to combine the weights of duplicate - // edges. 
- // - auto start = thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_dst)); - auto new_start = thrust::make_zip_iterator(thrust::make_tuple(d_old_src, d_old_dst)); - auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - start, - start + current_graph.number_of_edges, - d_new_weight, - new_start, - d_old_weight, - thrust::equal_to>(), - thrust::plus()); - - current_graph.number_of_edges = thrust::distance(new_start, new_end.first); - current_graph.number_of_vertices = new_number_of_vertices; - - detail::fill_offset(d_old_src, - current_graph.offsets, - new_number_of_vertices, - current_graph.number_of_edges, - stream); - CHECK_CUDA(stream); - - src_indices_v.resize(current_graph.number_of_edges); -} - -template void generate_superverticies_graph(GraphCSRView &, - rmm::device_vector &, - int32_t, - rmm::device_vector &, - cudaStream_t); - -template void generate_superverticies_graph(GraphCSRView &, - rmm::device_vector &, - int32_t, - rmm::device_vector &, - cudaStream_t); - -template -void compute_vertex_sums(GraphCSRView const &graph, - rmm::device_vector &sums, - cudaStream_t stream) -{ - dim3 block_size_1d = - dim3((graph.number_of_vertices + BLOCK_SIZE_1D * 4 - 1) / BLOCK_SIZE_1D * 4, 1, 1); - dim3 grid_size_1d = dim3(BLOCK_SIZE_1D * 4, 1, 1); - - compute_vertex_sums<<>>( - graph.number_of_vertices, graph.offsets, graph.edge_data, sums.data().get()); -} - -template void compute_vertex_sums(GraphCSRView const &, - rmm::device_vector &, - cudaStream_t); - -template void compute_vertex_sums(GraphCSRView const &, - rmm::device_vector &, - cudaStream_t); - -template -vertex_t renumber_clusters(vertex_t graph_num_vertices, - rmm::device_vector &cluster, - rmm::device_vector &temp_array, - rmm::device_vector &cluster_inverse, - vertex_t *cluster_vec, - cudaStream_t stream) -{ - // - // Now we're going to renumber the clusters from 0 to (k-1), where k is the number of - // clusters in this level of the dendogram. 
- // - thrust::copy( - rmm::exec_policy(stream)->on(stream), cluster.begin(), cluster.end(), temp_array.begin()); - thrust::sort(rmm::exec_policy(stream)->on(stream), temp_array.begin(), temp_array.end()); - auto tmp_end = - thrust::unique(rmm::exec_policy(stream)->on(stream), temp_array.begin(), temp_array.end()); - - vertex_t old_num_clusters = cluster.size(); - vertex_t new_num_clusters = thrust::distance(temp_array.begin(), tmp_end); - - cluster.resize(new_num_clusters); - temp_array.resize(new_num_clusters); - - thrust::fill(cluster_inverse.begin(), cluster_inverse.end(), vertex_t{-1}); - - vertex_t *d_tmp_array = temp_array.data().get(); - vertex_t *d_cluster_inverse = cluster_inverse.data().get(); - vertex_t *d_cluster = cluster.data().get(); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(new_num_clusters), - [d_tmp_array, d_cluster_inverse] __device__(vertex_t i) { - d_cluster_inverse[d_tmp_array[i]] = i; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(old_num_clusters), - [d_cluster, d_cluster_inverse] __device__(vertex_t i) { - d_cluster[i] = d_cluster_inverse[d_cluster[i]]; - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph_num_vertices), - [cluster_vec, d_cluster] __device__(vertex_t i) { - cluster_vec[i] = d_cluster[cluster_vec[i]]; - }); - - return new_num_clusters; -} - -template int32_t renumber_clusters(int32_t, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector &, - int32_t *, - cudaStream_t); - -template -void compute_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - rmm::device_vector const &src_indices_v, - rmm::device_vector const &vertex_weights_v, - rmm::device_vector const &cluster_weights_v, - rmm::device_vector const 
&cluster_v, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &delta_Q_v, - rmm::device_vector &tmp_size_V_v, - cudaStream_t stream) -{ - vertex_t const *d_src_indices = src_indices_v.data().get(); - vertex_t const *d_dst_indices = graph.indices; - edge_t const *d_offsets = graph.offsets; - weight_t const *d_weights = graph.edge_data; - vertex_t const *d_cluster = cluster_v.data().get(); - weight_t const *d_vertex_weights = vertex_weights_v.data().get(); - weight_t const *d_cluster_weights = cluster_weights_v.data().get(); - - vertex_t *d_cluster_hash = cluster_hash_v.data().get(); - weight_t *d_delta_Q = delta_Q_v.data().get(); - weight_t *d_old_cluster_sum = tmp_size_V_v.data().get(); - weight_t *d_new_cluster_sum = d_delta_Q; - - thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1}); - thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0}); - thrust::fill(tmp_size_V_v.begin(), tmp_size_V_v.end(), weight_t{0.0}); - - // - // For each source vertex, we're going to build a hash - // table to the destination cluster ids. We can use - // the offsets ranges to define the bounds of the hash - // table. 
- // - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_edges), - [d_src_indices, - d_dst_indices, - d_cluster, - d_offsets, - d_cluster_hash, - d_new_cluster_sum, - d_weights, - d_old_cluster_sum] __device__(edge_t loc) { - vertex_t src = d_src_indices[loc]; - vertex_t dst = d_dst_indices[loc]; - - if (src != dst) { - vertex_t old_cluster = d_cluster[src]; - vertex_t new_cluster = d_cluster[dst]; - edge_t hash_base = d_offsets[src]; - edge_t n_edges = d_offsets[src + 1] - hash_base; - - int h = (new_cluster % n_edges); - edge_t offset = hash_base + h; - while (d_cluster_hash[offset] != new_cluster) { - if (d_cluster_hash[offset] == -1) { - atomicCAS(d_cluster_hash + offset, -1, new_cluster); - } else { - h = (h + 1) % n_edges; - offset = hash_base + h; - } - } - - atomicAdd(d_new_cluster_sum + offset, d_weights[loc]); - - if (old_cluster == new_cluster) - atomicAdd(d_old_cluster_sum + src, d_weights[loc]); - } - }); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(graph.number_of_edges), - [total_edge_weight, - resolution, - d_cluster_hash, - d_src_indices, - d_cluster, - d_vertex_weights, - d_delta_Q, - d_new_cluster_sum, - d_old_cluster_sum, - d_cluster_weights] __device__(edge_t loc) { - vertex_t new_cluster = d_cluster_hash[loc]; - if (new_cluster >= 0) { - vertex_t src = d_src_indices[loc]; - vertex_t old_cluster = d_cluster[src]; - weight_t k_k = d_vertex_weights[src]; - weight_t a_old = d_cluster_weights[old_cluster]; - weight_t a_new = d_cluster_weights[new_cluster]; - - // NOTE: d_delta_Q and d_new_cluster_sum are aliases - // for same device array to save memory - d_delta_Q[loc] = - 2 * - (((d_new_cluster_sum[loc] - d_old_cluster_sum[src]) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - } else { - 
d_delta_Q[loc] = weight_t{0.0}; - } - }); -} - -template void compute_delta_modularity(float, - float, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template void compute_delta_modularity(double, - double, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template -void assign_nodes(GraphCSRView const &graph, - rmm::device_vector &delta_Q, - rmm::device_vector &cluster_hash, - rmm::device_vector const &src_indices, - rmm::device_vector &next_cluster, - rmm::device_vector const &vertex_weights, - rmm::device_vector &cluster_weights, - bool up_down, - cudaStream_t stream) -{ - rmm::device_vector temp_vertices(graph.number_of_vertices); - rmm::device_vector temp_cluster(graph.number_of_vertices, vertex_t{-1}); - rmm::device_vector temp_delta_Q(graph.number_of_vertices, weight_t{0.0}); - - weight_t *d_delta_Q = delta_Q.data().get(); - vertex_t *d_next_cluster = next_cluster.data().get(); - vertex_t *d_cluster_hash = cluster_hash.data().get(); - weight_t const *d_vertex_weights = vertex_weights.data().get(); - weight_t *d_cluster_weights = cluster_weights.data().get(); - - auto cluster_reduce_iterator = - thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q)); - - auto output_edge_iterator2 = thrust::make_zip_iterator( - thrust::make_tuple(temp_cluster.data().get(), temp_delta_Q.data().get())); - - auto cluster_reduce_end = - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), - src_indices.begin(), - src_indices.end(), - cluster_reduce_iterator, - temp_vertices.data().get(), - output_edge_iterator2, - thrust::equal_to(), - [] __device__(auto pair1, auto pair2) { - if 
(thrust::get<1>(pair1) > thrust::get<1>(pair2)) - return pair1; - else - return pair2; - }); - - vertex_t final_size = thrust::distance(temp_vertices.data().get(), cluster_reduce_end.first); - - vertex_t *d_temp_vertices = temp_vertices.data().get(); - vertex_t *d_temp_clusters = temp_cluster.data().get(); - weight_t *d_temp_delta_Q = temp_delta_Q.data().get(); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_size), - [d_temp_delta_Q, - up_down, - d_next_cluster, - d_temp_vertices, - d_vertex_weights, - d_temp_clusters, - d_cluster_weights] __device__(vertex_t id) { - if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) { - vertex_t new_cluster = d_temp_clusters[id]; - vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]]; - - if ((new_cluster > old_cluster) == up_down) { - weight_t src_weight = d_vertex_weights[d_temp_vertices[id]]; - d_next_cluster[d_temp_vertices[id]] = d_temp_clusters[id]; - - atomicAdd(d_cluster_weights + new_cluster, src_weight); - atomicAdd(d_cluster_weights + old_cluster, -src_weight); - } - } - }); -} - -template void assign_nodes(GraphCSRView const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector const &, - rmm::device_vector &, - bool, - cudaStream_t); - -template void assign_nodes(GraphCSRView const &, - rmm::device_vector &, - rmm::device_vector &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector const &, - rmm::device_vector &, - bool, - cudaStream_t); - -template -weight_t update_clustering_by_delta_modularity( - weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - rmm::device_vector const &src_indices, - rmm::device_vector const &vertex_weights, - rmm::device_vector &cluster_weights, - rmm::device_vector &cluster, - cudaStream_t stream) -{ - rmm::device_vector next_cluster(cluster); - 
rmm::device_vector delta_Q(graph.number_of_edges); - rmm::device_vector cluster_hash(graph.number_of_edges); - rmm::device_vector old_cluster_sum(graph.number_of_vertices); - - vertex_t *d_cluster_hash = cluster_hash.data().get(); - vertex_t *d_cluster = cluster.data().get(); - weight_t const *d_vertex_weights = vertex_weights.data().get(); - weight_t *d_cluster_weights = cluster_weights.data().get(); - weight_t *d_delta_Q = delta_Q.data().get(); - - weight_t new_Q = modularity( - total_edge_weight, resolution, graph, cluster.data().get(), stream); - - weight_t cur_Q = new_Q - 1; - - // To avoid the potential of having two vertices swap clusters - // we will only allow vertices to move up (true) or down (false) - // during each iteration of the loop - bool up_down = true; - - while (new_Q > (cur_Q + 0.0001)) { - cur_Q = new_Q; - - compute_delta_modularity(total_edge_weight, - resolution, - graph, - src_indices, - vertex_weights, - cluster_weights, - cluster, - cluster_hash, - delta_Q, - old_cluster_sum, - stream); - - assign_nodes(graph, - delta_Q, - cluster_hash, - src_indices, - next_cluster, - vertex_weights, - cluster_weights, - up_down, - stream); - - up_down = !up_down; - - new_Q = modularity( - total_edge_weight, resolution, graph, next_cluster.data().get(), stream); - - if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream)->on(stream), - next_cluster.begin(), - next_cluster.end(), - cluster.begin()); - } - } - - return cur_Q; -} - -template float update_clustering_by_delta_modularity(float, - float, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template double update_clustering_by_delta_modularity( - double, - double, - GraphCSRView const &, - rmm::device_vector const &, - rmm::device_vector const &, - rmm::device_vector &, - rmm::device_vector &, - cudaStream_t); - -template -void louvain(GraphCSRView const &graph, - weight_t *final_modularity, 
- int *num_level, - vertex_t *cluster_vec, - int max_level, - weight_t resolution, - cudaStream_t stream) -{ -#ifdef TIMING - HighResTimer hr_timer; -#endif - - *num_level = 0; - - // - // Vectors to create a copy of the graph - // - rmm::device_vector offsets_v(graph.offsets, graph.offsets + graph.number_of_vertices + 1); - rmm::device_vector indices_v(graph.indices, graph.indices + graph.number_of_edges); - rmm::device_vector weights_v(graph.edge_data, graph.edge_data + graph.number_of_edges); - rmm::device_vector src_indices_v(graph.number_of_edges); - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v(graph.number_of_vertices); - rmm::device_vector cluster_weights_v(graph.number_of_vertices); - rmm::device_vector cluster_v(graph.number_of_vertices); - - // - // Temporaries used within kernels. Each iteration uses less - // of this memory - // - rmm::device_vector tmp_arr_v(graph.number_of_vertices); - rmm::device_vector cluster_inverse_v(graph.number_of_vertices); - - weight_t total_edge_weight = - thrust::reduce(rmm::exec_policy(stream)->on(stream), weights_v.begin(), weights_v.end()); - weight_t best_modularity = -1; - - // - // Initialize every cluster to reference each vertex to itself - // - thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); - thrust::copy( - rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end(), cluster_vec); - - // - // Our copy of the graph. Each iteration of the outer loop will - // shrink this copy of the graph. - // - GraphCSRView current_graph(offsets_v.data().get(), - indices_v.data().get(), - weights_v.data().get(), - graph.number_of_vertices, - graph.number_of_edges); - - current_graph.get_source_indices(src_indices_v.data().get()); - - while (*num_level < max_level) { - // - // Sum the weights of all edges departing a vertex. This is - // loop invariant, so we'll compute it here. 
- // - // Cluster weights are equivalent to vertex weights with this initial - // graph - // -#ifdef TIMING - hr_timer.start("init"); -#endif - - cugraph::detail::compute_vertex_sums(current_graph, vertex_weights_v, stream); - thrust::copy(rmm::exec_policy(stream)->on(stream), - vertex_weights_v.begin(), - vertex_weights_v.end(), - cluster_weights_v.begin()); - -#ifdef TIMING - hr_timer.stop(); - - hr_timer.start("update_clustering"); -#endif - - weight_t new_Q = update_clustering_by_delta_modularity(total_edge_weight, - resolution, - current_graph, - src_indices_v, - vertex_weights_v, - cluster_weights_v, - cluster_v, - stream); - -#ifdef TIMING - hr_timer.stop(); -#endif - - if (new_Q <= best_modularity) { break; } - - best_modularity = new_Q; - -#ifdef TIMING - hr_timer.start("shrinking graph"); -#endif - - // renumber the clusters to the range 0..(num_clusters-1) - vertex_t num_clusters = renumber_clusters( - graph.number_of_vertices, cluster_v, tmp_arr_v, cluster_inverse_v, cluster_vec, stream); - cluster_weights_v.resize(num_clusters); - - // shrink our graph to represent the graph of supervertices - generate_superverticies_graph(current_graph, src_indices_v, num_clusters, cluster_v, stream); - - // assign each new vertex to its own cluster - thrust::sequence(rmm::exec_policy(stream)->on(stream), cluster_v.begin(), cluster_v.end()); - -#ifdef TIMING - hr_timer.stop(); -#endif - - (*num_level)++; - } - -#ifdef TIMING - hr_timer.display(std::cout); -#endif - - *final_modularity = best_modularity; -} - -template void louvain(GraphCSRView const &, - float *, - int *, - int32_t *, - int, - float, - cudaStream_t); -template void louvain(GraphCSRView const &, - double *, - int *, - int32_t *, - int, - double, - cudaStream_t); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/community/louvain_kernels.hpp b/cpp/src/community/louvain_kernels.hpp deleted file mode 100644 index eabd562315a..00000000000 --- a/cpp/src/community/louvain_kernels.hpp +++ 
/dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cugraph { -namespace detail { - -template -weight_t modularity(weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - vertex_t const *d_cluster, - cudaStream_t stream = 0); - -template -void generate_superverticies_graph(cugraph::GraphCSRView ¤t_graph, - rmm::device_vector &src_indices_v, - vertex_t new_number_of_vertices, - rmm::device_vector &cluster_v, - cudaStream_t stream); - -template -void compute_vertex_sums(GraphCSRView const &graph, - rmm::device_vector &sums, - cudaStream_t stream); - -template -vertex_t renumber_clusters(vertex_t graph_num_vertices, - rmm::device_vector &cluster, - rmm::device_vector &temp_array, - rmm::device_vector &cluster_inverse, - vertex_t *cluster_vec, - cudaStream_t stream); - -template -void compute_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - rmm::device_vector const &src_indices_v, - rmm::device_vector const &vertex_weights_v, - rmm::device_vector const &cluster_weights_v, - rmm::device_vector const &cluster_v, - rmm::device_vector &cluster_hash_v, - rmm::device_vector &delta_Q_v, - rmm::device_vector &tmp_size_V_v, - cudaStream_t stream = 0); - -template -void assign_nodes(GraphCSRView const &graph, - rmm::device_vector &delta_Q, - rmm::device_vector 
&cluster_hash, - rmm::device_vector const &src_indices, - rmm::device_vector &next_cluster, - rmm::device_vector const &vertex_weights, - rmm::device_vector &cluster_weights, - bool up_down, - cudaStream_t stream); - -template -weight_t update_clustering_by_delta_modularity( - weight_t total_edge_weight, - weight_t resolution, - GraphCSRView const &graph, - rmm::device_vector const &src_indices, - rmm::device_vector const &vertex_weights, - rmm::device_vector &cluster_weights, - rmm::device_vector &cluster, - cudaStream_t stream); - -template -void louvain(GraphCSRView const &graph, - weight_t *final_modularity, - int *num_level, - vertex_t *cluster_vec, - int max_level, - weight_t resolution, - cudaStream_t stream = 0); - -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu new file mode 100644 index 00000000000..d9d7cb1a245 --- /dev/null +++ b/cpp/src/experimental/bfs.cu @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +void bfs(raft::handle_t &handle, + GraphViewType const &push_graph_view, + typename GraphViewType::vertex_type *distances, + PredecessorIterator predecessor_first, + typename GraphViewType::vertex_type source_vertex, + bool direction_optimizing, + typename GraphViewType::vertex_type depth_limit, + bool do_expensive_check) +{ + using vertex_t = typename GraphViewType::vertex_type; + + static_assert(std::is_integral::value, + "GraphViewType::vertex_type should be integral."); + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const num_vertices = push_graph_view.get_number_of_vertices(); + if (num_vertices == 0) { return; } + + // 1. check input arguments + + CUGRAPH_EXPECTS( + push_graph_view.is_symmetric() || !direction_optimizing, + "Invalid input argument: input graph should be symmetric for direction optimizing BFS."); + CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), + "Invalid input argument: source vertex out-of-range."); + + if (do_expensive_check) { + // nothing to do + } + + // 2. 
initialize distances and predecessors + + auto constexpr invalid_distance = std::numeric_limits::max(); + auto constexpr invalid_vertex = invalid_vertex_id::value; + + auto val_first = thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(push_graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(push_graph_view.get_local_vertex_last()), + val_first, + [source_vertex] __device__(auto val) { + auto distance = invalid_distance; + if (val == source_vertex) { distance = vertex_t{0}; } + return thrust::make_tuple(distance, invalid_vertex); + }); + + // 3. initialize BFS frontier + + enum class Bucket { cur, num_buckets }; + std::vector bucket_sizes(static_cast(Bucket::num_buckets), + push_graph_view.get_number_of_local_vertices()); + VertexFrontier, vertex_t, false, static_cast(Bucket::num_buckets)> + vertex_frontier(handle, bucket_sizes); + + if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { + vertex_frontier.get_bucket(static_cast(Bucket::cur)).insert(source_vertex); + } + + // 4. 
BFS iteration + + vertex_t depth{0}; + auto cur_local_vertex_frontier_first = + vertex_frontier.get_bucket(static_cast(Bucket::cur)).begin(); + auto cur_vertex_frontier_aggregate_size = + vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size(); + while (true) { + if (direction_optimizing) { + CUGRAPH_FAIL("unimplemented."); + } else { + vertex_partition_device_t vertex_partition(push_graph_view); + + auto cur_local_vertex_frontier_last = + vertex_frontier.get_bucket(static_cast(Bucket::cur)).end(); + update_frontier_v_push_if_out_nbr( + handle, + push_graph_view, + cur_local_vertex_frontier_first, + cur_local_vertex_frontier_last, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [vertex_partition, distances] __device__( + vertex_t src, vertex_t dst, auto src_val, auto dst_val) { + auto push = true; + if (vertex_partition.is_local_vertex_nocheck(dst)) { + auto distance = + *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst)); + if (distance != invalid_distance) { push = false; } + } + // FIXME: need to test this works properly if payload size is 0 (returns a tuple of size + // 1) + return thrust::make_tuple(push, src); + }, + reduce_op::any>(), + distances, + thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), + vertex_frontier, + [depth] __device__(auto v_val, auto pushed_val) { + auto idx = (v_val == invalid_distance) + ? 
static_cast(Bucket::cur) + : VertexFrontier, vertex_t>::kInvalidBucketIdx; + return thrust::make_tuple(idx, depth + 1, thrust::get<0>(pushed_val)); + }); + + auto new_vertex_frontier_aggregate_size = + vertex_frontier.get_bucket(static_cast(Bucket::cur)).aggregate_size() - + cur_vertex_frontier_aggregate_size; + if (new_vertex_frontier_aggregate_size == 0) { break; } + + cur_local_vertex_frontier_first = cur_local_vertex_frontier_last; + cur_vertex_frontier_aggregate_size += new_vertex_frontier_aggregate_size; + } + + depth++; + if (depth >= depth_limit) { break; } + } + + return; +} + +} // namespace detail + +template +void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + vertex_t *distances, + vertex_t *predecessors, + vertex_t source_vertex, + bool direction_optimizing, + vertex_t depth_limit, + bool do_expensive_check) +{ + if (predecessors != nullptr) { + detail::bfs(handle, + graph_view, + distances, + predecessors, + source_vertex, + direction_optimizing, + depth_limit, + do_expensive_check); + } else { + detail::bfs(handle, + graph_view, + distances, + thrust::make_discard_iterator(), + source_vertex, + direction_optimizing, + depth_limit, + do_expensive_check); + } +} + +// explicit instantiation + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const 
&graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int64_t *distances, + int64_t *predecessors, + int64_t source_vertex, + bool direction_optimizing, + int64_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int64_t *distances, + int64_t *predecessors, + int64_t source_vertex, + bool direction_optimizing, + int64_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int32_t *distances, + int32_t *predecessors, + int32_t source_vertex, + bool direction_optimizing, + int32_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int64_t *distances, + int64_t *predecessors, + int64_t source_vertex, + bool direction_optimizing, + int64_t depth_limit, + bool do_expensive_check); + +template void bfs(raft::handle_t &handle, + graph_view_t const &graph_view, + int64_t *distances, + int64_t *predecessors, + int64_t source_vertex, + bool direction_optimizing, + int64_t depth_limit, + bool 
do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu index 7b7625fd911..0294716089c 100644 --- a/cpp/src/experimental/graph.cu +++ b/cpp/src/experimental/graph.cu @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include @@ -190,8 +192,8 @@ std::vector segment_degree_sorted_vertex_partition(raft::handle_t cons CUDA_TRY(cudaStreamSynchronize( handle.get_stream())); // this is necessary as d_segment_offsets will become out-of-scope once - // this functions and returning a host variable which can be used right - // after return. + // this function returns and this function returns a host variable which + // can be used right after return. return h_segment_offsets; } @@ -218,15 +220,17 @@ graph_tget_handle_ptr()->get_comms(); - auto const comm_p_size = comm_p.get_size(); - auto &comm_p_row = this->get_handle_ptr()->get_subcomm(comm_p_row_key); - auto const comm_p_row_rank = comm_p_row.get_rank(); - auto const comm_p_row_size = comm_p_row.get_size(); - auto &comm_p_col = this->get_handle_ptr()->get_subcomm(comm_p_col_key); - auto const comm_p_col_rank = comm_p_col.get_rank(); - auto const comm_p_col_size = comm_p_col.get_size(); - auto default_stream = this->get_handle_ptr()->get_stream(); + auto &comm = this->get_handle_ptr()->get_comms(); + auto const comm_size = comm.get_size(); + auto &row_comm = + this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + auto const row_comm_size = row_comm.get_size(); + auto &col_comm = + this->get_handle_ptr()->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + auto const col_comm_size = col_comm.get_size(); + auto default_stream = this->get_handle_ptr()->get_stream(); CUGRAPH_EXPECTS(edgelists.size() > 0, "Invalid API parameter: edgelists.size() should be non-zero."); @@ 
-247,7 +251,7 @@ graph_t(comm_p_row_size))) || + (edgelists.size() == static_cast(row_comm_size))) || (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)), "Invalid API parameter: errneous edgelists.size()."); @@ -276,17 +280,14 @@ graph_tget_handle_ptr()->get_comms().allreduce(&number_of_local_edges_sum, - &number_of_local_edges_sum, - 1, - raft::comms::op_t::SUM, - default_stream); + number_of_local_edges_sum = + host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream); CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), "Invalid API parameter: the sum of local edges doe counts not match with " "number_of_local_edges."); CUGRAPH_EXPECTS( - partition.get_vertex_partition_range_last(comm_p_size - 1) == number_of_vertices, + partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, "Invalid API parameter: vertex partition should cover [0, number_of_vertices)."); } @@ -355,14 +356,14 @@ graph_t aggregate_segment_offsets( - comm_p_row_size * segment_offsets.size(), default_stream); - comm_p_row.allgather(segment_offsets.data(), - aggregate_segment_offsets.data(), - segment_offsets.size(), - default_stream); + rmm::device_uvector aggregate_segment_offsets(row_comm_size * segment_offsets.size(), + default_stream); + row_comm.allgather(segment_offsets.data(), + aggregate_segment_offsets.data(), + segment_offsets.size(), + default_stream); - vertex_partition_segment_offsets_.resize(comm_p_row_size * (segment_offsets.size())); + vertex_partition_segment_offsets_.resize(row_comm_size * (segment_offsets.size())); raft::update_host(vertex_partition_segment_offsets_.data(), aggregate_segment_offsets.data(), aggregate_segment_offsets.size(), @@ -521,7 +522,7 @@ template class graph_t; template class graph_t; template class graph_t; template class graph_t; - +// template class graph_t; template class graph_t; template class graph_t; diff --git a/cpp/src/experimental/graph_view.cu 
b/cpp/src/experimental/graph_view.cu index b297a825a01..999c91df427 100644 --- a/cpp/src/experimental/graph_view.cu +++ b/cpp/src/experimental/graph_view.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -76,9 +77,13 @@ graph_view_tget_handle_ptr()->get_comms().get_size(); - auto const comm_p_row_size = this->get_handle_ptr()->get_subcomm(comm_p_row_key).get_size(); - auto const comm_p_col_size = this->get_handle_ptr()->get_subcomm(comm_p_col_key).get_size(); + auto const comm_size = this->get_handle_ptr()->get_comms().get_size(); + auto const row_comm_size = this->get_handle_ptr() + ->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()) + .get_size(); + auto const col_comm_size = this->get_handle_ptr() + ->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()) + .get_size(); CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == adj_matrix_partition_indices.size(), "Invalid API parameter: adj_matrix_partition_offsets.size() and " @@ -90,13 +95,13 @@ graph_view_t(comm_p_row_size))) || + (adj_matrix_partition_offsets.size() == static_cast(row_comm_size))) || (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)), "Invalid API parameter: errneous adj_matrix_partition_offsets.size()."); CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == - comm_p_col_size * (detail::num_segments_per_vertex_partition + 1))) || + col_comm_size * (detail::num_segments_per_vertex_partition + 1))) || (!sorted_by_global_degree_within_vertex_partition && (vertex_partition_segment_offsets.size() == 0)), "Invalid API parameter: vertex_partition_segment_offsets.size() does not match " @@ -107,8 +112,12 @@ graph_view_tget_handle_ptr()->get_stream(); - auto const comm_p_row_rank = this->get_handle_ptr()->get_subcomm(comm_p_row_key).get_rank(); - auto const comm_p_col_rank = this->get_handle_ptr()->get_subcomm(comm_p_col_key).get_rank(); + auto const row_comm_rank = 
this->get_handle_ptr() + ->get_subcomm(cugraph::partition_2d::key_naming_t().row_name()) + .get_rank(); + auto const col_comm_rank = this->get_handle_ptr() + ->get_subcomm(cugraph::partition_2d::key_naming_t().col_name()) + .get_rank(); edge_t number_of_local_edges_sum{}; for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) { @@ -159,7 +168,7 @@ graph_view_t; template class graph_view_t; template class graph_view_t; template class graph_view_t; +template class graph_view_t; +template class graph_view_t; +template class graph_view_t; +template class graph_view_t; template class graph_view_t; template class graph_view_t; @@ -299,6 +312,10 @@ template class graph_view_t; template class graph_view_t; template class graph_view_t; template class graph_view_t; +template class graph_view_t; +template class graph_view_t; +template class graph_view_t; +template class graph_view_t; } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/katz_centrality.cu b/cpp/src/experimental/katz_centrality.cu new file mode 100644 index 00000000000..86b534bc0f3 --- /dev/null +++ b/cpp/src/experimental/katz_centrality.cu @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +void katz_centrality(raft::handle_t &handle, + GraphViewType const &pull_graph_view, + result_t *betas, + result_t *katz_centralities, + result_t alpha, + result_t beta, // relevant only if betas == nullptr + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check) +{ + using vertex_t = typename GraphViewType::vertex_type; + using weight_t = typename GraphViewType::weight_type; + + static_assert(std::is_integral::value, + "GraphViewType::vertex_type should be integral."); + static_assert(std::is_floating_point::value, + "result_t should be a floating-point type."); + static_assert(GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the pull model."); + + auto const num_vertices = pull_graph_view.get_number_of_vertices(); + if (num_vertices == 0) { return; } + + // 1. check input arguments + + CUGRAPH_EXPECTS((alpha >= 0.0) && (alpha <= 1.0), + "Invalid input argument: alpha should be in [0.0, 1.0]."); + CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); + + if (do_expensive_check) { + // FIXME: should I check for betas? + + if (has_initial_guess) { + auto num_negative_values = count_if_v( + handle, pull_graph_view, katz_centralities, [] __device__(auto val) { return val < 0.0; }); + CUGRAPH_EXPECTS(num_negative_values == 0, + "Invalid input argument: initial guess values should be non-negative."); + } + } + + // 2. initialize katz centrality values + + if (!has_initial_guess) { + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + katz_centralities, + katz_centralities + pull_graph_view.get_number_of_local_vertices(), + result_t{0.0}); + } + + // 3. 
katz centrality iteration + + // old katz centrality values + rmm::device_vector adj_matrix_row_katz_centralities( + pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), result_t{0.0}); + size_t iter{0}; + while (true) { + copy_to_adj_matrix_row( + handle, pull_graph_view, katz_centralities, adj_matrix_row_katz_centralities.begin()); + + copy_v_transform_reduce_in_nbr( + handle, + pull_graph_view, + adj_matrix_row_katz_centralities.begin(), + thrust::make_constant_iterator(0) /* dummy */, + [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return static_cast(alpha * src_val * w); + }, + betas != nullptr ? result_t{0.0} : beta, + katz_centralities); + + if (betas != nullptr) { + auto val_first = thrust::make_zip_iterator(thrust::make_tuple(katz_centralities, betas)); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + val_first, + val_first + pull_graph_view.get_number_of_local_vertices(), + katz_centralities, + [] __device__(auto val) { + auto const katz_centrality = thrust::get<0>(val); + auto const beta = thrust::get<1>(val); + return katz_centrality + beta; + }); + } + + auto diff_sum = transform_reduce_v_with_adj_matrix_row( + handle, + pull_graph_view, + katz_centralities, + adj_matrix_row_katz_centralities.begin(), + [] __device__(auto v_val, auto row_val) { return std::abs(v_val - row_val); }, + result_t{0.0}); + + iter++; + + if (diff_sum < static_cast(num_vertices) * epsilon) { + break; + } else if (iter >= max_iterations) { + CUGRAPH_FAIL("Katz Centrality failed to converge."); + } + } + + if (normalize) { + auto l2_norm = transform_reduce_v( + handle, + pull_graph_view, + katz_centralities, + [] __device__(auto val) { return val * val; }, + result_t{0.0}); + l2_norm = std::sqrt(l2_norm); + CUGRAPH_EXPECTS(l2_norm > 0.0, + "L2 norm of the computed Katz Centrality values should be positive."); + 
thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + katz_centralities, + katz_centralities + pull_graph_view.get_number_of_local_vertices(), + katz_centralities, + [l2_norm] __device__(auto val) { return val / l2_norm; }); + } + + return; +} + +} // namespace detail + +template +void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + result_t *betas, + result_t *katz_centralities, + result_t alpha, + result_t beta, // relevant only if beta == nullptr + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check) +{ + detail::katz_centrality(handle, + graph_view, + betas, + katz_centralities, + alpha, + beta, + epsilon, + max_iterations, + has_initial_guess, + normalize, + do_expensive_check); +} + +// explicit instantiation + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void 
katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t &handle, + graph_view_t const &graph_view, + float *betas, + float *katz_centralities, + float alpha, + float beta, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +template void katz_centrality(raft::handle_t 
&handle, + graph_view_t const &graph_view, + double *betas, + double *katz_centralities, + double alpha, + double beta, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh new file mode 100644 index 00000000000..cadc685b119 --- /dev/null +++ b/cpp/src/experimental/louvain.cuh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cugraph { +namespace experimental { + +template +class Louvain { + public: + using graph_view_t = graph_view_type; + using vertex_t = typename graph_view_t::vertex_type; + using edge_t = typename graph_view_t::edge_type; + using weight_t = typename graph_view_t::weight_type; + using graph_t = experimental::graph_t; + + Louvain(raft::handle_t const &handle, graph_view_t const &graph_view) + : handle_(handle), current_graph_view_(graph_view) + { + } + + virtual std::pair operator()(vertex_t *d_cluster_vec, + size_t max_level, + weight_t resolution) + { + CUGRAPH_FAIL("unimplemented"); + } + + protected: + raft::handle_t const &handle_; + graph_view_t current_graph_view_; +}; + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/pagerank.cu b/cpp/src/experimental/pagerank.cu new file mode 100644 index 00000000000..5948d329d64 --- /dev/null +++ b/cpp/src/experimental/pagerank.cu @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +// FIXME: personalization_vector_size is confusing in OPG (local or aggregate?) 
+template +void pagerank(raft::handle_t& handle, + GraphViewType const& pull_graph_view, + typename GraphViewType::weight_type* adj_matrix_row_out_weight_sums, + typename GraphViewType::vertex_type* personalization_vertices, + result_t* personalization_values, + typename GraphViewType::vertex_type personalization_vector_size, + result_t* pageranks, + result_t alpha, + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check) +{ + using vertex_t = typename GraphViewType::vertex_type; + using weight_t = typename GraphViewType::weight_type; + + static_assert(std::is_integral::value, + "GraphViewType::vertex_type should be integral."); + static_assert(std::is_floating_point::value, + "result_t should be a floating-point type."); + static_assert(GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the pull model."); + + auto const num_vertices = pull_graph_view.get_number_of_vertices(); + if (num_vertices == 0) { return; } + + // 1. check input arguments + + CUGRAPH_EXPECTS( + (personalization_vertices == nullptr) || (personalization_values != nullptr), + "Invalid input argument: if personalization vertices are provided, personalization " + "values should be provided as well."); + CUGRAPH_EXPECTS((alpha >= 0.0) && (alpha <= 1.0), + "Invalid input argument: alpha should be in [0.0, 1.0]."); + CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); + + if (do_expensive_check) { + if (adj_matrix_row_out_weight_sums != nullptr) { + auto has_negative_weight_sums = any_of_adj_matrix_row( + handle, pull_graph_view, adj_matrix_row_out_weight_sums, [] __device__(auto val) { + return val < result_t{0.0}; + }); + CUGRAPH_EXPECTS( + has_negative_weight_sums == false, + "Invalid input argument: outgoing edge weight sum values should be non-negative."); + } + + if (pull_graph_view.is_weighted()) { + auto num_nonpositive_edge_weights = count_if_e( + handle, + pull_graph_view, + 
thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return w <= 0.0; + }); + CUGRAPH_EXPECTS(num_nonpositive_edge_weights == 0, + "Invalid input argument: input graph should have positive edge weights."); + } + + if (has_initial_guess) { + auto num_negative_values = count_if_v( + handle, pull_graph_view, pageranks, [] __device__(auto val) { return val < 0.0; }); + CUGRAPH_EXPECTS(num_negative_values == 0, + "Invalid input argument: initial guess values should be non-negative."); + } + + if (personalization_vertices != nullptr) { + vertex_partition_device_t vertex_partition(pull_graph_view); + auto num_invalid_vertices = + count_if_v(handle, + pull_graph_view, + personalization_vertices, + personalization_vertices + personalization_vector_size, + [vertex_partition] __device__(auto val) { + return !(vertex_partition.is_valid_vertex(val) && + vertex_partition.is_local_vertex_nocheck(val)); + }); + CUGRAPH_EXPECTS(num_invalid_vertices == 0, + "Invalid input argument: personalization vertices have invalid vertex IDs."); + auto num_negative_values = count_if_v(handle, + pull_graph_view, + personalization_values, + personalization_values + personalization_vector_size, + [] __device__(auto val) { return val < 0.0; }); + CUGRAPH_EXPECTS(num_negative_values == 0, + "Invalid input argument: personalization values should be non-negative."); + } + } + + // 2. compute the sums of the out-going edge weights (if not provided) + + rmm::device_vector tmp_adj_matrix_row_out_weight_sums{}; + if (adj_matrix_row_out_weight_sums == nullptr) { + rmm::device_vector tmp_out_weight_sums(pull_graph_view.get_number_of_local_vertices(), + weight_t{0.0}); + // FIXME: better refactor this out (computing out-degree). 
+ copy_v_transform_reduce_out_nbr( + handle, + pull_graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return w; + }, + weight_t{0.0}, + tmp_out_weight_sums.data().get()); + + tmp_adj_matrix_row_out_weight_sums.assign( + pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), weight_t{0.0}); + copy_to_adj_matrix_row(handle, + pull_graph_view, + tmp_out_weight_sums.data().get(), + tmp_adj_matrix_row_out_weight_sums.begin()); + } + + auto row_out_weight_sums = adj_matrix_row_out_weight_sums != nullptr + ? adj_matrix_row_out_weight_sums + : tmp_adj_matrix_row_out_weight_sums.data().get(); + + // 3. initialize pagerank values + + if (has_initial_guess) { + auto sum = reduce_v(handle, pull_graph_view, pageranks, result_t{0.0}); + CUGRAPH_EXPECTS( + sum > 0.0, + "Invalid input argument: sum of the PageRank initial guess values should be positive."); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pageranks, + pageranks + pull_graph_view.get_number_of_local_vertices(), + pageranks, + [sum] __device__(auto val) { return val / sum; }); + } else { + thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + pageranks, + pageranks + pull_graph_view.get_number_of_local_vertices(), + result_t{1.0} / static_cast(num_vertices)); + } + + // 4. sum the personalization values + + result_t personalization_sum{0.0}; + if (personalization_vertices != nullptr) { + personalization_sum = reduce_v(handle, + pull_graph_view, + personalization_values, + personalization_values + personalization_vector_size, + result_t{0.0}); + CUGRAPH_EXPECTS(personalization_sum > 0.0, + "Invalid input argument: sum of personalization values should be positive."); + } + + // 5. 
pagerank iteration + + // old PageRank values + rmm::device_vector adj_matrix_row_pageranks( + pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), result_t{0.0}); + size_t iter{0}; + while (true) { + copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks.begin()); + + auto row_val_first = thrust::make_zip_iterator( + thrust::make_tuple(adj_matrix_row_pageranks.begin(), row_out_weight_sums)); + thrust::transform( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + row_val_first, + row_val_first + pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), + adj_matrix_row_pageranks.begin(), + [] __device__(auto val) { + auto const row_pagerank = thrust::get<0>(val); + auto const row_out_weight_sum = thrust::get<1>(val); + auto const divisor = + row_out_weight_sum == result_t{0.0} ? result_t{1.0} : row_out_weight_sum; + return row_pagerank / divisor; + }); + + auto dangling_sum = transform_reduce_v_with_adj_matrix_row( + handle, + pull_graph_view, + thrust::make_constant_iterator(0) /* dummy */, + row_val_first, + [] __device__(auto v_val, auto row_val) { + auto const row_pagerank = thrust::get<0>(row_val); + auto const row_out_weight_sum = thrust::get<1>(row_val); + return row_out_weight_sum == result_t{0.0} ? row_pagerank : result_t{0.0}; + }, + result_t{0.0}); + + auto unvarying_part = + personalization_vertices == nullptr + ? 
(dangling_sum + static_cast(1.0 - alpha)) / static_cast(num_vertices) + : result_t{0.0}; + + copy_v_transform_reduce_in_nbr( + handle, + pull_graph_view, + adj_matrix_row_pageranks.begin(), + thrust::make_constant_iterator(0) /* dummy */, + [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return src_val * w * alpha; + }, + unvarying_part, + pageranks); + + if (personalization_vertices != nullptr) { + vertex_partition_device_t vertex_partition(pull_graph_view); + auto val_first = thrust::make_zip_iterator( + thrust::make_tuple(personalization_vertices, personalization_values)); + thrust::for_each( + val_first, + val_first + personalization_vector_size, + [vertex_partition, pageranks, dangling_sum, personalization_sum, alpha] __device__( + auto val) { + auto v = thrust::get<0>(val); + auto value = thrust::get<1>(val); + *(pageranks + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)) += + (dangling_sum + static_cast(1.0 - alpha)) * (value / personalization_sum); + }); + } + + auto diff_sum = transform_reduce_v_with_adj_matrix_row( + handle, + pull_graph_view, + pageranks, + thrust::make_zip_iterator( + thrust::make_tuple(adj_matrix_row_pageranks.begin(), row_out_weight_sums)), + [] __device__(auto v_val, auto row_val) { + auto multiplier = + thrust::get<1>(row_val) == result_t{0.0} ? 
result_t{1.0} : thrust::get<1>(row_val); + return std::abs(v_val - thrust::get<0>(row_val) * multiplier); + }, + result_t{0.0}); + + iter++; + + if (diff_sum < static_cast(num_vertices) * epsilon) { + break; + } else if (iter >= max_iterations) { + CUGRAPH_FAIL("PageRank failed to converge."); + } + } + + return; +} + +} // namespace detail + +template +void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + weight_t* adj_matrix_row_out_weight_sums, + vertex_t* personalization_vertices, + result_t* personalization_values, + vertex_t personalization_vector_size, + result_t* pageranks, + result_t alpha, + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check) +{ + detail::pagerank(handle, + graph_view, + adj_matrix_row_out_weight_sums, + personalization_vertices, + personalization_values, + personalization_vector_size, + pageranks, + alpha, + epsilon, + max_iterations, + has_initial_guess, + do_expensive_check); +} + +// explicit instantiation + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + float* personalization_values, + int32_t personalization_vector_size, + float* pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + double* personalization_values, + int32_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + float* personalization_values, + int32_t personalization_vector_size, + float* 
pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + double* personalization_values, + int32_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int64_t* personalization_vertices, + float* personalization_values, + int64_t personalization_vector_size, + float* pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int64_t* personalization_vertices, + double* personalization_values, + int64_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + float* personalization_values, + int32_t personalization_vector_size, + float* pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + double* personalization_values, + int32_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void 
pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + float* personalization_values, + int32_t personalization_vector_size, + float* pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int32_t* personalization_vertices, + double* personalization_values, + int32_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + float* adj_matrix_row_out_weight_sums, + int64_t* personalization_vertices, + float* personalization_values, + int64_t personalization_vector_size, + float* pageranks, + float alpha, + float epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +template void pagerank(raft::handle_t& handle, + graph_view_t const& graph_view, + double* adj_matrix_row_out_weight_sums, + int64_t* personalization_vertices, + double* personalization_values, + int64_t personalization_vector_size, + double* pageranks, + double alpha, + double epsilon, + size_t max_iterations, + bool has_initial_guess, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu new file mode 100644 index 00000000000..e0679ad0d56 --- /dev/null +++ b/cpp/src/experimental/sssp.cu @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace experimental { +namespace detail { + +template +void sssp(raft::handle_t &handle, + GraphViewType const &push_graph_view, + typename GraphViewType::weight_type *distances, + PredecessorIterator predecessor_first, + typename GraphViewType::vertex_type source_vertex, + typename GraphViewType::weight_type cutoff, + bool do_expensive_check) +{ + using vertex_t = typename GraphViewType::vertex_type; + using weight_t = typename GraphViewType::weight_type; + + static_assert(std::is_integral::value, + "GraphViewType::vertex_type should be integral."); + static_assert(!GraphViewType::is_adj_matrix_transposed, + "GraphViewType should support the push model."); + + auto const num_vertices = push_graph_view.get_number_of_vertices(); + auto const num_edges = push_graph_view.get_number_of_edges(); + if (num_vertices == 0) { return; } + + // implements the Near-Far Pile method in + // A. Davidson, S. Baxter, M. Garland, and J. D. Owens, "Work-efficient parallel GPU methods for + // single-source shortest paths," 2014. + + // 1. 
check input arguments + + CUGRAPH_EXPECTS(push_graph_view.is_valid_vertex(source_vertex), + "Invalid input argument: source vertex out-of-range."); + + if (do_expensive_check) { + auto num_negative_edge_weights = + count_if_e(handle, + push_graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + return w < 0.0; + }); + CUGRAPH_EXPECTS(num_negative_edge_weights == 0, + "Invalid input argument: input graph should have non-negative edge weights."); + } + + // 2. initialize distances and predecessors + + auto constexpr invalid_distance = std::numeric_limits::max(); + auto constexpr invalid_vertex = invalid_vertex_id::value; + + auto val_first = thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)); + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(push_graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(push_graph_view.get_local_vertex_last()), + val_first, + [source_vertex] __device__(auto val) { + auto distance = invalid_distance; + if (val == source_vertex) { distance = weight_t{0.0}; } + return thrust::make_tuple(distance, invalid_vertex); + }); + + if (num_edges == 0) { return; } + + // 3. 
update delta + + weight_t average_vertex_degree{0.0}; + weight_t average_edge_weight{0.0}; + thrust::tie(average_vertex_degree, average_edge_weight) = transform_reduce_e( + handle, + push_graph_view, + thrust::make_constant_iterator(0) /* dummy */, + thrust::make_constant_iterator(0) /* dummy */, + [] __device__(vertex_t row, vertex_t col, weight_t w, auto row_val, auto col_val) { + return thrust::make_tuple(weight_t{1.0}, w); + }, + thrust::make_tuple(weight_t{0.0}, weight_t{0.0})); + average_vertex_degree /= static_cast(num_vertices); + average_edge_weight /= static_cast(num_edges); + auto delta = + (static_cast(raft::warp_size()) * average_edge_weight) / average_vertex_degree; + + // 4. initialize SSSP frontier + + enum class Bucket { cur_near, new_near, far, num_buckets }; + // FIXME: need to double check the bucket sizes are sufficient + std::vector bucket_sizes(static_cast(Bucket::num_buckets), + push_graph_view.get_number_of_local_vertices()); + VertexFrontier, + vertex_t, + false, + static_cast(Bucket::num_buckets)> + vertex_frontier(handle, bucket_sizes); + + // 5. SSSP iteration + + bool vertex_and_adj_matrix_row_ranges_coincide = + push_graph_view.get_number_of_local_vertices() == + push_graph_view.get_number_of_local_adj_matrix_partition_rows() + ? true + : false; + rmm::device_vector adj_matrix_row_distances{}; + if (!vertex_and_adj_matrix_row_ranges_coincide) { + adj_matrix_row_distances.assign(push_graph_view.get_number_of_local_adj_matrix_partition_rows(), + std::numeric_limits::max()); + } + auto row_distances = + !vertex_and_adj_matrix_row_ranges_coincide ? 
adj_matrix_row_distances.data().get() : distances; + + if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).insert(source_vertex); + } + + auto near_far_threshold = delta; + while (true) { + if (!vertex_and_adj_matrix_row_ranges_coincide) { + copy_to_adj_matrix_row( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + distances, + row_distances); + } + + vertex_partition_device_t vertex_partition(push_graph_view); + + update_frontier_v_push_if_out_nbr( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + row_distances, + thrust::make_constant_iterator(0) /* dummy */, + [vertex_partition, distances, cutoff] __device__( + vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + auto push = true; + auto new_distance = src_val + w; + auto threshold = cutoff; + if (vertex_partition.is_local_vertex_nocheck(dst)) { + auto local_vertex_offset = + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(dst); + auto old_distance = *(distances + local_vertex_offset); + threshold = old_distance < threshold ? old_distance : threshold; + } + if (new_distance >= threshold) { push = false; } + return thrust::make_tuple(push, new_distance, src); + }, + reduce_op::min>(), + distances, + thrust::make_zip_iterator(thrust::make_tuple(distances, predecessor_first)), + vertex_frontier, + [near_far_threshold] __device__(auto v_val, auto pushed_val) { + auto new_dist = thrust::get<0>(pushed_val); + auto idx = new_dist < v_val + ? (new_dist < near_far_threshold ? 
static_cast(Bucket::new_near) + : static_cast(Bucket::far)) + : VertexFrontier, vertex_t>::kInvalidBucketIdx; + return thrust::make_tuple(idx, thrust::get<0>(pushed_val), thrust::get<1>(pushed_val)); + }); + + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).clear(); + if (vertex_frontier.get_bucket(static_cast(Bucket::new_near)).aggregate_size() > 0) { + vertex_frontier.swap_buckets(static_cast(Bucket::cur_near), + static_cast(Bucket::new_near)); + } else if (vertex_frontier.get_bucket(static_cast(Bucket::far)).aggregate_size() > + 0) { // near queue is empty, split the far queue + auto old_near_far_threshold = near_far_threshold; + near_far_threshold += delta; + + while (true) { + vertex_frontier.split_bucket( + static_cast(Bucket::far), + [vertex_partition, distances, old_near_far_threshold, near_far_threshold] __device__( + auto v) { + auto dist = + *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)); + if (dist < old_near_far_threshold) { + return VertexFrontier, vertex_t>::kInvalidBucketIdx; + } else if (dist < near_far_threshold) { + return static_cast(Bucket::cur_near); + } else { + return static_cast(Bucket::far); + } + }); + if (vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).aggregate_size() > + 0) { + break; + } else { + near_far_threshold += delta; + } + } + } else { + break; + } + } + + return; +} + +} // namespace detail + +template +void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + weight_t *distances, + vertex_t *predecessors, + vertex_t source_vertex, + weight_t cutoff, + bool do_expensive_check) +{ + if (predecessors != nullptr) { + detail::sssp( + handle, graph_view, distances, predecessors, source_vertex, cutoff, do_expensive_check); + } else { + detail::sssp(handle, + graph_view, + distances, + thrust::make_discard_iterator(), + source_vertex, + cutoff, + do_expensive_check); + } +} + +// explicit instantiation + +template void sssp(raft::handle_t &handle, + graph_view_t const 
&graph_view, + float *distances, + int32_t *predecessors, + int32_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int32_t *predecessors, + int32_t source_vertex, + double cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + float *distances, + int32_t *predecessors, + int32_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int32_t *predecessors, + int32_t source_vertex, + double cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + float *distances, + int64_t *predecessors, + int64_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int64_t *predecessors, + int64_t source_vertex, + double cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + float *distances, + int32_t *predecessors, + int32_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int32_t *predecessors, + int32_t source_vertex, + double cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + float *distances, + int32_t *predecessors, + int32_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int32_t *predecessors, + int32_t source_vertex, + double cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + float *distances, + int64_t 
*predecessors, + int64_t source_vertex, + float cutoff, + bool do_expensive_check); + +template void sssp(raft::handle_t &handle, + graph_view_t const &graph_view, + double *distances, + int64_t *predecessors, + int64_t source_vertex, + double cutoff, + bool do_expensive_check); + +} // namespace experimental +} // namespace cugraph diff --git a/cpp/src/traversal/mg/common_utils.cuh b/cpp/src/traversal/mg/common_utils.cuh index 6199730c28f..2cda827b471 100644 --- a/cpp/src/traversal/mg/common_utils.cuh +++ b/cpp/src/traversal/mg/common_utils.cuh @@ -184,7 +184,7 @@ return_t collect_vectors(raft::handle_t const &handle, // h_buffer_offsets has to be int because raft allgatherv expects // int array for displacement vector. This should be changed in // raft so that the displacement is templated - thrust::host_vector h_buffer_offsets(h_buffer_len.size()); + thrust::host_vector h_buffer_offsets(h_buffer_len.size()); thrust::exclusive_scan( thrust::host, h_buffer_len.begin(), h_buffer_len.end(), h_buffer_offsets.begin()); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu new file mode 100644 index 00000000000..f10b11fe8a4 --- /dev/null +++ b/cpp/src/utilities/cython.cu @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { +namespace cython { + +namespace detail { + +// FIXME: Add description of this function +template * = nullptr> +std::unique_ptr> +create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) +{ + std::vector> edgelist( + {{reinterpret_cast(graph_container.src_vertices), + reinterpret_cast(graph_container.dst_vertices), + reinterpret_cast(graph_container.weights), + static_cast(graph_container.num_partition_edges)}}); + + std::vector partition_offsets_vector( + reinterpret_cast(graph_container.vertex_partition_offsets), + reinterpret_cast(graph_container.vertex_partition_offsets) + + (graph_container.row_comm_size * graph_container.col_comm_size) + 1); + + experimental::partition_t partition(partition_offsets_vector, + graph_container.hypergraph_partitioned, + graph_container.row_comm_size, + graph_container.col_comm_size, + graph_container.row_comm_rank, + graph_container.col_comm_rank); + + return std::make_unique>( + handle, + edgelist, + partition, + static_cast(graph_container.num_global_vertices), + static_cast(graph_container.num_global_edges), + graph_container.graph_props, + graph_container.sorted_by_degree, + graph_container.do_expensive_check); +} + +template * = nullptr> +std::unique_ptr> +create_graph(raft::handle_t const& handle, graph_container_t const& graph_container) +{ + experimental::edgelist_t edgelist{ + reinterpret_cast(graph_container.src_vertices), + reinterpret_cast(graph_container.dst_vertices), + reinterpret_cast(graph_container.weights), + static_cast(graph_container.num_partition_edges)}; + + return std::make_unique>( + handle, + edgelist, + static_cast(graph_container.num_global_vertices), + graph_container.graph_props, + graph_container.sorted_by_degree, + graph_container.do_expensive_check); +} + +} // namespace detail + +// Populates a graph_container_t with a pointer to a new 
graph object and sets +// the meta-data accordingly. The graph container owns the pointer and it is +// assumed it will delete it on destruction. +void populate_graph_container(graph_container_t& graph_container, + raft::handle_t& handle, + void* src_vertices, + void* dst_vertices, + void* weights, + void* vertex_partition_offsets, + numberTypeEnum vertexType, + numberTypeEnum edgeType, + numberTypeEnum weightType, + size_t num_partition_edges, + size_t num_global_vertices, + size_t num_global_edges, + size_t row_comm_size, // pcols + size_t col_comm_size, // prows + bool sorted_by_degree, + bool transposed, + bool multi_gpu) +{ + CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, + "populate_graph_container() can only be called on an empty container."); + + bool do_expensive_check{false}; + bool hypergraph_partitioned{false}; + + // FIXME: Consider setting up the subcomms right after initializing comms, no + // need to delay to this point. + // Setup the subcommunicators needed for this partition on the handle. 
+ partition_2d::subcomm_factory_t subcomm_factory(handle, + row_comm_size); + // FIXME: once the subcomms are set up earlier (outside this function), remove + // the row/col_comm_size params and retrieve them from the handle (commented + // out lines below) + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_rank = row_comm.get_rank(); + // auto const row_comm_size = row_comm.get_size(); // pcols + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_rank = col_comm.get_rank(); + // auto const col_comm_size = col_comm.get_size(); // prows + + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.src_vertices = src_vertices; + graph_container.dst_vertices = dst_vertices; + graph_container.weights = weights; + graph_container.num_partition_edges = num_partition_edges; + graph_container.num_global_vertices = num_global_vertices; + graph_container.num_global_edges = num_global_edges; + graph_container.vertexType = vertexType; + graph_container.edgeType = edgeType; + graph_container.weightType = weightType; + graph_container.transposed = transposed; + graph_container.is_multi_gpu = multi_gpu; + graph_container.hypergraph_partitioned = hypergraph_partitioned; + graph_container.row_comm_size = row_comm_size; + graph_container.col_comm_size = col_comm_size; + graph_container.row_comm_rank = row_comm_rank; + graph_container.col_comm_rank = col_comm_rank; + graph_container.sorted_by_degree = sorted_by_degree; + graph_container.do_expensive_check = do_expensive_check; + + experimental::graph_properties_t graph_props{.is_symmetric = false, .is_multigraph = false}; + graph_container.graph_props = graph_props; + + graph_container.graph_type = graphTypeEnum::graph_t; +} + +void populate_graph_container_legacy(graph_container_t& graph_container, + graphTypeEnum legacyType, + raft::handle_t const& handle, + void* offsets, + void* 
indices, + void* weights, + numberTypeEnum offsetType, + numberTypeEnum indexType, + numberTypeEnum weightType, + size_t num_global_vertices, + size_t num_global_edges, + int* local_vertices, + int* local_edges, + int* local_offsets) +{ + CUGRAPH_EXPECTS(graph_container.graph_type == graphTypeEnum::null, + "populate_graph_container() can only be called on an empty container."); + + // FIXME: This is soon-to-be legacy code left in place until the new graph_t + // class is supported everywhere else. Remove everything down to the comment + // line after the return stmnt. + // Keep new code below return stmnt enabled to ensure it builds. + if (weightType == numberTypeEnum::floatType) { + switch (legacyType) { + case graphTypeEnum::LegacyCSR: { + graph_container.graph_ptr_union.GraphCSRViewFloatPtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCSRViewFloat; + (graph_container.graph_ptr_union.GraphCSRViewFloatPtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCSRViewFloatPtr) + ->set_handle(const_cast(&handle)); + } break; + case graphTypeEnum::LegacyCSC: { + graph_container.graph_ptr_union.GraphCSCViewFloatPtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCSCViewFloat; + (graph_container.graph_ptr_union.GraphCSCViewFloatPtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCSCViewFloatPtr) + ->set_handle(const_cast(&handle)); + } break; + case graphTypeEnum::LegacyCOO: { + graph_container.graph_ptr_union.GraphCOOViewFloatPtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, 
+ num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCOOViewFloat; + (graph_container.graph_ptr_union.GraphCOOViewFloatPtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCOOViewFloatPtr) + ->set_handle(const_cast(&handle)); + } break; + default: CUGRAPH_FAIL("unsupported graphTypeEnum value"); break; + } + + } else { + switch (legacyType) { + case graphTypeEnum::LegacyCSR: { + graph_container.graph_ptr_union.GraphCSRViewDoublePtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCSRViewDouble; + (graph_container.graph_ptr_union.GraphCSRViewDoublePtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCSRViewDoublePtr) + ->set_handle(const_cast(&handle)); + } break; + case graphTypeEnum::LegacyCSC: { + graph_container.graph_ptr_union.GraphCSCViewDoublePtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCSCViewDouble; + (graph_container.graph_ptr_union.GraphCSCViewDoublePtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCSCViewDoublePtr) + ->set_handle(const_cast(&handle)); + } break; + case graphTypeEnum::LegacyCOO: { + graph_container.graph_ptr_union.GraphCOOViewDoublePtr = + std::make_unique>(reinterpret_cast(offsets), + reinterpret_cast(indices), + reinterpret_cast(weights), + num_global_vertices, + num_global_edges); + graph_container.graph_type = graphTypeEnum::GraphCOOViewDouble; + (graph_container.graph_ptr_union.GraphCOOViewDoublePtr) + ->set_local_data(local_vertices, local_edges, local_offsets); + (graph_container.graph_ptr_union.GraphCOOViewDoublePtr) + 
->set_handle(const_cast(&handle)); + } break; + default: CUGRAPH_FAIL("unsupported graphTypeEnum value"); break; + } + } + return; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { +template +std::pair call_louvain(raft::handle_t const& handle, + graph_view_t const& graph_view, + void* identifiers, + void* parts, + size_t max_level, + weight_t resolution) +{ + thrust::copy( // rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::device, + thrust::make_counting_iterator(graph_view.get_local_vertex_first()), + thrust::make_counting_iterator(graph_view.get_local_vertex_last()), + reinterpret_cast(identifiers)); + + return louvain(handle, + graph_view, + reinterpret_cast(parts), + max_level, + static_cast(resolution)); +} + +} // namespace detail + +namespace detail { + +// Final, fully-templatized call. +template +return_t call_function(raft::handle_t const& handle, + graph_container_t const& graph_container, + function_t function) +{ + auto graph = + create_graph(handle, graph_container); + + return function(handle, graph->view()); +} + +// Makes another call based on vertex_t and edge_t +template +return_t call_function(raft::handle_t const& handle, + graph_container_t const& graph_container, + function_t function) +{ + // Since only vertex/edge types (int32,int32), (int32,int64), and + // (int64,int64) are being supported, explicitely check for those types and + // ensure (int64,int32) is rejected as unsupported. 
+ if ((graph_container.vertexType == numberTypeEnum::int32Type) && + (graph_container.edgeType == numberTypeEnum::int32Type)) { + return call_function(handle, graph_container, function); + } else if ((graph_container.vertexType == numberTypeEnum::int32Type) && + (graph_container.edgeType == numberTypeEnum::int64Type)) { + return call_function(handle, graph_container, function); + } else if ((graph_container.vertexType == numberTypeEnum::int64Type) && + (graph_container.edgeType == numberTypeEnum::int64Type)) { + return call_function(handle, graph_container, function); + } else { + CUGRAPH_FAIL("vertexType/edgeType combination unsupported"); + } +} + +// Makes another call based on weight_t +template +return_t call_function(raft::handle_t const& handle, + graph_container_t const& graph_container, + function_t function) +{ + if (graph_container.weightType == numberTypeEnum::floatType) { + return call_function( + handle, graph_container, function); + } else if (graph_container.weightType == numberTypeEnum::doubleType) { + return call_function( + handle, graph_container, function); + } else { + CUGRAPH_FAIL("weightType unsupported"); + } +} + +// Makes another call based on multi_gpu +template +return_t call_function(raft::handle_t const& handle, + graph_container_t const& graph_container, + function_t function) +{ + if (graph_container.is_multi_gpu) { + return call_function(handle, graph_container, function); + } else { + return call_function( + handle, graph_container, function); + } +} + +// Initial call_function() call starts here. 
+// This makes another call based on transposed +template +return_t call_function(raft::handle_t const& handle, + graph_container_t const& graph_container, + function_t function) +{ + if (graph_container.transposed) { + return call_function(handle, graph_container, function); + } else { + return call_function(handle, graph_container, function); + } +} + +template +class louvain_functor { + public: + louvain_functor(void* identifiers, void* parts, size_t max_level, weight_t resolution) + : identifiers_(identifiers), parts_(parts), max_level_(max_level), resolution_(resolution) + { + } + + template + std::pair operator()(raft::handle_t const& handle, + graph_view_t const& graph_view) + { + return cugraph::louvain(handle, + graph_view, + reinterpret_cast(parts_), + max_level_, + resolution_); + } + + private: + void* identifiers_; // FIXME: this will be used in a future PR + void* parts_; + size_t max_level_; + weight_t resolution_; +}; + +} // namespace detail + +// Wrapper for calling Louvain using a graph container +template +std::pair call_louvain(raft::handle_t const& handle, + graph_container_t const& graph_container, + void* identifiers, + void* parts, + size_t max_level, + weight_t resolution) +{ + // LEGACY PATH - remove when migration to graph_t types complete + if (graph_container.graph_type == graphTypeEnum::GraphCSRViewFloat) { + graph_container.graph_ptr_union.GraphCSRViewFloatPtr->get_vertex_identifiers( + static_cast(identifiers)); + return louvain(handle, + *(graph_container.graph_ptr_union.GraphCSRViewFloatPtr), + reinterpret_cast(parts), + max_level, + static_cast(resolution)); + } else if (graph_container.graph_type == graphTypeEnum::GraphCSRViewDouble) { + graph_container.graph_ptr_union.GraphCSRViewDoublePtr->get_vertex_identifiers( + static_cast(identifiers)); + return louvain(handle, + *(graph_container.graph_ptr_union.GraphCSRViewDoublePtr), + reinterpret_cast(parts), + max_level, + static_cast(resolution)); + } + + // NON-LEGACY PATH + 
detail::louvain_functor functor{identifiers, parts, max_level, resolution}; + + return detail::call_function>( + handle, graph_container, functor); +} + +// Explicit instantiations +template std::pair call_louvain(raft::handle_t const& handle, + graph_container_t const& graph_container, + void* identifiers, + void* parts, + size_t max_level, + float resolution); + +template std::pair call_louvain(raft::handle_t const& handle, + graph_container_t const& graph_container, + void* identifiers, + void* parts, + size_t max_level, + double resolution); + +} // namespace cython +} // namespace cugraph diff --git a/cpp/src/utilities/spmv_1D.cu b/cpp/src/utilities/spmv_1D.cu index 4aec86919c9..8a7378e69d3 100644 --- a/cpp/src/utilities/spmv_1D.cu +++ b/cpp/src/utilities/spmv_1D.cu @@ -75,8 +75,10 @@ void MGcsrmv::run(weight_t *x) auto const &comm{handle_.get_comms()}; // local std::vector recvbuf(comm.get_size()); + std::vector displs(comm.get_size()); std::copy(local_vertices_, local_vertices_ + comm.get_size(), recvbuf.begin()); - comm.allgatherv(y_loc_.data().get(), x, recvbuf.data(), part_off_, stream); + std::copy(part_off_, part_off_ + comm.get_size(), displs.begin()); + comm.allgatherv(y_loc_.data().get(), x, recvbuf.data(), displs.data(), stream); } template class MGcsrmv; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eb10790f328..ac3a27c7b77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -46,7 +46,7 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE - gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${RMM_LIBRARY} ${CMAKE_EXTRA_LIBS} ${NCCL_LIBRARIES} cudart cuda cublas cusparse cusolver curand) + gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${CMAKE_EXTRA_LIBS} ${NCCL_LIBRARIES} cudart cuda cublas cusparse cusolver curand) if(OpenMP_CXX_FOUND) target_link_libraries(${CMAKE_TEST_NAME} PRIVATE @@ -59,7 +59,7 @@ function(ConfigureTest 
CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS) ### ... ### ### libgomp.so is included in the conda base environment and copied to every new conda -### environment. If a full file path is provided (e.g ${CUDF_LIBRARY} and ${RMM_LIBRARY}), cmake +### environment. If a full file path is provided (e.g ${CUDF_LIBRARY}), cmake ### extracts the directory path and adds the directory path to BUILD_RPATH (if BUILD_RPATH is not ### disabled). ### @@ -72,7 +72,7 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS) ### If a full path to libgomp.so is provided (which is the case with OpenMP::OpenMP_CXX), cmake ### checks whether there is any other libgomp.so with the different full path (after resolving ### soft links) in the search paths (implicit directoires + BUILD_RAPTH). There is one in the -### path included in BUILD_RPATH when ${CUDF_LIBRARY} and ${RMM_LIBRARY} are added; this one can +### path included in BUILD_RPATH when ${CUDF_LIBRARY} is added; this one can ### potentially hide the one in the provided full path and cmake generates a warning (and RPATH ### is searched before the directories in /etc/ld.so/conf; ld.so.conf does not coincide but ### overlaps with implicit directories). 
@@ -128,7 +128,7 @@ set(KATZ_TEST_SRC ConfigureTest(KATZ_TEST "${KATZ_TEST_SRC}" "") ################################################################################################### -# - betweenness centrality tests ------------------------------------------------------------------------- +# - betweenness centrality tests ------------------------------------------------------------------ set(BETWEENNESS_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" @@ -153,17 +153,19 @@ ConfigureTest(PAGERANK_TEST "${PAGERANK_TEST_SRC}" "") ################################################################################################### # - SSSP tests ------------------------------------------------------------------------------------ + set(SSSP_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/traversal/sssp_test.cu") + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/traversal/sssp_test.cu") ConfigureTest(SSSP_TEST "${SSSP_TEST_SRCS}" "") ################################################################################################### -# - BFS tests ------------------------------------------------------------------------------------ +# - BFS tests ------------------------------------------------------------------------------------- + set(BFS_TEST_SRCS - "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/traversal/bfs_test.cu") + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/traversal/bfs_test.cu") ConfigureTest(BFS_TEST "${BFS_TEST_SRCS}" "") @@ -172,7 +174,7 @@ ConfigureTest(BFS_TEST "${BFS_TEST_SRCS}" "") set(LOUVAIN_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_test.cpp") + "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_test.cu") ConfigureTest(LOUVAIN_TEST "${LOUVAIN_TEST_SRC}" "") @@ -194,7 +196,7 @@ set(ECG_TEST_SRC ConfigureTest(ECG_TEST "${ECG_TEST_SRC}" "") 
################################################################################################### -# - Balanced cut clustering tests --------------------------------------------------------------------------------- +# - Balanced cut clustering tests ----------------------------------------------------------------- set(BALANCED_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/community/balanced_edge_test.cpp") @@ -202,7 +204,7 @@ set(BALANCED_TEST_SRC ConfigureTest(BALANCED_TEST "${BALANCED_TEST_SRC}" "") ################################################################################################### -# - TRIANGLE tests --------------------------------------------------------------------------------- +# - TRIANGLE tests -------------------------------------------------------------------------------- set(TRIANGLE_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/community/triangle_test.cu") @@ -219,7 +221,7 @@ set(RENUMBERING_TEST_SRC ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}" "") ################################################################################################### -#-FORCE ATLAS 2 tests ------------------------------------------------------------------------------ +# - FORCE ATLAS 2 tests -------------------------------------------------------------------------- set(FA2_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" @@ -228,7 +230,7 @@ set(FA2_TEST_SRC ConfigureTest(FA2_TEST "${FA2_TEST_SRC}" "") ################################################################################################### -#-CONNECTED COMPONENTS tests --------------------------------------------------------------------- +# - CONNECTED COMPONENTS tests ------------------------------------------------------------------- set(CONNECT_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" @@ -237,7 +239,7 @@ set(CONNECT_TEST_SRC ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}" "") 
################################################################################################### -#-STRONGLY CONNECTED COMPONENTS tests --------------------------------------------------------------------- +# - STRONGLY CONNECTED COMPONENTS tests ---------------------------------------------------------- set(SCC_TEST_SRC "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" @@ -246,7 +248,7 @@ set(SCC_TEST_SRC ConfigureTest(SCC_TEST "${SCC_TEST_SRC}" "") ################################################################################################### -#-FIND_MATCHES tests --------------------------------------------------------------------- +# - FIND_MATCHES tests ---------------------------------------------------------------------------- set(FIND_MATCHES_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/db/find_matches_test.cu") @@ -262,6 +264,42 @@ set(EXPERIMENTAL_GRAPH_TEST_SRCS ConfigureTest(EXPERIMENTAL_GRAPH_TEST "${EXPERIMENTAL_GRAPH_TEST_SRCS}" "") +################################################################################################### +# - Experimental BFS tests ------------------------------------------------------------------------ + +set(EXPERIMENTAL_BFS_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/bfs_test.cpp") + +ConfigureTest(EXPERIMENTAL_BFS_TEST "${EXPERIMENTAL_BFS_TEST_SRCS}" "") + +################################################################################################### +# - Experimental SSSP tests ----------------------------------------------------------------------- + +set(EXPERIMENTAL_SSSP_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/sssp_test.cpp") + +ConfigureTest(EXPERIMENTAL_SSSP_TEST "${EXPERIMENTAL_SSSP_TEST_SRCS}" "") + +################################################################################################### +# - Experimental PAGERANK tests 
------------------------------------------------------------------- + +set(EXPERIMENTAL_PAGERANK_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/pagerank_test.cpp") + +ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}" "") + +################################################################################################### +# - Experimental KATZ_CENTRALITY tests ------------------------------------------------------------ + +set(EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/experimental/katz_centrality_test.cpp") + +ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}" "") + ################################################################################################### ### enable testing ################################################################################ ################################################################################################### diff --git a/cpp/tests/community/ecg_test.cu b/cpp/tests/community/ecg_test.cu index 6246a42021d..b20dd365ef2 100644 --- a/cpp/tests/community/ecg_test.cu +++ b/cpp/tests/community/ecg_test.cu @@ -45,7 +45,8 @@ TEST(ecg, success) cugraph::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - cugraph::ecg(graph_csr, .05, 16, result_v.data().get()); + raft::handle_t handle; + cugraph::ecg(handle, graph_csr, .05, 16, result_v.data().get()); cluster_id = result_v; int max = *max_element(cluster_id.begin(), cluster_id.end()); @@ -106,7 +107,8 @@ TEST(ecg, dolphin) cugraph::GraphCSRView graph_csr( offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - cugraph::ecg(graph_csr, .05, 16, result_v.data().get()); + raft::handle_t handle; + cugraph::ecg(handle, graph_csr, .05, 16, result_v.data().get()); 
cluster_id = result_v; int max = *max_element(cluster_id.begin(), cluster_id.end()); diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp index 1e8ba85249d..764ab8bf6cb 100644 --- a/cpp/tests/community/leiden_test.cpp +++ b/cpp/tests/community/leiden_test.cpp @@ -17,8 +17,6 @@ #include -#include - TEST(leiden_karate, success) { std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, @@ -57,9 +55,10 @@ TEST(leiden_karate, success) offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); float modularity{0.0}; - int num_level = 40; + size_t num_level = 40; - cugraph::leiden(G, modularity, num_level, result_v.data().get()); + raft::handle_t handle; + std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data().get()); cudaMemcpy((void*)&(cluster_id[0]), result_v.data().get(), diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp deleted file mode 100644 index 391af641b73..00000000000 --- a/cpp/tests/community/louvain_test.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ -#include - -#include -#include - -#include - -#include - -#include - -TEST(louvain, success) -{ - std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, - 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, - 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; - std::vector ind_h = { - 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, - 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, - 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, - 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, - 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, - 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, - 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; - std::vector w_h = { - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - - int num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - std::vector cluster_id(num_verts, -1); - - rmm::device_vector offsets_v(off_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector result_v(cluster_id); - - 
cugraph::GraphCSRView G( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - float modularity{0.0}; - int num_level = 40; - - cugraph::louvain(G, &modularity, &num_level, result_v.data().get()); - - cudaMemcpy((void*)&(cluster_id[0]), - result_v.data().get(), - sizeof(int) * num_verts, - cudaMemcpyDeviceToHost); - - int min = *min_element(cluster_id.begin(), cluster_id.end()); - - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); -} - -TEST(louvain_modularity, simple) -{ - std::vector off_h = {0, 1, 4, 7, 10, 11, 12}; - std::vector src_ind_h = {0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5}; - std::vector ind_h = {1, 0, 2, 3, 1, 3, 4, 1, 2, 5, 2, 3}; - std::vector w_h = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - std::vector v_weights_h = {1.0, 3.0, 3.0, 3.0, 1.0, 1.0}; - - // - // Initial cluster, everything on its own - // - std::vector cluster_h = {0, 1, 2, 3, 4, 5}; - std::vector cluster_weights_h = {1.0, 3.0, 3.0, 3.0, 1.0, 1.0}; - - std::vector cluster_hash_h = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - std::vector delta_Q_h = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - std::vector tmp_size_V_h = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; - - int num_verts = off_h.size() - 1; - int num_edges = ind_h.size(); - - float q{0.0}; - - rmm::device_vector offsets_v(off_h); - rmm::device_vector src_indices_v(src_ind_h); - rmm::device_vector indices_v(ind_h); - rmm::device_vector weights_v(w_h); - rmm::device_vector vertex_weights_v(v_weights_h); - rmm::device_vector cluster_v(cluster_h); - rmm::device_vector cluster_weights_v(cluster_weights_h); - rmm::device_vector cluster_hash_v(cluster_hash_h); - rmm::device_vector delta_Q_v(delta_Q_h); - rmm::device_vector tmp_size_V_v(tmp_size_V_h); - - cudaStream_t stream{0}; - - // - // Create graph - // - cugraph::GraphCSRView G( - offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); - - q = 
cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); - - ASSERT_FLOAT_EQ(q, float{-30.0 / 144.0}); - - cugraph::detail::compute_delta_modularity(float{12}, - float{1}, - G, - src_indices_v, - vertex_weights_v, - cluster_weights_v, - cluster_v, - cluster_hash_v, - delta_Q_v, - tmp_size_V_v); - - CUDA_TRY(cudaMemcpy(cluster_hash_h.data(), - cluster_hash_v.data().get(), - sizeof(int) * num_edges, - cudaMemcpyDeviceToHost)); - CUDA_TRY(cudaMemcpy( - delta_Q_h.data(), delta_Q_v.data().get(), sizeof(float) * num_edges, cudaMemcpyDeviceToHost)); - - ASSERT_EQ(cluster_hash_h[0], 1); - ASSERT_EQ(cluster_hash_h[10], 2); - ASSERT_EQ(cluster_hash_h[11], 3); - ASSERT_FLOAT_EQ(delta_Q_h[0], float{1.0 / 8.0}); - ASSERT_FLOAT_EQ(delta_Q_h[10], float{1.0 / 8.0}); - ASSERT_FLOAT_EQ(delta_Q_h[11], float{1.0 / 8.0}); - - // - // Move vertex 0 into cluster 1 - // - cluster_h[0] = 1; - cluster_weights_h[0] = 0.0; - cluster_weights_h[1] = 4.0; - - CUDA_TRY(cudaMemcpy( - cluster_v.data().get(), cluster_h.data(), sizeof(int) * num_verts, cudaMemcpyHostToDevice)); - CUDA_TRY(cudaMemcpy(cluster_weights_v.data().get(), - cluster_weights_h.data(), - sizeof(float) * num_verts, - cudaMemcpyHostToDevice)); - - q = cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); - - ASSERT_FLOAT_EQ(q, float{-12.0 / 144.0}); - - cugraph::detail::compute_delta_modularity(float{12}, - float{1}, - G, - src_indices_v, - vertex_weights_v, - cluster_weights_v, - cluster_v, - cluster_hash_v, - delta_Q_v, - tmp_size_V_v); - - CUDA_TRY(cudaMemcpy(cluster_hash_h.data(), - cluster_hash_v.data().get(), - sizeof(int) * num_edges, - cudaMemcpyDeviceToHost)); - CUDA_TRY(cudaMemcpy( - delta_Q_h.data(), delta_Q_v.data().get(), sizeof(float) * num_edges, cudaMemcpyDeviceToHost)); - - ASSERT_EQ(cluster_hash_h[10], 2); - ASSERT_EQ(cluster_hash_h[11], 3); - ASSERT_FLOAT_EQ(delta_Q_h[10], float{1.0 / 8.0}); - ASSERT_FLOAT_EQ(delta_Q_h[11], float{1.0 / 8.0}); - - // - // Move vertex 1 
into cluster 2. Not the optimal, in fact it will reduce - // modularity (so Louvain would never do this), but let's see if it reduces - // by the expected amount (-12/144). - // - ASSERT_EQ(cluster_hash_h[3], 2); - ASSERT_FLOAT_EQ(delta_Q_h[3], float{-12.0 / 144.0}); - - cluster_h[1] = 2; - cluster_weights_h[1] = 1.0; - cluster_weights_h[2] = 6.0; - - CUDA_TRY(cudaMemcpy( - cluster_v.data().get(), cluster_h.data(), sizeof(int) * num_verts, cudaMemcpyHostToDevice)); - CUDA_TRY(cudaMemcpy(cluster_weights_v.data().get(), - cluster_weights_h.data(), - sizeof(float) * num_verts, - cudaMemcpyHostToDevice)); - - q = cugraph::detail::modularity(float{12}, float{1}, G, cluster_v.data().get()); - - ASSERT_FLOAT_EQ(q, float{-24.0 / 144.0}); -} - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/louvain_test.cu b/cpp/tests/community/louvain_test.cu new file mode 100644 index 00000000000..20fa7b1d3d9 --- /dev/null +++ b/cpp/tests/community/louvain_test.cu @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ * + */ +#include + +#include +#include + +#include + +#include + +TEST(louvain, success) +{ + std::vector off_h = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, + 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, + 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}; + std::vector ind_h = { + 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, + 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, + 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, + 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, + 33, 25, 27, 29, 32, 33, 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, + 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, + 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32}; + std::vector w_h = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + int num_verts = off_h.size() - 1; + int num_edges = ind_h.size(); + + std::vector cluster_id(num_verts, -1); + + rmm::device_vector offsets_v(off_h); + rmm::device_vector indices_v(ind_h); + rmm::device_vector weights_v(w_h); + rmm::device_vector result_v(cluster_id); + + cugraph::GraphCSRView G( + 
offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges); + + float modularity{0.0}; + size_t num_level = 40; + + raft::handle_t handle; + + std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data().get()); + + cudaMemcpy((void*)&(cluster_id[0]), + result_v.data().get(), + sizeof(int) * num_verts, + cudaMemcpyDeviceToHost); + + int min = *min_element(cluster_id.begin(), cluster_id.end()); + + ASSERT_GE(min, 0); + ASSERT_GE(modularity, 0.402777 * 0.95); +} + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/bfs_test.cpp b/cpp/tests/experimental/bfs_test.cpp new file mode 100644 index 00000000000..2498ca4f3f5 --- /dev/null +++ b/cpp/tests/experimental/bfs_test.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governin_from_mtxg permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +template +void bfs_reference(edge_t* offsets, + vertex_t* indices, + vertex_t* distances, + vertex_t* predecessors, + vertex_t num_vertices, + vertex_t source, + vertex_t depth_limit = std::numeric_limits::max()) +{ + vertex_t depth{0}; + + std::fill(distances, distances + num_vertices, std::numeric_limits::max()); + std::fill(predecessors, predecessors + num_vertices, cugraph::invalid_vertex_id::value); + + *(distances + source) = depth; + std::vector cur_frontier_rows{source}; + std::vector new_frontier_rows{}; + + while (cur_frontier_rows.size() > 0) { + for (auto const row : cur_frontier_rows) { + auto nbr_offset_first = *(offsets + row); + auto nbr_offset_last = *(offsets + row + 1); + for (auto nbr_offset = nbr_offset_first; nbr_offset != nbr_offset_last; ++nbr_offset) { + auto nbr = *(indices + nbr_offset); + if (*(distances + nbr) == std::numeric_limits::max()) { + *(distances + nbr) = depth + 1; + *(predecessors + nbr) = row; + new_frontier_rows.push_back(nbr); + } + } + } + std::swap(cur_frontier_rows, new_frontier_rows); + new_frontier_rows.clear(); + ++depth; + if (depth >= depth_limit) { break; } + } + + return; +} + +typedef struct BFS_Usecase_t { + std::string graph_file_full_path{}; + size_t source{false}; + + BFS_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} BFS_Usecase; + +class Tests_BFS : public ::testing::TestWithParam { + public: + Tests_BFS() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(BFS_Usecase const& 
configuration) + { + using weight_t = float; + + raft::handle_t handle{}; + + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, false); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + graph_view.offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + graph_view.indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE(configuration.source >= 0 && + configuration.source <= graph_view.get_number_of_vertices()) + << "Starting sources should be >= 0 and" + << " less than the number of vertices in the graph."; + + std::vector h_reference_distances(graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); + + bfs_reference(h_offsets.data(), + h_indices.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + graph_view.get_number_of_vertices(), + static_cast(configuration.source), + std::numeric_limits::max()); + + rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), + handle.get_stream()); + rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::bfs(handle, + graph_view, + d_distances.begin(), + d_predecessors.begin(), + static_cast(configuration.source), + false, + std::numeric_limits::max(), + false); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + + raft::update_host( + 
h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE(std::equal( + h_reference_distances.begin(), h_reference_distances.end(), h_cugraph_distances.begin())) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability do not match with the reference."; + } else { + ASSERT_TRUE(h_reference_distances[*it] + 1 == h_reference_distances[i]) + << "distance to this vertex != distance to the predecessor vertex + 1."; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + found = true; + break; + } + } + ASSERT_TRUE(found) << "no edge from the predecessor vertex to this vertex."; + } + } + } +}; + +// FIXME: add tests for type combinations +TEST_P(Tests_BFS, CheckInt32Int32) { run_current_test(GetParam()); } + +INSTANTIATE_TEST_CASE_P(simple_test, + Tests_BFS, + ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), + BFS_Usecase("test/datasets/polbooks.mtx", 0), + BFS_Usecase("test/datasets/netscience.mtx", 0), + BFS_Usecase("test/datasets/netscience.mtx", 100), + BFS_Usecase("test/datasets/wiki2003.mtx", 1000), + BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp new file mode 100644 index 00000000000..0352637dcf0 --- /dev/null +++ b/cpp/tests/experimental/katz_centrality_test.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governin_from_mtxg permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +template +void katz_centrality_reference(edge_t* offsets, + vertex_t* indices, + weight_t* weights, + result_t* betas, + result_t* katz_centralities, + vertex_t num_vertices, + result_t alpha, + result_t beta, // relevant only if betas == nullptr + result_t epsilon, + size_t max_iterations, + bool has_initial_guess, + bool normalize) +{ + if (num_vertices == 0) { return; } + + if (!has_initial_guess) { + std::fill(katz_centralities, katz_centralities + num_vertices, result_t{0.0}); + } + + std::vector old_katz_centralities(num_vertices, result_t{0.0}); + size_t iter{0}; + while (true) { + std::copy(katz_centralities, katz_centralities + num_vertices, old_katz_centralities.begin()); + for (vertex_t i = 0; i < num_vertices; ++i) { + katz_centralities[i] = betas != nullptr ? betas[i] : beta; + for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { + auto nbr = indices[j]; + auto w = weights != nullptr ? 
weights[j] : result_t{1.0}; + katz_centralities[i] += alpha * old_katz_centralities[nbr] * w; + } + } + + result_t diff_sum{0.0}; + for (vertex_t i = 0; i < num_vertices; ++i) { + diff_sum += fabs(katz_centralities[i] - old_katz_centralities[i]); + } + if (diff_sum < static_cast(num_vertices) * epsilon) { break; } + iter++; + ASSERT_TRUE(iter < max_iterations); + } + + if (normalize) { + auto l2_norm = std::sqrt(std::inner_product( + katz_centralities, katz_centralities + num_vertices, katz_centralities, result_t{0.0})); + std::transform( + katz_centralities, katz_centralities + num_vertices, katz_centralities, [l2_norm](auto& val) { + return val / l2_norm; + }); + } + + return; +} + +typedef struct KatzCentrality_Usecase_t { + std::string graph_file_full_path{}; + bool test_weighted{false}; + + KatzCentrality_Usecase_t(std::string const& graph_file_path, bool test_weighted) + : test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} KatzCentrality_Usecase; + +class Tests_KatzCentrality : public ::testing::TestWithParam { + public: + Tests_KatzCentrality() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(KatzCentrality_Usecase const& configuration) + { + raft::handle_t handle{}; + + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + graph_view.offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + 
raft::update_host(h_indices.data(), + graph_view.indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + if (graph_view.is_weighted()) { + h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + graph_view.weights(), + graph_view.get_number_of_edges(), + handle.get_stream()); + } + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::vector h_reference_katz_centralities(graph_view.get_number_of_vertices()); + + std::vector tmps(h_offsets.size()); + std::adjacent_difference(h_offsets.begin(), h_offsets.end(), tmps.begin()); + auto max_it = std::max_element(tmps.begin(), tmps.end()); + + result_t const alpha = result_t{1.0} / static_cast(*max_it + 1); + result_t constexpr beta{1.0}; + result_t constexpr epsilon{1e-6}; + + katz_centrality_reference( + h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? h_weights.data() : static_cast(nullptr), + static_cast(nullptr), + h_reference_katz_centralities.data(), + graph_view.get_number_of_vertices(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + false); + + rmm::device_uvector d_katz_centralities(graph_view.get_number_of_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::katz_centrality(handle, + graph_view, + static_cast(nullptr), + d_katz_centralities.begin(), + alpha, + beta, + epsilon, + std::numeric_limits::max(), + false, + false, + false); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_katz_centralities(graph_view.get_number_of_vertices()); + + raft::update_host(h_cugraph_katz_centralities.data(), + d_katz_centralities.data(), + d_katz_centralities.size(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + 
ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(), + h_reference_katz_centralities.end(), + h_cugraph_katz_centralities.begin(), + nearly_equal)) + << "Katz centrality values do not match with the reference values."; + } +}; + +// FIXME: add tests for type combinations +TEST_P(Tests_KatzCentrality, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_KatzCentrality, + ::testing::Values(KatzCentrality_Usecase("test/datasets/karate.mtx", false), + KatzCentrality_Usecase("test/datasets/karate.mtx", true), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", false), + KatzCentrality_Usecase("test/datasets/web-Google.mtx", true), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", false), + KatzCentrality_Usecase("test/datasets/ljournal-2008.mtx", true), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", false), + KatzCentrality_Usecase("test/datasets/webbase-1M.mtx", true))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp new file mode 100644 index 00000000000..3fe74e279ff --- /dev/null +++ b/cpp/tests/experimental/pagerank_test.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governin_from_mtxg permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +template +void pagerank_reference(edge_t* offsets, + vertex_t* indices, + weight_t* weights, + vertex_t* personalization_vertices, + result_t* personalization_values, + result_t* pageranks, + vertex_t num_vertices, + vertex_t personalization_vector_size, + result_t alpha, + result_t epsilon, + size_t max_iterations, + bool has_initial_guess) +{ + if (num_vertices == 0) { return; } + + if (has_initial_guess) { + auto sum = std::accumulate(pageranks, pageranks + num_vertices, result_t{0.0}); + ASSERT_TRUE(sum > 0.0); + std::for_each(pageranks, pageranks + num_vertices, [sum](auto& val) { val /= sum; }); + } else { + std::for_each(pageranks, pageranks + num_vertices, [num_vertices](auto& val) { + val = result_t{1.0} / static_cast(num_vertices); + }); + } + + if (personalization_vertices != nullptr) { + auto sum = std::accumulate( + personalization_values, personalization_values + personalization_vector_size, result_t{0.0}); + ASSERT_TRUE(sum > 0.0); + std::for_each(personalization_values, + personalization_values + personalization_vector_size, + [sum](auto& val) { val /= sum; }); + } + + std::vector out_weight_sums(num_vertices, result_t{0.0}); + for (vertex_t i = 0; i < num_vertices; ++i) { + for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { + auto nbr = indices[j]; + auto w = weights != nullptr ? 
weights[j] : 1.0; + out_weight_sums[nbr] += w; + } + } + + std::vector old_pageranks(num_vertices, result_t{0.0}); + size_t iter{0}; + while (true) { + std::copy(pageranks, pageranks + num_vertices, old_pageranks.begin()); + result_t dangling_sum{0.0}; + for (vertex_t i = 0; i < num_vertices; ++i) { + if (out_weight_sums[i] == result_t{0.0}) { dangling_sum += old_pageranks[i]; } + } + for (vertex_t i = 0; i < num_vertices; ++i) { + pageranks[i] = result_t{0.0}; + for (auto j = *(offsets + i); j < *(offsets + i + 1); ++j) { + auto nbr = indices[j]; + auto w = weights != nullptr ? weights[j] : result_t{1.0}; + pageranks[i] += alpha * old_pageranks[nbr] * (w / out_weight_sums[nbr]); + } + if (personalization_vertices == nullptr) { + pageranks[i] += (dangling_sum + (1.0 - alpha)) / static_cast(num_vertices); + } + } + if (personalization_vertices != nullptr) { + for (vertex_t i = 0; i < personalization_vector_size; ++i) { + auto v = personalization_vertices[i]; + pageranks[v] += (dangling_sum + (1.0 - alpha)) * personalization_values[i]; + } + } + result_t diff_sum{0.0}; + for (vertex_t i = 0; i < num_vertices; ++i) { + diff_sum += fabs(pageranks[i] - old_pageranks[i]); + } + if (diff_sum < static_cast(num_vertices) * epsilon) { break; } + iter++; + ASSERT_TRUE(iter < max_iterations); + } + + return; +} + +typedef struct PageRank_Usecase_t { + std::string graph_file_full_path{}; + bool test_weighted{false}; + + PageRank_Usecase_t(std::string const& graph_file_path, bool test_weighted) + : test_weighted(test_weighted) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} PageRank_Usecase; + +class Tests_PageRank : public ::testing::TestWithParam { + public: + Tests_PageRank() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void 
TearDown() {} + + template + void run_current_test(PageRank_Usecase const& configuration) + { + raft::handle_t handle{}; + + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, configuration.test_weighted); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + std::vector h_weights{}; + raft::update_host(h_offsets.data(), + graph_view.offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + graph_view.indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + if (graph_view.is_weighted()) { + h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0}); + raft::update_host(h_weights.data(), + graph_view.weights(), + graph_view.get_number_of_edges(), + handle.get_stream()); + } + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + std::vector h_reference_pageranks(graph_view.get_number_of_vertices()); + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + + pagerank_reference(h_offsets.data(), + h_indices.data(), + h_weights.size() > 0 ? 
h_weights.data() : static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + h_reference_pageranks.data(), + graph_view.get_number_of_vertices(), + vertex_t{0}, + alpha, + epsilon, + std::numeric_limits::max(), + false); + + rmm::device_uvector d_pageranks(graph_view.get_number_of_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::pagerank(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + vertex_t{0}, + d_pageranks.begin(), + alpha, + epsilon, + std::numeric_limits::max(), + false, + false); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_pageranks(graph_view.get_number_of_vertices()); + + raft::update_host( + h_cugraph_pageranks.data(), d_pageranks.data(), d_pageranks.size(), handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_pageranks.begin(), + h_reference_pageranks.end(), + h_cugraph_pageranks.begin(), + nearly_equal)) + << "PageRank values do not match with the reference values."; + } +}; + +// FIXME: add tests for type combinations +TEST_P(Tests_PageRank, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(simple_test, + Tests_PageRank, + ::testing::Values(PageRank_Usecase("test/datasets/karate.mtx", false), + PageRank_Usecase("test/datasets/karate.mtx", true), + PageRank_Usecase("test/datasets/web-Google.mtx", false), + PageRank_Usecase("test/datasets/web-Google.mtx", true), + PageRank_Usecase("test/datasets/ljournal-2008.mtx", + false), + PageRank_Usecase("test/datasets/ljournal-2008.mtx", true), + PageRank_Usecase("test/datasets/webbase-1M.mtx", false), + PageRank_Usecase("test/datasets/webbase-1M.mtx", true))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff 
--git a/cpp/tests/experimental/sssp_test.cpp b/cpp/tests/experimental/sssp_test.cpp new file mode 100644 index 00000000000..49eaca56f56 --- /dev/null +++ b/cpp/tests/experimental/sssp_test.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governin_from_mtxg permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +// Dijkstra's algorithm +template +void sssp_reference(edge_t* offsets, + vertex_t* indices, + weight_t* weights, + weight_t* distances, + vertex_t* predecessors, + vertex_t num_vertices, + vertex_t source, + weight_t cutoff = std::numeric_limits::max()) +{ + using queue_iterm_t = std::tuple; + + std::fill(distances, distances + num_vertices, std::numeric_limits::max()); + std::fill(predecessors, predecessors + num_vertices, cugraph::invalid_vertex_id::value); + + *(distances + source) = weight_t{0.0}; + std::priority_queue, std::greater> + queue{}; + queue.push(std::make_tuple(weight_t{0.0}, source)); + + while (queue.size() > 0) { + weight_t distance{}; + vertex_t row{}; + std::tie(distance, row) = queue.top(); + queue.pop(); + if (distance > *(distances + row)) { continue; } + auto nbr_offsets = *(offsets + row); + auto nbr_offset_last = *(offsets + row + 1); + for (auto nbr_offset = nbr_offsets; nbr_offset != nbr_offset_last; ++nbr_offset) { + auto nbr = *(indices + nbr_offset); 
+ auto new_distance = distance + *(weights + nbr_offset); + auto threshold = std::min(*(distances + nbr), cutoff); + if (new_distance < threshold) { + *(distances + nbr) = new_distance; + *(predecessors + nbr) = row; + queue.push(std::make_tuple(new_distance, nbr)); + } + } + } + + return; +} + +typedef struct SSSP_Usecase_t { + std::string graph_file_full_path{}; + size_t source{false}; + + SSSP_Usecase_t(std::string const& graph_file_path, size_t source) : source(source) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +} SSSP_Usecase; + +class Tests_SSSP : public ::testing::TestWithParam { + public: + Tests_SSSP() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(SSSP_Usecase const& configuration) + { + raft::handle_t handle{}; + + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path, true); + auto graph_view = graph.view(); + + std::vector h_offsets(graph_view.get_number_of_vertices() + 1); + std::vector h_indices(graph_view.get_number_of_edges()); + std::vector h_weights(graph_view.get_number_of_edges()); + raft::update_host(h_offsets.data(), + graph_view.offsets(), + graph_view.get_number_of_vertices() + 1, + handle.get_stream()); + raft::update_host(h_indices.data(), + graph_view.indices(), + graph_view.get_number_of_edges(), + handle.get_stream()); + raft::update_host(h_weights.data(), + graph_view.weights(), + graph_view.get_number_of_edges(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + ASSERT_TRUE(configuration.source >= 0 && + configuration.source <= graph_view.get_number_of_vertices()) + << "Starting sources should be >= 0 and" + << " less than the number of 
vertices in the graph."; + + std::vector h_reference_distances(graph_view.get_number_of_vertices()); + std::vector h_reference_predecessors(graph_view.get_number_of_vertices()); + + sssp_reference(h_offsets.data(), + h_indices.data(), + h_weights.data(), + h_reference_distances.data(), + h_reference_predecessors.data(), + graph_view.get_number_of_vertices(), + static_cast(configuration.source)); + + rmm::device_uvector d_distances(graph_view.get_number_of_vertices(), + handle.get_stream()); + rmm::device_uvector d_predecessors(graph_view.get_number_of_vertices(), + handle.get_stream()); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + cugraph::experimental::sssp(handle, + graph_view, + d_distances.begin(), + d_predecessors.begin(), + static_cast(configuration.source), + std::numeric_limits::max(), + false); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + std::vector h_cugraph_distances(graph_view.get_number_of_vertices()); + std::vector h_cugraph_predecessors(graph_view.get_number_of_vertices()); + + raft::update_host( + h_cugraph_distances.data(), d_distances.data(), d_distances.size(), handle.get_stream()); + raft::update_host(h_cugraph_predecessors.data(), + d_predecessors.data(), + d_predecessors.size(), + handle.get_stream()); + CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); + + auto max_weight_element = std::max_element(h_weights.begin(), h_weights.end()); + auto epsilon = *max_weight_element * weight_t{1e-6}; + auto nearly_equal = [epsilon](auto lhs, auto rhs) { return std::fabs(lhs - rhs) < epsilon; }; + + ASSERT_TRUE(std::equal(h_reference_distances.begin(), + h_reference_distances.end(), + h_cugraph_distances.begin(), + nearly_equal)) + << "distances do not match with the reference values."; + + for (auto it = h_cugraph_predecessors.begin(); it != h_cugraph_predecessors.end(); ++it) { + auto i = std::distance(h_cugraph_predecessors.begin(), it); + if (*it == 
cugraph::invalid_vertex_id::value) { + ASSERT_TRUE(h_reference_predecessors[i] == *it) + << "vertex reachability do not match with the reference."; + } else { + auto pred_distance = h_reference_distances[*it]; + bool found{false}; + for (auto j = h_offsets[*it]; j < h_offsets[*it + 1]; ++j) { + if (h_indices[j] == i) { + if (nearly_equal(pred_distance + h_weights[j], h_reference_distances[i])) { + found = true; + break; + } + } + } + ASSERT_TRUE(found) + << "no edge from the predecessor vertex to this vertex with the matching weight."; + } + } + } +}; + +// FIXME: add tests for type combinations +TEST_P(Tests_SSSP, CheckInt32Int32Float) { run_current_test(GetParam()); } + +#if 0 +INSTANTIATE_TEST_CASE_P(simple_test, + Tests_SSSP, + ::testing::Values(SSSP_Usecase("test/datasets/karate.mtx", 0))); +#else +INSTANTIATE_TEST_CASE_P(simple_test, + Tests_SSSP, + ::testing::Values(SSSP_Usecase("test/datasets/karate.mtx", 0), + SSSP_Usecase("test/datasets/dblp.mtx", 0), + SSSP_Usecase("test/datasets/wiki2003.mtx", 1000))); +#endif + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 65703e9541d..c87c63c56fb 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -15,9 +15,9 @@ */ #pragma once -#include - +#include #include +#include #include @@ -25,6 +25,9 @@ extern "C" { #include "mmio.h" } +#include + +#include #include #include #include @@ -376,5 +379,44 @@ edgelist_from_market_matrix_file_t read_edgelist_from_matrix return std::move(ret); } +template +cugraph::experimental::graph_t +read_graph_from_matrix_market_file(raft::handle_t const& handle, + std::string const& graph_file_full_path, + bool test_weighted) +{ + auto mm_graph = + read_edgelist_from_matrix_market_file(graph_file_full_path); + edge_t number_of_edges = static_cast(mm_graph.h_rows.size()); + + rmm::device_uvector d_edgelist_rows(number_of_edges, handle.get_stream()); + 
rmm::device_uvector d_edgelist_cols(number_of_edges, handle.get_stream()); + rmm::device_uvector d_edgelist_weights(test_weighted ? number_of_edges : 0, + handle.get_stream()); + + raft::update_device( + d_edgelist_rows.data(), mm_graph.h_rows.data(), number_of_edges, handle.get_stream()); + raft::update_device( + d_edgelist_cols.data(), mm_graph.h_cols.data(), number_of_edges, handle.get_stream()); + if (test_weighted) { + raft::update_device( + d_edgelist_weights.data(), mm_graph.h_weights.data(), number_of_edges, handle.get_stream()); + } + + cugraph::experimental::edgelist_t edgelist{ + d_edgelist_rows.data(), + d_edgelist_cols.data(), + test_weighted ? d_edgelist_weights.data() : nullptr, + number_of_edges}; + + return cugraph::experimental::graph_t( + handle, + edgelist, + mm_graph.number_of_vertices, + cugraph::experimental::graph_properties_t{mm_graph.is_symmetric, false}, + false, + true); +} + } // namespace test } // namespace cugraph diff --git a/docs/source/api.rst b/docs/source/api.rst index b194aa0e03c..d334b488d72 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -40,6 +40,13 @@ Betweenness Centrality :members: :undoc-members: +Edge Betweenness Centrality +--------------------------- + +.. automodule:: cugraph.centrality.edge_betweenness_centrality + :members: + :undoc-members: + Katz Centrality --------------- diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb new file mode 100644 index 00000000000..6f76868f9a4 --- /dev/null +++ b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmarking NetworkX compatibility\n", + "This notebook benchmark the use of a NetworkX Graph object as input into algorithms.

\n", + "The intention of the feature is to be able to drop cuGraph into existing NetworkX code in spot where performance is not optimal.\n", + "\n", + "\n", + "### Betweenness Centrality\n", + "Both NetworkX and cuGraph allow for estimating the betweenness centrality score by using a subset of vertices rather than all the vertices. WHile that does produce a less accurate answer, it dramatically improves performance when the sample is small. For this test, the algorithms will use only 10% of the vertices to compute the estimate \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "__Notebook Credits__\n", + "\n", + "* Original Authors: Bradley Rees\n", + "* Last Edit: 09/27/2020\n", + "\n", + "RAPIDS Versions: 0.16\n", + "\n", + "Test Hardware\n", + "```\n", + " GV100 32G, CUDA 10,0\n", + " Intel(R) Core(TM) CPU i7-7800X @ 3.50GHz\n", + " 32GB system memory\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import networkx as nx\n", + "import cugraph as cnx\n", + "import time\n", + "import operator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# starting number of Nodes\n", + "N = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# average degree\n", + "M = 16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_nx(G, k=None):\n", + " t1 = time.time()\n", + " bc = nx.betweenness_centrality(G, k)\n", + " t2 = time.time() - t1\n", + " return t2, bc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_cu(G, k=None):\n", + " t1 = time.time()\n", + " bc = cnx.betweenness_centrality(G, k)\n", + " t2 = time.time() - t1\n", + " return t2, bc" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Betweenness Cenytrality - use all nodes - limit scale to 3,200 nodes so that executing time is not days\")\n", + "print(f\"Node \\tEdges \\tSpeedup \\t\\tcreate time \\t\\tnx time \\t\\tcu time \")\n", + "\n", + "for x in range(6):\n", + " if x == 0:\n", + " n = N\n", + " else:\n", + " n = n * 2\n", + "\n", + " \n", + " t1 = time.time() \n", + " # create a random graph\n", + " G = nx.barabasi_albert_graph(n, M)\n", + " g_time = time.time() - t1\n", + " \n", + " num_edges = G.number_of_edges()\n", + " num_nodes = G.number_of_nodes()\n", + " \n", + " time_nx, bc = run_nx(G)\n", + " time_cu, bcc = run_cu(G)\n", + "\n", + " speedup = time_nx / time_cu\n", + " print(f\"{num_nodes}\\t{num_edges}\\t{speedup}\\t{g_time}\\t{time_nx}\\t{time_cu}\")\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"PageRank - run larger graph since algorithm is fast\")\n", + "print(f\"Node \\tEdges \\tSpeedup \\t\\tnx time \\t\\tcu time \")\n", + "\n", + "pr_speedup = []\n", + "\n", + "for x in range(15):\n", + " if x == 0:\n", + " n = N\n", + " else:\n", + " n = n * 2\n", + "\n", + " # create a random graph\n", + " G = nx.barabasi_albert_graph(n, M)\n", + " num_edges = G.number_of_edges()\n", + " num_nodes = G.number_of_nodes()\n", + " \n", + " t1 = time.time() \n", + " nx_pr = nx.pagerank(G)\n", + " time_nx = time.time() - t1\n", + " \n", + " t1 = time.time() \n", + " cp_pr = cnx.pagerank(G)\n", + " time_cu = time.time() - t1\n", + "\n", + " speedup = time_nx / time_cu\n", + " print(f\"{num_nodes}\\t{num_edges} \\t{speedup}\\t{time_nx}\\t{time_cu}\")\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___\n", + "Copyright (c) 2020, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n", + "___" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_dev", + "language": "python", + "name": "cugraph_dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index 6f40641eddc..f8984f25978 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -14,6 +14,7 @@ from cugraph.community import ( ecg, ktruss_subgraph, + k_truss, louvain, leiden, spectralBalancedCutClustering, @@ -32,6 +33,7 @@ hypergraph, symmetrize, symmetrize_df, + symmetrize_ddf, ) from cugraph.centrality import ( @@ -41,14 +43,30 @@ ) from cugraph.cores import core_number, k_core + from cugraph.components import ( weakly_connected_components, strongly_connected_components, ) + from cugraph.link_analysis import pagerank, hits -from cugraph.link_prediction import jaccard, overlap, jaccard_w, overlap_w -from cugraph.traversal import bfs, sssp, filter_unreachable +from cugraph.link_prediction import ( + jaccard, + jaccard_coefficient, + overlap, + overlap_coefficient, + jaccard_w, + overlap_w, +) + +from cugraph.traversal import ( + bfs, + bfs_edges, + sssp, + shortest_path, + filter_unreachable, +) from cugraph.utilities import 
utils diff --git a/python/cugraph/centrality/betweenness_centrality.pxd b/python/cugraph/centrality/betweenness_centrality.pxd index 0c17a17ad5a..829d7be37d9 100644 --- a/python/cugraph/centrality/betweenness_centrality.pxd +++ b/python/cugraph/centrality/betweenness_centrality.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool cdef extern from "algorithms.hpp" namespace "cugraph": diff --git a/python/cugraph/centrality/betweenness_centrality.py b/python/cugraph/centrality/betweenness_centrality.py index 92bc5a7b3e0..634cc2aa7a2 100644 --- a/python/cugraph/centrality/betweenness_centrality.py +++ b/python/cugraph/centrality/betweenness_centrality.py @@ -16,6 +16,8 @@ import cudf from cugraph.centrality import betweenness_centrality_wrapper from cugraph.centrality import edge_betweenness_centrality_wrapper +from cugraph.utilities import df_edge_score_to_dictionary +from cugraph.utilities import df_score_to_dictionary import cugraph @@ -30,16 +32,20 @@ def betweenness_centrality( result_dtype=np.float64, ): """ - Compute the betweenness centrality for all nodes of the graph G from a - sample of 'k' sources. + Compute the betweenness centrality for all vertices of the graph G. + Betweenness centrality is a measure of the number of shortest paths that + pass through a vertex. A vertex with a high betweenness centrality score + has more paths passing through it and is therefore believed to be more + important. Rather than doing an all-pair shortest path, a sample of k + starting vertices can be used. + CuGraph does not currently support the 'endpoints' and 'weight' parameters as seen in the corresponding networkX call. Parameters ---------- - G : cuGraph.Graph - cuGraph graph descriptor with connectivity information. The graph can - be either directed (DiGraph) or undirected (Graph). 
+ G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). Weights in the graph are ignored, the current implementation uses BFS traversals. Use weight parameter if weights need to be considered (currently not supported) @@ -86,11 +92,11 @@ def betweenness_centrality( Returns ------- - df : cudf.DataFrame + df : cudf.DataFrame or Dictionary if using NetworkX GPU data frame containing two cudf.Series of size V: the vertex identifiers and the corresponding betweenness centrality values. Please note that the resulting the 'vertex' column might not be - in ascending order. + in ascending order. The Dictionary conatains the same two columns df['vertex'] : cudf.Series Contains the vertex identifiers @@ -99,10 +105,10 @@ def betweenness_centrality( Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') >>> bc = cugraph.betweenness_centrality(G) """ # vertices is intended to be a cuDF series that contains a sampling of @@ -111,8 +117,6 @@ def betweenness_centrality( # NOTE: cuDF doesn't currently support sampling, but there is a python # workaround. 
- vertices = _initialize_vertices(G, k, seed) - if weight is not None: raise NotImplementedError( "weighted implementation of betweenness " @@ -122,30 +126,42 @@ def betweenness_centrality( if result_dtype not in [np.float32, np.float64]: raise TypeError("result type can only be np.float32 or np.float64") + G, isNx = cugraph.utilities.check_nx_graph(G) + + vertices = _initialize_vertices(G, k, seed) + df = betweenness_centrality_wrapper.betweenness_centrality( G, normalized, endpoints, weight, vertices, result_dtype ) if G.renumbered: - return G.unrenumber(df, "vertex") + df = G.unrenumber(df, "vertex") - return df + if isNx is True: + dict = df_score_to_dictionary(df, 'betweenness_centrality') + return dict + else: + return df def edge_betweenness_centrality( G, k=None, normalized=True, weight=None, seed=None, result_dtype=np.float64 ): """ - Compute the edge betweenness centrality for all edges of the graph G from a - sample of 'k' sources. + Compute the edge betweenness centrality for all edges of the graph G. + Betweenness centrality is a measure of the number of shortest paths + that pass over an edge. An edge with a high betweenness centrality + score has more paths passing over it and is therefore believed to be + more important. Rather than doing an all-pair shortest path, a sample + of k starting vertices can be used. + CuGraph does not currently support the 'weight' parameter as seen in the corresponding networkX call. Parameters ---------- - G : cuGraph.Graph - cuGraph graph descriptor with connectivity information. The graph can - be either directed (DiGraph) or undirected (Graph). + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). Weights in the graph are ignored, the current implementation uses BFS traversals. 
Use weight parameter if weights need to be considered (currently not supported) @@ -187,7 +203,7 @@ def edge_betweenness_centrality( Returns ------- - df : cudf.DataFrame + df : cudf.DataFrame or Dictionary if using NetworkX GPU data frame containing three cudf.Series of size E: the vertex identifiers of the sources, the vertex identifies of the destinations and the corresponding betweenness centrality values. @@ -211,14 +227,13 @@ def edge_betweenness_centrality( Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') >>> ebc = cugraph.edge_betweenness_centrality(G) """ - vertices = _initialize_vertices(G, k, seed) if weight is not None: raise NotImplementedError( "weighted implementation of betweenness " @@ -227,6 +242,9 @@ def edge_betweenness_centrality( if result_dtype not in [np.float32, np.float64]: raise TypeError("result type can only be np.float32 or np.float64") + G, isNx = cugraph.utilities.check_nx_graph(G) + vertices = _initialize_vertices(G, k, seed) + df = edge_betweenness_centrality_wrapper.edge_betweenness_centrality( G, normalized, weight, vertices, result_dtype ) @@ -240,7 +258,10 @@ def edge_betweenness_centrality( df[["src", "dst"]][lower_triangle] = df[["dst", "src"]][lower_triangle] df = df.groupby(by=["src", "dst"]).sum().reset_index() - return df + if isNx is True: + return df_edge_score_to_dictionary(df, 'betweenness_centrality') + else: + return df # In order to compare with pre-set sources, diff --git a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx index a20a58b844b..bb0e88a79ba 100644 --- a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx +++ 
b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx @@ -19,7 +19,7 @@ from cugraph.centrality.betweenness_centrality cimport betweenness_centrality as c_betweenness_centrality from cugraph.centrality.betweenness_centrality cimport handle_t from cugraph.structure.graph import DiGraph -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool import cudf diff --git a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx index 9a5a022f640..cdc8a1c61a2 100644 --- a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx +++ b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx @@ -17,9 +17,9 @@ # cython: language_level = 3 from cugraph.centrality.betweenness_centrality cimport edge_betweenness_centrality as c_edge_betweenness_centrality -from cugraph.structure import graph_new_wrapper +from cugraph.structure import graph_primtypes_wrapper from cugraph.structure.graph import DiGraph, Graph -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool import cudf diff --git a/python/cugraph/centrality/katz_centrality.pxd b/python/cugraph/centrality/katz_centrality.pxd index a8496a2f508..53867f48ac6 100644 --- a/python/cugraph/centrality/katz_centrality.pxd +++ b/python/cugraph/centrality/katz_centrality.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool cdef extern from "algorithms.hpp" namespace "cugraph": diff --git a/python/cugraph/centrality/katz_centrality.py b/python/cugraph/centrality/katz_centrality.py index d57682c726c..3e2680a196f 100644 --- a/python/cugraph/centrality/katz_centrality.py +++ 
b/python/cugraph/centrality/katz_centrality.py @@ -12,10 +12,12 @@ # limitations under the License. from cugraph.centrality import katz_centrality_wrapper +import cugraph def katz_centrality( - G, alpha=None, max_iter=100, tol=1.0e-6, nstart=None, normalized=True + G, alpha=None, beta=None, max_iter=100, tol=1.0e-6, + nstart=None, normalized=True ): """ Compute the Katz centrality for the nodes of the graph G. cuGraph does not @@ -30,7 +32,7 @@ def katz_centrality( Parameters ---------- - G : cuGraph.Graph + G : cuGraph.Graph or networkx.Graph cuGraph graph descriptor with connectivity information. The graph can contain either directed (DiGraph) or undirected edges (Graph). alpha : float @@ -45,6 +47,8 @@ def katz_centrality( (1/degree_max). Therefore, setting alpha to (1/degree_max) will guarantee that it will never exceed alpha_max thus in turn fulfilling the requirement for convergence. + beta : None + A weight scalar - currently Not Supported max_iter : int The maximum number of iterations before an answer is returned. This can be used to limit the execution time and do an early exit before the @@ -72,7 +76,7 @@ def katz_centrality( Returns ------- - df : cudf.DataFrame + df : cudf.DataFrame or Dictionary if using NetworkX GPU data frame containing two cudf.Series of size V: the vertex identifiers and the corresponding katz centrality values. 
@@ -90,6 +94,14 @@ def katz_centrality( >>> kc = cugraph.katz_centrality(G) """ + if beta is not None: + raise NotImplementedError( + "The beta argument is " + "currently not supported" + ) + + G, isNx = cugraph.utilities.check_nx_graph(G) + if nstart is not None: if G.renumbered is True: nstart = G.add_internal_vertex_id(nstart, 'vertex', 'vertex') @@ -101,4 +113,8 @@ def katz_centrality( if G.renumbered: df = G.unrenumber(df, "vertex") - return df + if isNx is True: + dict = cugraph.utilities.df_score_to_dictionary(df, 'katz_centrality') + return dict + else: + return df diff --git a/python/cugraph/centrality/katz_centrality_wrapper.pyx b/python/cugraph/centrality/katz_centrality_wrapper.pyx index 01b942991a5..926ed0452e0 100644 --- a/python/cugraph/centrality/katz_centrality_wrapper.pyx +++ b/python/cugraph/centrality/katz_centrality_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.centrality.katz_centrality cimport katz_centrality as c_katz_centrality -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t @@ -37,7 +37,7 @@ def get_output_df(input_graph, nstart): if len(nstart) != num_verts: raise ValueError('nstart must have initial guess for all vertices') - nstart['values'] = graph_new_wrapper.datatype_cast([nstart['values']], [np.float64]) + nstart['values'] = graph_primtypes_wrapper.datatype_cast([nstart['values']], [np.float64]) df['katz_centrality'][nstart['vertex']] = nstart['values'] return df diff --git a/python/cugraph/community/__init__.py b/python/cugraph/community/__init__.py index 31e6f097a7a..d3bb6472894 100644 --- a/python/cugraph/community/__init__.py +++ b/python/cugraph/community/__init__.py @@ -24,3 +24,4 @@ from cugraph.community.subgraph_extraction import subgraph from cugraph.community.triangle_count import triangles 
from cugraph.community.ktruss_subgraph import ktruss_subgraph +from cugraph.community.ktruss_subgraph import k_truss diff --git a/python/cugraph/community/ecg.pxd b/python/cugraph/community/ecg.pxd index 33af448754b..9f1dc269b6f 100644 --- a/python/cugraph/community/ecg.pxd +++ b/python/cugraph/community/ecg.pxd @@ -16,12 +16,13 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": cdef void ecg[VT,ET,WT]( + const handle_t &handle, const GraphCSRView[VT,ET,WT] &graph, WT min_weight, VT ensemble_size, diff --git a/python/cugraph/community/ecg.py b/python/cugraph/community/ecg.py index 85d97b50a8e..2e9da6bd2e5 100644 --- a/python/cugraph/community/ecg.py +++ b/python/cugraph/community/ecg.py @@ -12,9 +12,11 @@ # limitations under the License. from cugraph.community import ecg_wrapper +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary -def ecg(input_graph, min_weight=0.05, ensemble_size=16): +def ecg(input_graph, min_weight=0.05, ensemble_size=16, weight=None): """ Compute the Ensemble Clustering for Graphs (ECG) partition of the input graph. ECG runs truncated Louvain on an ensemble of permutations of the @@ -26,8 +28,8 @@ def ecg(input_graph, min_weight=0.05, ensemble_size=16): Parameters ---------- - input_graph : cugraph.Graph - cuGraph graph descriptor, should contain the connectivity information + input_graph : cugraph.Graph or NetworkX Graph + The graph descriptor should contain the connectivity information and weights. The adjacency list will be computed if not already present. @@ -41,9 +43,14 @@ def ecg(input_graph, min_weight=0.05, ensemble_size=16): The default value is 16, larger values may produce higher quality partitions for some graphs. 
+ weight : str + This parameter is here for NetworkX compatibility and + represents which NetworkX data column represents Edge weights. + Default is None + Returns ------- - parts : cudf.DataFrame + parts : cudf.DataFrame or python dictionary GPU data frame of size V containing two columns, the vertex id and the partition id it is assigned to. @@ -63,9 +70,14 @@ def ecg(input_graph, min_weight=0.05, ensemble_size=16): """ + input_graph, isNx = check_nx_graph(input_graph, weight) + parts = ecg_wrapper.ecg(input_graph, min_weight, ensemble_size) if input_graph.renumbered: parts = input_graph.unrenumber(parts, "vertex") - return parts + if isNx is True: + return df_score_to_dictionary(parts, 'partition') + else: + return parts diff --git a/python/cugraph/community/ecg_wrapper.pyx b/python/cugraph/community/ecg_wrapper.pyx index 913a633c088..ed193b2e4bb 100644 --- a/python/cugraph/community/ecg_wrapper.pyx +++ b/python/cugraph/community/ecg_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.community.ecg cimport ecg as c_ecg -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf @@ -36,9 +36,12 @@ def ecg(input_graph, min_weight=.05, ensemble_size=16): if input_graph.adjlist.weights is None: raise Exception('ECG must be called on a weighted graph') - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32, np.int64]) - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, + input_graph.adjlist.indices], [np.int32, np.int64]) + [weights] = 
graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -62,13 +65,21 @@ def ecg(input_graph, min_weight=.05, ensemble_size=16): graph_float.get_vertex_identifiers(c_identifier) - c_ecg[int,int,float](graph_float, min_weight, ensemble_size, c_partition) + c_ecg[int,int,float](handle_ptr.get()[0], + graph_float, + min_weight, + ensemble_size, + c_partition) else: graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, c_weights, num_verts, num_edges) graph_double.get_vertex_identifiers(c_identifier) - c_ecg[int,int,double](graph_double, min_weight, ensemble_size, c_partition) + c_ecg[int,int,double](handle_ptr.get()[0], + graph_double, + min_weight, + ensemble_size, + c_partition) return df diff --git a/python/cugraph/community/ktruss_subgraph.pxd b/python/cugraph/community/ktruss_subgraph.pxd index 08e59d2f8f2..ab3a5189414 100644 --- a/python/cugraph/community/ktruss_subgraph.pxd +++ b/python/cugraph/community/ktruss_subgraph.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": diff --git a/python/cugraph/community/ktruss_subgraph.py b/python/cugraph/community/ktruss_subgraph.py index 74fc343c097..8e4f1471955 100644 --- a/python/cugraph/community/ktruss_subgraph.py +++ b/python/cugraph/community/ktruss_subgraph.py @@ -13,6 +13,47 @@ from cugraph.community import ktruss_subgraph_wrapper from cugraph.structure.graph import Graph +from cugraph.utilities import check_nx_graph +from cugraph.utilities import cugraph_to_nx + + +def k_truss(G, k): + """ + Returns the K-Truss subgraph of a graph for a specific k. + + The k-truss of a graph is a subgraph where each edge is part of at least + (k−2) triangles. 
K-trusses are used for finding tighlty knit groups of + vertices in a graph. A k-truss is a relaxation of a k-clique in the graph + and was define in [1]. Finding cliques is computationally demanding and + finding the maximal k-clique is known to be NP-Hard. + + Parameters + ---------- + G : cuGraph.Graph or networkx.Graph + cuGraph graph descriptor with connectivity information. k-Trusses are + defined for only undirected graphs as they are defined for + undirected triangle in a graph. + + k : int + The desired k to be used for extracting the k-truss subgraph. + + Returns + ------- + G_truss : cuGraph.Graph or networkx.Graph + A cugraph graph descriptor with the k-truss subgraph for the given k. + The networkx graph will NOT have all attributes copied over + """ + + G, isNx = check_nx_graph(G) + + if isNx is True: + k_sub = ktruss_subgraph(G, k) + S = cugraph_to_nx(k_sub) + return S + else: + return ktruss_subgraph(G, k) + +# FIXME: merge this function with k_truss def ktruss_subgraph(G, k, use_weights=True): @@ -69,10 +110,10 @@ def ktruss_subgraph(G, k, use_weights=True): Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') >>> k_subgraph = cugraph.ktruss_subgraph(G, 3) """ diff --git a/python/cugraph/community/ktruss_subgraph_wrapper.pyx b/python/cugraph/community/ktruss_subgraph_wrapper.pyx index 8a2c81f70fa..9f8138f4d57 100644 --- a/python/cugraph/community/ktruss_subgraph_wrapper.pyx +++ b/python/cugraph/community/ktruss_subgraph_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.community.ktruss_subgraph cimport * -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport 
* +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.float cimport FLT_MAX_EXP @@ -39,7 +39,7 @@ def ktruss_subgraph_double(input_graph, k, use_weights): def ktruss_subgraph(input_graph, k, use_weights): - if graph_new_wrapper.weight_type(input_graph) == np.float64 and use_weights: + if graph_primtypes_wrapper.weight_type(input_graph) == np.float64 and use_weights: return ktruss_subgraph_double(input_graph, k, use_weights) else: return ktruss_subgraph_float(input_graph, k, use_weights) diff --git a/python/cugraph/community/leiden.pxd b/python/cugraph/community/leiden.pxd index 1c6009b30b6..80e0e12f65a 100644 --- a/python/cugraph/community/leiden.pxd +++ b/python/cugraph/community/leiden.pxd @@ -16,15 +16,16 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * + +from libcpp.utility cimport pair +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": - cdef void leiden[vertex_t,edge_t,weight_t]( + cdef pair[size_t, weight_t] leiden[vertex_t,edge_t,weight_t]( + const handle_t &handle, const GraphCSRView[vertex_t,edge_t,weight_t] &graph, - weight_t &final_modularity, - int &num_level, vertex_t *leiden_parts, - int max_level, + size_t max_level, weight_t resolution) except + diff --git a/python/cugraph/community/leiden.py b/python/cugraph/community/leiden.py index 355b2939617..8c1b79b8b63 100644 --- a/python/cugraph/community/leiden.py +++ b/python/cugraph/community/leiden.py @@ -13,9 +13,11 @@ from cugraph.community import leiden_wrapper from cugraph.structure.graph import Graph +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary -def leiden(input_graph, max_iter=100, resolution=1.): +def leiden(G, max_iter=100, resolution=1.): """ Compute the modularity optimizing partition of the input graph using the Leiden algorithm @@ -28,7 +30,7 @@ def 
leiden(input_graph, max_iter=100, resolution=1.): Parameters ---------- - input_graph : cugraph.Graph + G : cugraph.Graph cuGraph graph descriptor of type Graph The adjacency list will be computed if not already present. @@ -70,15 +72,19 @@ def leiden(input_graph, max_iter=100, resolution=1.): >>> G.from_cudf_edgelist(M, source='0', destination='1') >>> parts, modularity_score = cugraph.leiden(G) """ + G, isNx = check_nx_graph(G) - if type(input_graph) is not Graph: - raise Exception("input graph must be undirected") + if type(G) is not Graph: + raise Exception(f"input graph must be undirected was {type(G)}") parts, modularity_score = leiden_wrapper.leiden( - input_graph, max_iter, resolution + G, max_iter, resolution ) - if input_graph.renumbered: - parts = input_graph.unrenumber(parts, "vertex") + if G.renumbered: + parts = G.unrenumber(parts, "vertex") + + if isNx is True: + parts = df_score_to_dictionary(parts, "partition") return parts, modularity_score diff --git a/python/cugraph/community/leiden_wrapper.pyx b/python/cugraph/community/leiden_wrapper.pyx index 9ed220bb2a2..70fcfcf701b 100644 --- a/python/cugraph/community/leiden_wrapper.pyx +++ b/python/cugraph/community/leiden_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.community.leiden cimport leiden as c_leiden -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf @@ -33,16 +33,19 @@ def leiden(input_graph, max_iter, resolution): if not input_graph.adjlist: input_graph.view_adj_list() + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + weights = None final_modularity = None - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = 
graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) @@ -69,12 +72,11 @@ def leiden(input_graph, max_iter, resolution): c_weights, num_verts, num_edges) graph_float.get_vertex_identifiers(c_identifier) - c_leiden(graph_float, - final_modularity_float, - num_level, - c_partition, - max_iter, - resolution) + num_level, final_modularity_float = c_leiden(handle_ptr.get()[0], + graph_float, + c_partition, + max_iter, + resolution) final_modularity = final_modularity_float else: @@ -82,12 +84,11 @@ def leiden(input_graph, max_iter, resolution): c_weights, num_verts, num_edges) graph_double.get_vertex_identifiers(c_identifier) - c_leiden(graph_double, - final_modularity_double, - num_level, - c_partition, - max_iter, - resolution) + num_level, final_modularity_double = c_leiden(handle_ptr.get()[0], + graph_double, + c_partition, + max_iter, + resolution) final_modularity = final_modularity_double return df, final_modularity diff --git a/python/cugraph/community/louvain.pxd b/python/cugraph/community/louvain.pxd index 7cc72b4d0ed..eca15ba3d20 100644 --- a/python/cugraph/community/louvain.pxd +++ b/python/cugraph/community/louvain.pxd @@ -16,15 +16,17 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from libcpp.utility cimport pair +from cugraph.structure.graph_primtypes cimport * -cdef extern from "algorithms.hpp" namespace "cugraph": - cdef void louvain[vertex_t,edge_t,weight_t]( - const GraphCSRView[vertex_t,edge_t,weight_t] 
&graph, - weight_t *final_modularity, - int *num_level, - vertex_t *louvain_parts, - int max_level, +cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": + + cdef pair[size_t, weight_t] call_louvain[weight_t]( + const handle_t &handle, + const graph_container_t &g, + void *identifiers, + void *parts, + size_t max_level, weight_t resolution) except + diff --git a/python/cugraph/community/louvain.py b/python/cugraph/community/louvain.py index 0d1fd9ec084..d4d56a1100c 100644 --- a/python/cugraph/community/louvain.py +++ b/python/cugraph/community/louvain.py @@ -13,9 +13,11 @@ from cugraph.community import louvain_wrapper from cugraph.structure.graph import Graph +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary -def louvain(input_graph, max_iter=100, resolution=1.): +def louvain(G, max_iter=100, resolution=1.): """ Compute the modularity optimizing partition of the input graph using the Louvain method @@ -28,10 +30,10 @@ def louvain(input_graph, max_iter=100, resolution=1.): Parameters ---------- - input_graph : cugraph.Graph - cuGraph graph descriptor of type Graph - - The adjacency list will be computed if not already present. + G : cugraph.Graph or NetworkX Graph + The graph descriptor should contain the connectivity information + and weights. The adjacency list will be computed if not already + present. 
max_iter : integer This controls the maximum number of levels/iterations of the Louvain @@ -71,14 +73,19 @@ def louvain(input_graph, max_iter=100, resolution=1.): >>> parts, modularity_score = cugraph.louvain(G) """ - if type(input_graph) is not Graph: + G, isNx = check_nx_graph(G) + + if type(G) is not Graph: raise Exception("input graph must be undirected") parts, modularity_score = louvain_wrapper.louvain( - input_graph, max_iter, resolution + G, max_iter, resolution ) - if input_graph.renumbered: - parts = input_graph.unrenumber(parts, "vertex") + if G.renumbered: + parts = G.unrenumber(parts, "vertex") + + if isNx is True: + parts = df_score_to_dictionary(parts, "partition") return parts, modularity_score diff --git a/python/cugraph/community/louvain_wrapper.pyx b/python/cugraph/community/louvain_wrapper.pyx index 79db57125b1..6b218a0b962 100644 --- a/python/cugraph/community/louvain_wrapper.pyx +++ b/python/cugraph/community/louvain_wrapper.pyx @@ -16,9 +16,9 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.community.louvain cimport louvain as c_louvain -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.community cimport louvain as c_louvain +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf @@ -26,26 +26,39 @@ import rmm import numpy as np -def louvain(input_graph, max_iter, resolution): +# FIXME: move this to a more reusable location +numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + + +def louvain(input_graph, max_level, resolution): """ Call louvain """ if not input_graph.adjlist: input_graph.view_adj_list() + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get(); + 
weights = None final_modularity = None - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) + weight_t = weights.dtype + # Create the output dataframe df = cudf.DataFrame() df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) @@ -56,38 +69,43 @@ def louvain(input_graph, max_iter, resolution): cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0] cdef uintptr_t c_partition = df['partition'].__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = weights.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double + cdef uintptr_t c_local_verts = NULL; + cdef uintptr_t c_local_edges = NULL; + cdef uintptr_t c_local_offsets = NULL; cdef float final_modularity_float = 1.0 cdef double final_modularity_double = 1.0 cdef int num_level = 0 - if weights.dtype == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - - graph_float.get_vertex_identifiers(c_identifier) - c_louvain(graph_float, - &final_modularity_float, - &num_level, - c_partition, - max_iter, - resolution) + cdef graph_container_t graph_container + + # FIXME: The excessive casting for the enum arg is needed to make cython + # understand how to pass the enum value (this is the same pattern + # used by cudf). 
This will not be needed with Cython 3.0 + populate_graph_container_legacy(graph_container, + ((graphTypeEnum.LegacyCSR)), + handle_[0], + c_offsets, c_indices, c_weights, + ((numberTypeEnum.int32Type)), + ((numberTypeEnum.int32Type)), + ((numberTypeMap[weight_t])), + num_verts, num_edges, + c_local_verts, c_local_edges, c_local_offsets) + + if weight_t == np.float32: + num_level, final_modularity_float = c_louvain.call_louvain[float](handle_[0], graph_container, + c_identifier, + c_partition, + max_level, + resolution) final_modularity = final_modularity_float else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - - graph_double.get_vertex_identifiers(c_identifier) - c_louvain(graph_double, - &final_modularity_double, - &num_level, - c_partition, - max_iter, - resolution) + num_level, final_modularity_double = c_louvain.call_louvain[double](handle_[0], graph_container, + c_identifier, + c_partition, + max_level, + resolution) final_modularity = final_modularity_double return df, final_modularity diff --git a/python/cugraph/community/spectral_clustering.pxd b/python/cugraph/community/spectral_clustering.pxd index 360ff08a04e..27ce6130b05 100644 --- a/python/cugraph/community/spectral_clustering.pxd +++ b/python/cugraph/community/spectral_clustering.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph::ext_raft": @@ -30,7 +30,7 @@ cdef extern from "algorithms.hpp" namespace "cugraph::ext_raft": const float kmean_tolerance, const int kmean_max_iter, VT* clustering) except + - + cdef void spectralModularityMaximization[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const int n_clusters, @@ -40,19 +40,19 @@ cdef extern from "algorithms.hpp" namespace "cugraph::ext_raft": const float kmean_tolerance, const int kmean_max_iter, VT* clustering) 
except + - + cdef void analyzeClustering_modularity[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const int n_clusters, const VT* clustering, WT* score) except + - + cdef void analyzeClustering_edge_cut[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const int n_clusters, const VT* clustering, WT* score) except + - + cdef void analyzeClustering_ratio_cut[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const int n_clusters, diff --git a/python/cugraph/community/spectral_clustering.py b/python/cugraph/community/spectral_clustering.py index 92f8920199b..b5f175e8237 100644 --- a/python/cugraph/community/spectral_clustering.py +++ b/python/cugraph/community/spectral_clustering.py @@ -12,6 +12,8 @@ # limitations under the License. from cugraph.community import spectral_clustering_wrapper +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary def spectralBalancedCutClustering( @@ -29,7 +31,7 @@ def spectralBalancedCutClustering( Parameters ---------- - G : cugraph.Graph + G : cugraph.Graph or networkx.Graph cuGraph graph descriptor num_clusters : integer Specifies the number of clusters to find @@ -71,6 +73,8 @@ def spectralBalancedCutClustering( >>> df = cugraph.spectralBalancedCutClustering(G, 5) """ + G, isNx = check_nx_graph(G) + df = spectral_clustering_wrapper.spectralBalancedCutClustering( G, num_clusters, @@ -84,6 +88,9 @@ def spectralBalancedCutClustering( if G.renumbered: df = G.unrenumber(df, "vertex") + if isNx is True: + df = df_score_to_dictionary(df, "cluster") + return df @@ -141,6 +148,8 @@ def spectralModularityMaximizationClustering( >>> df = cugraph.spectralModularityMaximizationClustering(G, 5) """ + G, isNx = check_nx_graph(G) + df = spectral_clustering_wrapper.spectralModularityMaximizationClustering( G, num_clusters, @@ -154,6 +163,9 @@ def spectralModularityMaximizationClustering( if G.renumbered: df = G.unrenumber(df, "vertex") + if isNx is True: + df = df_score_to_dictionary(df, "cluster") + return 
df @@ -250,6 +262,8 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering, >>> 'vertex', 'cluster') """ + G, isNx = check_nx_graph(G) + if G.renumbered: clustering = G.add_internal_vertex_id(clustering, vertex_col_name, diff --git a/python/cugraph/community/spectral_clustering_wrapper.pyx b/python/cugraph/community/spectral_clustering_wrapper.pyx index fff027bac7e..0593d987c0d 100644 --- a/python/cugraph/community/spectral_clustering_wrapper.pyx +++ b/python/cugraph/community/spectral_clustering_wrapper.pyx @@ -21,8 +21,8 @@ from cugraph.community.spectral_clustering cimport spectralModularityMaximizatio from cugraph.community.spectral_clustering cimport analyzeClustering_modularity as c_analyze_clustering_modularity from cugraph.community.spectral_clustering cimport analyzeClustering_edge_cut as c_analyze_clustering_edge_cut from cugraph.community.spectral_clustering cimport analyzeClustering_ratio_cut as c_analyze_clustering_ratio_cut -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t @@ -50,13 +50,13 @@ def spectralBalancedCutClustering(input_graph, weights = None - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) @@ -122,8 
+122,8 @@ def spectralModularityMaximizationClustering(input_graph, if input_graph.adjlist.weights is None: raise Exception("spectral modularity maximization must be called on a graph with weights") - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -181,8 +181,8 @@ def analyzeClustering_modularity(input_graph, n_clusters, clustering): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) score = None num_verts = input_graph.number_of_vertices() @@ -191,7 +191,7 @@ def analyzeClustering_modularity(input_graph, n_clusters, clustering): if input_graph.adjlist.weights is None: raise Exception("analyze clustering modularity must be called on a graph with weights") if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) 
@@ -237,14 +237,14 @@ def analyzeClustering_edge_cut(input_graph, n_clusters, clustering): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) score = None num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) @@ -290,14 +290,14 @@ def analyzeClustering_ratio_cut(input_graph, n_clusters, clustering): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) score = None num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) if input_graph.adjlist.weights is not None: - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) else: weights = cudf.Series(np.full(num_edges, 1.0, dtype=np.float32)) diff --git a/python/cugraph/community/subgraph_extraction.pxd b/python/cugraph/community/subgraph_extraction.pxd index 12cef73fad4..97a71056006 100644 --- a/python/cugraph/community/subgraph_extraction.pxd +++ b/python/cugraph/community/subgraph_extraction.pxd @@ -16,7 +16,7 @@ # cython: 
embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp.memory cimport unique_ptr diff --git a/python/cugraph/community/subgraph_extraction.py b/python/cugraph/community/subgraph_extraction.py index 6a17061db92..8c702c2f58f 100644 --- a/python/cugraph/community/subgraph_extraction.py +++ b/python/cugraph/community/subgraph_extraction.py @@ -13,6 +13,8 @@ from cugraph.community import subgraph_extraction_wrapper from cugraph.structure.graph import null_check +from cugraph.utilities import check_nx_graph +from cugraph.utilities import cugraph_to_nx def subgraph(G, vertices): @@ -36,12 +38,12 @@ def subgraph(G, vertices): Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter = ' ', dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') >>> verts = numpy.zeros(3, dtype=numpy.int32) >>> verts[0] = 0 >>> verts[1] = 1 @@ -52,6 +54,8 @@ def subgraph(G, vertices): null_check(vertices) + G, isNx = check_nx_graph(G) + if G.renumbered: vertices = G.lookup_internal_vertex_id(vertices) @@ -70,4 +74,7 @@ def subgraph(G, vertices): else: result_graph.from_cudf_edgelist(df, source="src", destination="dst") + if isNx is True: + result_graph = cugraph_to_nx(result_graph) + return result_graph diff --git a/python/cugraph/community/subgraph_extraction_wrapper.pyx b/python/cugraph/community/subgraph_extraction_wrapper.pyx index 03593dafe03..5dbb6ce1e27 100644 --- a/python/cugraph/community/subgraph_extraction_wrapper.pyx +++ b/python/cugraph/community/subgraph_extraction_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.community.subgraph_extraction cimport extract_subgraph_vertex as c_extract_subgraph_vertex -from cugraph.structure.graph_new cimport * -from 
cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf @@ -38,13 +38,13 @@ def subgraph(input_graph, vertices): if not input_graph.edgelist: input_graph.view_edge_list() - [src, dst] = graph_new_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) if input_graph.edgelist.weights: - [weights] = graph_new_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) if weights.dtype == np.float64: use_float = False - + cdef GraphCOOView[int,int,float] in_graph_float cdef GraphCOOView[int,int,double] in_graph_double cdef unique_ptr[GraphCOO[int,int,float]] out_graph_float @@ -56,7 +56,7 @@ def subgraph(input_graph, vertices): if weights is not None: c_weights = weights.__cuda_array_interface__['data'][0] - + cdef uintptr_t c_vertices = vertices.__cuda_array_interface__['data'][0] num_verts = input_graph.number_of_vertices() @@ -75,7 +75,7 @@ def subgraph(input_graph, vertices): vertices_df['v'] = vertices vertices_df = vertices_df.reset_index(drop=True).reset_index() - df = df.merge(vertices_df, left_on='src', right_on='index', how='left').drop(['src', 'index']).rename(columns={'v': 'src'}, copy=False) - df = df.merge(vertices_df, left_on='dst', right_on='index', how='left').drop(['dst', 'index']).rename(columns={'v': 'dst'}, copy=False) - + df = df.merge(vertices_df, left_on='src', right_on='index', how='left').drop(columns=['src', 'index']).rename(columns={'v': 'src'}, copy=False) + df = df.merge(vertices_df, left_on='dst', right_on='index', 
how='left').drop(columns=['dst', 'index']).rename(columns={'v': 'dst'}, copy=False) + return df diff --git a/python/cugraph/community/triangle_count.pxd b/python/cugraph/community/triangle_count.pxd index 6876d067f7a..70795a3f43a 100644 --- a/python/cugraph/community/triangle_count.pxd +++ b/python/cugraph/community/triangle_count.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uint64_t diff --git a/python/cugraph/community/triangle_count.py b/python/cugraph/community/triangle_count.py index 52193c74a3e..ff4dc9a5c5f 100644 --- a/python/cugraph/community/triangle_count.py +++ b/python/cugraph/community/triangle_count.py @@ -13,16 +13,20 @@ from cugraph.community import triangle_count_wrapper from cugraph.structure.graph import Graph +from cugraph.utilities import check_nx_graph def triangles(G): """ - Compute the triangle (number of cycles of length three) count of the + Compute the number of triangles (cycles of length three) in the input graph. + Unlike NetworkX, this algorithm simply returns the total number of + triangle and not the number per vertex. 
+ Parameters ---------- - G : cugraph.graph + G : cugraph.graph or networkx.Graph cuGraph graph descriptor, should contain the connectivity information, (edge weights are not used in this algorithm) @@ -34,15 +38,17 @@ def triangles(G): Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter = ' ', dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') >>> count = cugraph.triangles(G) """ + G, _ = check_nx_graph(G) + if type(G) is not Graph: raise Exception("input graph must be undirected") diff --git a/python/cugraph/community/triangle_count_wrapper.pyx b/python/cugraph/community/triangle_count_wrapper.pyx index f34f6a7a947..d7cabd4676f 100644 --- a/python/cugraph/community/triangle_count_wrapper.pyx +++ b/python/cugraph/community/triangle_count_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.community.triangle_count cimport triangle_count as c_triangle_count -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import numpy as np @@ -36,8 +36,8 @@ def triangles(input_graph): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, + input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -49,5 +49,5 @@ def triangles(input_graph): graph = GraphCSRView[int,int,float](c_offsets, c_indices, NULL, num_verts, num_edges) result = c_triangle_count(graph) - + 
return result diff --git a/python/cugraph/components/connectivity.pxd b/python/cugraph/components/connectivity.pxd index b2dc953e052..94fa165969d 100644 --- a/python/cugraph/components/connectivity.pxd +++ b/python/cugraph/components/connectivity.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": @@ -30,4 +30,3 @@ cdef extern from "algorithms.hpp" namespace "cugraph": const GraphCSRView[VT,ET,WT] &graph, cugraph_cc_t connect_type, VT *labels) except + - diff --git a/python/cugraph/components/connectivity.py b/python/cugraph/components/connectivity.py index 522eff78c20..f0b40601ab9 100644 --- a/python/cugraph/components/connectivity.py +++ b/python/cugraph/components/connectivity.py @@ -12,6 +12,8 @@ # limitations under the License. from cugraph.components import connectivity_wrapper +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary def weakly_connected_components(G): @@ -21,7 +23,7 @@ def weakly_connected_components(G): Parameters ---------- - G : cugraph.Graph + G : cugraph.Graph or networkx.Graph cuGraph graph descriptor, should contain the connectivity information as an edge list (edge weights are not used for this algorithm). Currently, the graph should be undirected where an undirected edge is @@ -32,8 +34,13 @@ def weakly_connected_components(G): Returns ------- df : cudf.DataFrame - df['labels'][i] gives the label id of the i'th vertex - df['vertices'][i] gives the vertex id of the i'th vertex + GPU data frame containing two cudf.Series of size V: the vertex + identifiers and the corresponding component identifier. 
+ + df['vertices'] + Contains the vertex identifier + df['labels'] + The component identifier Examples -------- @@ -46,11 +53,16 @@ def weakly_connected_components(G): >>> df = cugraph.weakly_connected_components(G) """ + G, isNx = check_nx_graph(G) + df = connectivity_wrapper.weakly_connected_components(G) if G.renumbered: df = G.unrenumber(df, "vertices") + if isNx is True: + df = df_score_to_dictionary(df, "labels", "vertices") + return df @@ -61,7 +73,7 @@ def strongly_connected_components(G): Parameters ---------- - G : cugraph.Graph + G : cugraph.Graph or networkx.Graph cuGraph graph descriptor, should contain the connectivity information as an edge list (edge weights are not used for this algorithm). The graph can be either directed or undirected where an undirected edge is @@ -72,8 +84,13 @@ def strongly_connected_components(G): Returns ------- df : cudf.DataFrame - df['labels'][i] gives the label id of the i'th vertex - df['vertices'][i] gives the vertex id of the i'th vertex + GPU data frame containing two cudf.Series of size V: the vertex + identifiers and the corresponding component identifier. 
+ + df['vertices'] + Contains the vertex identifier + df['labels'] + The component identifier Examples -------- @@ -86,9 +103,14 @@ def strongly_connected_components(G): >>> df = cugraph.strongly_connected_components(G) """ + G, isNx = check_nx_graph(G) + df = connectivity_wrapper.strongly_connected_components(G) if G.renumbered: df = G.unrenumber(df, "vertices") + if isNx is True: + df = df_score_to_dictionary(df, "labels", "vertices") + return df diff --git a/python/cugraph/components/connectivity_wrapper.pyx b/python/cugraph/components/connectivity_wrapper.pyx index a738ad0c9db..9f6fa353001 100644 --- a/python/cugraph/components/connectivity_wrapper.pyx +++ b/python/cugraph/components/connectivity_wrapper.pyx @@ -17,9 +17,9 @@ # cython: language_level = 3 from cugraph.components.connectivity cimport * -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from cugraph.structure import utils_wrapper -from cugraph.structure import graph_new_wrapper +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cugraph.structure.symmetrize import symmetrize from cugraph.structure.graph import Graph as type_Graph @@ -33,24 +33,24 @@ def weakly_connected_components(input_graph): """ offsets = None indices = None - + if type(input_graph) is not type_Graph: # # Need to create a symmetrized CSR for this local # computation, don't want to keep it. 
# - [src, dst] = graph_new_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], - input_graph.edgelist.edgelist_df['dst']], - [np.int32]) + [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], + input_graph.edgelist.edgelist_df['dst']], + [np.int32]) src, dst = symmetrize(src, dst) [offsets, indices] = utils_wrapper.coo2csr(src, dst)[0:2] else: if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], - [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, + input_graph.adjlist.indices], + [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -58,7 +58,7 @@ def weakly_connected_components(input_graph): df = cudf.DataFrame() df['vertices'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) df['labels'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) - + cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] cdef uintptr_t c_identifier = df['vertices'].__cuda_array_interface__['data'][0]; @@ -83,7 +83,7 @@ def strongly_connected_components(input_graph): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -91,7 +91,7 @@ def strongly_connected_components(input_graph): df = cudf.DataFrame() df['vertices'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) df['labels'] = cudf.Series(np.zeros(num_verts, dtype=np.int32)) - + cdef uintptr_t 
c_offsets = offsets.__cuda_array_interface__['data'][0] cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] cdef uintptr_t c_identifier = df['vertices'].__cuda_array_interface__['data'][0]; diff --git a/python/cugraph/cores/core_number.pxd b/python/cugraph/cores/core_number.pxd index f679ccf7800..cf28720a3e8 100644 --- a/python/cugraph/cores/core_number.pxd +++ b/python/cugraph/cores/core_number.pxd @@ -16,11 +16,10 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": cdef void core_number[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, VT *core_number) except + - diff --git a/python/cugraph/cores/core_number.py b/python/cugraph/cores/core_number.py index 6476a863d2d..02f1b67ee35 100644 --- a/python/cugraph/cores/core_number.py +++ b/python/cugraph/cores/core_number.py @@ -12,6 +12,8 @@ # limitations under the License. from cugraph.cores import core_number_wrapper +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary def core_number(G): @@ -24,15 +26,15 @@ def core_number(G): Parameters ---------- - graph : cuGraph.Graph - cuGraph graph descriptor with connectivity information. The graph - should contain undirected edges where undirected edges are represented - as directed edges in both directions. While this graph can contain edge - weights, they don't participate in the calculation of the core numbers. + graph : cuGraph.Graph or networkx.Graph + The graph should contain undirected edges where undirected edges are + represented as directed edges in both directions. While this graph + can contain edge weights, they don't participate in the calculation + of the core numbers. 
Returns ------- - df : cudf.DataFrame + df : cudf.DataFrame or python dictionary (in NetworkX input) GPU data frame containing two cudf.Series of size V: the vertex identifiers and the corresponding core number values. @@ -50,9 +52,14 @@ def core_number(G): >>> cn = cugraph.core_number(G) """ + G, isNx = check_nx_graph(G) + df = core_number_wrapper.core_number(G) if G.renumbered: df = G.unrenumber(df, "vertex") + if isNx is True: + df = df_score_to_dictionary(df, 'core_number') + return df diff --git a/python/cugraph/cores/core_number_wrapper.pyx b/python/cugraph/cores/core_number_wrapper.pyx index 0b8dc63c294..3df1df5f8e9 100644 --- a/python/cugraph/cores/core_number_wrapper.pyx +++ b/python/cugraph/cores/core_number_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 cimport cugraph.cores.core_number as c_core -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t import cudf @@ -33,7 +33,7 @@ def core_number(input_graph): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) diff --git a/python/cugraph/cores/k_core.pxd b/python/cugraph/cores/k_core.pxd index 9b001494143..556dbc95ed9 100644 --- a/python/cugraph/cores/k_core.pxd +++ b/python/cugraph/cores/k_core.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": diff --git 
a/python/cugraph/cores/k_core.py b/python/cugraph/cores/k_core.py index 8c6c05c3178..ebf12f60cda 100644 --- a/python/cugraph/cores/k_core.py +++ b/python/cugraph/cores/k_core.py @@ -12,6 +12,8 @@ # limitations under the License. from cugraph.cores import k_core_wrapper, core_number_wrapper +from cugraph.utilities import cugraph_to_nx +from cugraph.utilities import check_nx_graph def k_core(G, k=None, core_number=None): @@ -23,7 +25,7 @@ def k_core(G, k=None, core_number=None): Parameters ---------- - G : cuGraph.Graph + G : cuGraph.Graph or networkx.Graph cuGraph graph descriptor with connectivity information. The graph should contain undirected edges where undirected edges are represented as directed edges in both directions. While this graph can contain edge @@ -56,6 +58,8 @@ def k_core(G, k=None, core_number=None): >>> KCoreGraph = cugraph.k_core(G) """ + G, isNx = check_nx_graph(G) + mytype = type(G) KCoreGraph = mytype() @@ -88,4 +92,7 @@ def k_core(G, k=None, core_number=None): k_core_df, source="src", destination="dst" ) + if isNx is True: + KCoreGraph = cugraph_to_nx(KCoreGraph) + return KCoreGraph diff --git a/python/cugraph/cores/k_core_wrapper.pyx b/python/cugraph/cores/k_core_wrapper.pyx index 3083ffdf42e..51ecec09dc5 100644 --- a/python/cugraph/cores/k_core_wrapper.pyx +++ b/python/cugraph/cores/k_core_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.cores.k_core cimport k_core as c_k_core -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.float cimport FLT_MAX_EXP @@ -32,7 +32,7 @@ import numpy as np #### Ripple down through implementation (algorithms.hpp, core_number.cu) cdef (uintptr_t, uintptr_t) core_number_params(core_number): - [core_number['vertex'], core_number['values']] = 
graph_new_wrapper.datatype_cast([core_number['vertex'], core_number['values']], [np.int32]) + [core_number['vertex'], core_number['values']] = graph_primtypes_wrapper.datatype_cast([core_number['vertex'], core_number['values']], [np.int32]) cdef uintptr_t c_vertex = core_number['vertex'].__cuda_array_interface__['data'][0] cdef uintptr_t c_values = core_number['values'].__cuda_array_interface__['data'][0] return (c_vertex, c_values) @@ -54,7 +54,7 @@ def k_core(input_graph, k, core_number): """ Call k_core """ - if graph_new_wrapper.weight_type(input_graph) == np.float64: + if graph_primtypes_wrapper.weight_type(input_graph) == np.float64: return k_core_double(input_graph, k, core_number) else: return k_core_float(input_graph, k, core_number) diff --git a/python/cugraph/dask/__init__.py b/python/cugraph/dask/__init__.py index 76c47338852..e62a8bfcdb4 100644 --- a/python/cugraph/dask/__init__.py +++ b/python/cugraph/dask/__init__.py @@ -14,3 +14,4 @@ from .link_analysis.pagerank import pagerank from .traversal.bfs import bfs from .common.read_utils import get_chunksize +from .community.louvain import louvain diff --git a/python/cugraph/dask/common/input_utils.py b/python/cugraph/dask/common/input_utils.py index c08582c1774..0140c9f06f9 100644 --- a/python/cugraph/dask/common/input_utils.py +++ b/python/cugraph/dask/common/input_utils.py @@ -223,3 +223,12 @@ def get_local_data(input_graph, by, load_balance=True): def get_mg_batch_data(dask_cudf_data): data = DistributedDataHandler.create(data=dask_cudf_data) return data + + +def get_distributed_data(input_ddf): + ddf = input_ddf + comms = Comms.get_comms() + data = DistributedDataHandler.create(data=ddf) + if data.worker_info is None and comms is not None: + data.calculate_worker_and_rank_info(comms) + return data diff --git a/python/cugraph/dask/common/mg_utils.py b/python/cugraph/dask/common/mg_utils.py index 198b0756c00..7556afb122a 100644 --- a/python/cugraph/dask/common/mg_utils.py +++ 
b/python/cugraph/dask/common/mg_utils.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from cugraph.raft.dask.common.utils import default_client +import numba.cuda # FIXME: We currently look for the default client from dask, as such is the @@ -32,3 +33,11 @@ def prepare_worker_to_parts(data, client=None): if worker not in data.worker_to_parts: data.worker_to_parts[worker] = [placeholder] return data + + +def is_single_gpu(): + ngpus = len(numba.cuda.gpus) + if ngpus > 1: + return False + else: + return True diff --git a/python/cugraph/dask/common/part_utils.py b/python/cugraph/dask/common/part_utils.py index 45dc7ed7ef2..505272fa563 100644 --- a/python/cugraph/dask/common/part_utils.py +++ b/python/cugraph/dask/common/part_utils.py @@ -16,12 +16,14 @@ from dask.distributed import futures_of, default_client, wait from toolz import first import collections -import dask_cudf as dc +import dask_cudf from dask.array.core import Array as daskArray from dask_cudf.core import DataFrame as daskDataFrame from dask_cudf.core import Series as daskSeries from functools import reduce import cugraph.comms.comms as Comms +from dask.delayed import delayed +import cudf def workers_to_parts(futures): @@ -193,10 +195,47 @@ def load_balance_func(ddf_, by, client=None): for idx, wf in enumerate(worker_to_data.items())] wait(futures) - ddf = dc.from_delayed(futures) + ddf = dask_cudf.from_delayed(futures) ddf.divisions = divisions # Repartition the data ddf = repartition(ddf, cumsum_parts) return ddf + + +def concat_dfs(df_list): + """ + Concat a list of cudf dataframes + """ + return cudf.concat(df_list) + + +def get_delayed_dict(ddf): + """ + Returns a dicitionary with the dataframe tasks as keys and + the dataframe delayed objects as values + """ + df_delayed = {} + for delayed_obj in ddf.to_delayed(): + df_delayed[str(delayed_obj.key)] = delayed_obj + return df_delayed + + +def concat_within_workers(client, 
ddf): + """ + Concats all partitions within workers without transfers + """ + df_delayed = get_delayed_dict(ddf) + + result = [] + for worker, tasks in client.has_what().items(): + worker_task_list = [] + + for task in list(tasks): + if task in df_delayed: + worker_task_list.append(df_delayed[task]) + concat_tasks = delayed(concat_dfs)(worker_task_list) + result.append(client.persist(collections=concat_tasks, workers=worker)) + + return dask_cudf.from_delayed(result) diff --git a/python/cugraph/dask/community/__init__.py b/python/cugraph/dask/community/__init__.py new file mode 100644 index 00000000000..3eb2ddc8090 --- /dev/null +++ b/python/cugraph/dask/community/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .louvain import louvain diff --git a/python/cugraph/dask/community/louvain.pxd b/python/cugraph/dask/community/louvain.pxd new file mode 100644 index 00000000000..b6b4cd23143 --- /dev/null +++ b/python/cugraph/dask/community/louvain.pxd @@ -0,0 +1,31 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libcpp.utility cimport pair +from cugraph.structure.graph_primtypes cimport * + + +cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": + + cdef pair[size_t, weight_t] call_louvain[weight_t]( + const handle_t &handle, + const graph_container_t &g, + void *identifiers, + void *parts, + size_t max_level, + weight_t resolution) except + diff --git a/python/cugraph/dask/community/louvain.py b/python/cugraph/dask/community/louvain.py new file mode 100644 index 00000000000..06f3b47b3b4 --- /dev/null +++ b/python/cugraph/dask/community/louvain.py @@ -0,0 +1,117 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dask.distributed import wait, default_client + +import cugraph.comms.comms as Comms +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.structure.shuffle import shuffle +from cugraph.dask.community import louvain_wrapper as c_mg_louvain + + +def call_louvain(sID, + data, + num_verts, + num_edges, + partition_row_size, + partition_col_size, + vertex_partition_offsets, + sorted_by_degree, + max_level, + resolution): + + wid = Comms.get_worker_id(sID) + handle = Comms.get_handle(sID) + + return c_mg_louvain.louvain(data[0], + num_verts, + num_edges, + partition_row_size, + partition_col_size, + vertex_partition_offsets, + wid, + handle, + sorted_by_degree, + max_level, + resolution) + + +def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True): + """ + Compute the modularity optimizing partition of the input graph using the + Louvain method on multiple GPUs + + Examples + -------- + >>> import cugraph.dask as dcg + >>> Comms.initialize() + >>> chunksize = dcg.get_chunksize(input_data_path) + >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize, + delimiter=' ', + names=['src', 'dst', 'value'], + dtype=['int32', 'int32', 'float32']) + >>> dg = cugraph.Graph() + >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', + edge_attr='value') + >>> parts, modularity_score = dcg.louvain(dg) + """ + # FIXME: finish docstring: describe parameters, etc. + + # FIXME: import here to prevent circular import: cugraph->louvain + # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure + # from cugraph.structure.graph import Graph + + # FIXME: dask methods to populate graphs from edgelists are only present on + # DiGraph classes. Disable the Graph check for now and assume inputs are + # symmetric DiGraphs. 
+ # if type(graph) is not Graph: + # raise Exception("input graph must be undirected") + + client = default_client() + # Calling renumbering results in data that is sorted by degree + input_graph.compute_renumber_edge_list(transposed=False) + sorted_by_degree = True + (ddf, + num_verts, + partition_row_size, + partition_col_size, + vertex_partition_offsets) = shuffle(input_graph, transposed=False) + num_edges = len(ddf) + data = get_distributed_data(ddf) + + result = dict([(data.worker_info[wf[0]]["rank"], + client.submit( + call_louvain, + Comms.get_session_id(), + wf[1], + num_verts, + num_edges, + partition_row_size, + partition_col_size, + vertex_partition_offsets, + sorted_by_degree, + max_iter, + resolution, + workers=[wf[0]])) + for idx, wf in enumerate(data.worker_to_parts.items())]) + + wait(result) + + (parts, modularity_score) = result[0].result() + + if input_graph.renumbered: + # MG renumbering is lazy, but it's safe to assume it's been called at + # this point if renumbered=True + parts = input_graph.unrenumber(parts, "vertex") + + return parts, modularity_score diff --git a/python/cugraph/dask/community/louvain_wrapper.pyx b/python/cugraph/dask/community/louvain_wrapper.pyx new file mode 100644 index 00000000000..3d72a7c3bd6 --- /dev/null +++ b/python/cugraph/dask/community/louvain_wrapper.pyx @@ -0,0 +1,125 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from cugraph.dask.community cimport louvain as c_louvain +from cugraph.structure.graph_primtypes cimport * + +import cudf +import numpy as np + + +# FIXME: move this to a more reusable location +numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + + +def louvain(input_df, + num_global_verts, + num_global_edges, + partition_row_size, + partition_col_size, + vertex_partition_offsets, + rank, + handle, + sorted_by_degree, + max_level, + resolution): + """ + Call MG Louvain + """ + # FIXME: This must be imported here to prevent a circular import + from cugraph.structure import graph_primtypes_wrapper + + cdef size_t handle_size_t = handle.getHandle() + handle_ = handle_size_t + + final_modularity = None + + # FIXME: much of this code is common to other algo wrappers, consider adding + # this to a shared utility as well + + src = input_df['src'] + dst = input_df['dst'] + num_partition_edges = len(src) + + if "value" in input_df.columns: + weights = input_df['value'] + else: + weights = cudf.Series(np.full(num_partition_edges, 1.0, dtype=np.float32)) + + vertex_t = src.dtype + if num_global_edges > (2**31 - 1): + edge_t = np.dtype("int64") + else: + edge_t = np.dtype("int32") + weight_t = weights.dtype + + # COO + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = weights.__cuda_array_interface__['data'][0] + + # data is on device, move to host (.values_host) since graph_t in + # graph_container needs a host array + cdef uintptr_t c_vertex_partition_offsets = vertex_partition_offsets.values_host.__array_interface__['data'][0] + + 
cdef graph_container_t graph_container + + # FIXME: The excessive casting for the enum arg is needed to make cython + # understand how to pass the enum value (this is the same pattern + # used by cudf). This will not be needed with Cython 3.0 + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + c_vertex_partition_offsets, + ((numberTypeMap[vertex_t])), + ((numberTypeMap[edge_t])), + ((numberTypeMap[weight_t])), + num_partition_edges, + num_global_verts, num_global_edges, + partition_row_size, partition_col_size, + sorted_by_degree, + False, True) # store_transposed, multi_gpu + + # Create the output dataframe + df = cudf.DataFrame() + df['vertex'] = cudf.Series(np.zeros(num_global_verts, dtype=vertex_t)) + df['partition'] = cudf.Series(np.zeros(num_global_verts, dtype=vertex_t)) + + cdef uintptr_t c_identifiers = df['vertex'].__cuda_array_interface__['data'][0] + cdef uintptr_t c_partition = df['partition'].__cuda_array_interface__['data'][0] + + if weight_t == np.float32: + num_level, final_modularity_float = c_louvain.call_louvain[float]( + handle_[0], graph_container, + c_identifiers, c_partition, + max_level, resolution) + final_modularity = final_modularity_float + + else: + num_level, final_modularity_double = c_louvain.call_louvain[double]( + handle_[0], graph_container, + c_identifiers, c_partition, + max_level, resolution) + final_modularity = final_modularity_double + + return df, final_modularity diff --git a/python/cugraph/dask/link_analysis/mg_pagerank.pxd b/python/cugraph/dask/link_analysis/mg_pagerank.pxd index 4de9becf10d..429cb775e07 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank.pxd +++ b/python/cugraph/dask/link_analysis/mg_pagerank.pxd @@ -14,7 +14,7 @@ # limitations under the License. 
# -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool diff --git a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx index c5a72647e03..39b856e4946 100644 --- a/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx +++ b/python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx @@ -17,8 +17,8 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.link_analysis cimport mg_pagerank as c_pagerank import cudf -from cugraph.structure.graph_new cimport * -import cugraph.structure.graph_new_wrapper as graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cython.operator cimport dereference as deref @@ -41,12 +41,12 @@ def mg_pagerank(input_df, local_data, rank, handle, alpha=0.85, max_iter=100, to dst = dst - local_offset num_local_verts = local_data['verts'][rank] num_local_edges = len(src) - + cdef uintptr_t c_local_verts = local_data['verts'].__array_interface__['data'][0] cdef uintptr_t c_local_edges = local_data['edges'].__array_interface__['data'][0] cdef uintptr_t c_local_offsets = local_data['offsets'].__array_interface__['data'][0] - [src, dst] = graph_new_wrapper.datatype_cast([src, dst], [np.int32]) + [src, dst] = graph_primtypes_wrapper.datatype_cast([src, dst], [np.int32]) _offsets, indices, weights = coo2csr(dst, src, None) offsets = _offsets[:num_local_verts + 1] del _offsets @@ -56,11 +56,11 @@ def mg_pagerank(input_df, local_data, rank, handle, alpha=0.85, max_iter=100, to cdef uintptr_t c_identifier = df['vertex'].__cuda_array_interface__['data'][0]; cdef uintptr_t c_pagerank_val = df['pagerank'].__cuda_array_interface__['data'][0]; - + cdef uintptr_t c_pers_vtx = NULL cdef uintptr_t c_pers_val = NULL cdef int sz = 0 - + cdef uintptr_t c_offsets = 
offsets.__cuda_array_interface__['data'][0] cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL diff --git a/python/cugraph/dask/structure/replication.pyx b/python/cugraph/dask/structure/replication.pyx index 7256fa63448..6d579e126bf 100644 --- a/python/cugraph/dask/structure/replication.pyx +++ b/python/cugraph/dask/structure/replication.pyx @@ -18,7 +18,7 @@ from libc.stdint cimport uintptr_t from cugraph.structure cimport utils as c_utils -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t import cudf diff --git a/python/cugraph/dask/traversal/mg_bfs.pxd b/python/cugraph/dask/traversal/mg_bfs.pxd index 8b9e8c1c81f..68010e2b816 100644 --- a/python/cugraph/dask/traversal/mg_bfs.pxd +++ b/python/cugraph/dask/traversal/mg_bfs.pxd @@ -14,7 +14,7 @@ # limitations under the License. # -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool diff --git a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx index 66a2668a41f..4c13aeb1286 100644 --- a/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx +++ b/python/cugraph/dask/traversal/mg_bfs_wrapper.pyx @@ -17,8 +17,8 @@ from cugraph.structure.utils_wrapper import * from cugraph.dask.traversal cimport mg_bfs as c_bfs import cudf -from cugraph.structure.graph_new cimport * -import cugraph.structure.graph_new_wrapper as graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper from libc.stdint cimport uintptr_t def mg_bfs(input_df, local_data, rank, handle, start, result_len, return_distances=False): @@ -40,7 +40,7 @@ def mg_bfs(input_df, local_data, rank, handle, start, result_len, return_distanc num_local_edges = len(src) # Convert to local CSR - [src, dst] = graph_new_wrapper.datatype_cast([src, dst], 
[np.int32]) + [src, dst] = graph_primtypes_wrapper.datatype_cast([src, dst], [np.int32]) _offsets, indices, weights = coo2csr(src, dst, None) offsets = _offsets[:num_local_verts + 1] del _offsets diff --git a/python/cugraph/layout/force_atlas2.pxd b/python/cugraph/layout/force_atlas2.pxd index 3b1d64d31a1..a26abaa16c0 100644 --- a/python/cugraph/layout/force_atlas2.pxd +++ b/python/cugraph/layout/force_atlas2.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool cdef extern from "internals.hpp" namespace "cugraph::internals": diff --git a/python/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/layout/force_atlas2_wrapper.pyx index 128e5f61f3c..31bf8fc029e 100644 --- a/python/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/layout/force_atlas2_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 from cugraph.layout.force_atlas2 cimport force_atlas2 as c_force_atlas2 -from cugraph.structure import graph_new_wrapper -from cugraph.structure.graph_new cimport * +from cugraph.structure import graph_primtypes_wrapper +from cugraph.structure.graph_primtypes cimport * from cugraph.structure import utils_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t diff --git a/python/cugraph/link_analysis/hits.pxd b/python/cugraph/link_analysis/hits.pxd index 2efa417655a..60d25fd3cdb 100644 --- a/python/cugraph/link_analysis/hits.pxd +++ b/python/cugraph/link_analysis/hits.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool diff --git a/python/cugraph/link_analysis/hits.py b/python/cugraph/link_analysis/hits.py index c3b8a93c8ac..29827e1dd31 100644 --- a/python/cugraph/link_analysis/hits.py +++ b/python/cugraph/link_analysis/hits.py @@ -12,6 +12,8 @@ 
# limitations under the License. from cugraph.link_analysis import hits_wrapper +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_score_to_dictionary def hits(G, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): @@ -72,9 +74,17 @@ def hits(G, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): >>> hits = cugraph.hits(G, max_iter = 50) """ + G, isNx = check_nx_graph(G) + df = hits_wrapper.hits(G, max_iter, tol) if G.renumbered: df = G.unrenumber(df, "vertex") + if isNx is True: + d1 = df_score_to_dictionary(df[["vertex", "hubs"]], "hubs") + d2 = df_score_to_dictionary(df[["vertex", "authorities"]], + "authorities") + df = (d1, d2) + return df diff --git a/python/cugraph/link_analysis/hits_wrapper.pyx b/python/cugraph/link_analysis/hits_wrapper.pyx index 5f52df63fe8..3e19e38a023 100644 --- a/python/cugraph/link_analysis/hits_wrapper.pyx +++ b/python/cugraph/link_analysis/hits_wrapper.pyx @@ -17,10 +17,10 @@ # cython: language_level = 3 from cugraph.link_analysis.hits cimport hits as c_hits -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool from libc.stdint cimport uintptr_t -from cugraph.structure import graph_new_wrapper +from cugraph.structure import graph_primtypes_wrapper import cudf import rmm import numpy as np @@ -38,7 +38,7 @@ def hits(input_graph, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -59,7 +59,7 @@ def hits(input_graph, max_iter=100, tol=1.0e-5, nstart=None, normalized=True): cdef uintptr_t c_weights = NULL cdef 
GraphCSRView[int,int,float] graph_float - + graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, c_weights, num_verts, num_edges) c_hits[int,int,float](graph_float, max_iter, tol, NULL, diff --git a/python/cugraph/link_analysis/pagerank.pxd b/python/cugraph/link_analysis/pagerank.pxd index e5ec22a5d35..df94b95d72e 100644 --- a/python/cugraph/link_analysis/pagerank.pxd +++ b/python/cugraph/link_analysis/pagerank.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool diff --git a/python/cugraph/link_analysis/pagerank.py b/python/cugraph/link_analysis/pagerank.py index 69106f3bf2b..69133d62af7 100644 --- a/python/cugraph/link_analysis/pagerank.py +++ b/python/cugraph/link_analysis/pagerank.py @@ -13,10 +13,12 @@ from cugraph.link_analysis import pagerank_wrapper from cugraph.structure.graph import null_check +import cugraph def pagerank( - G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5, nstart=None + G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5, nstart=None, + weight=None, dangling=None ): """ Find the PageRank score for every vertex in a graph. cuGraph computes an @@ -28,7 +30,7 @@ def pagerank( Parameters ---------- - graph : cugraph.Graph + graph : cugraph.Graph or networkx.Graph cuGraph graph descriptor, should contain the connectivity information as an edge list (edge weights are not used for this algorithm). The transposed adjacency list will be computed if not already present. @@ -67,6 +69,13 @@ def pagerank( nstart['values'] : cudf.Series Pagerank values for vertices + weight : str + Edge data column to use. Default is None + This version of PageRank current does not use edge weight. 
+ This parameter is here for NetworkX compatibility + dangling : dict + This parameter is here for NetworkX compatibility and ignored + Returns ------- PageRank : cudf.DataFrame @@ -88,6 +97,8 @@ def pagerank( >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, tol = 1.0e-05) """ + G, isNx = cugraph.utilities.check_nx_graph(G, weight) + if personalization is not None: null_check(personalization["vertex"]) null_check(personalization["values"]) @@ -109,4 +120,7 @@ def pagerank( if G.renumbered: df = G.unrenumber(df, "vertex") - return df + if isNx is True: + return cugraph.utilities.df_score_to_dictionary(df, 'pagerank') + else: + return df diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 4b045264ead..9f4e555bbd9 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -18,10 +18,10 @@ #cimport cugraph.link_analysis.pagerank as c_pagerank from cugraph.link_analysis.pagerank cimport pagerank as c_pagerank -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool from libc.stdint cimport uintptr_t -from cugraph.structure import graph_new_wrapper +from cugraph.structure import graph_primtypes_wrapper import cudf import rmm import numpy as np @@ -39,8 +39,8 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. 
cdef unique_ptr[handle_t] handle_ptr handle_ptr.reset(new handle_t()) - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, input_graph.transposedadjlist.indices], [np.int32]) - [weights] = graph_new_wrapper.datatype_cast([input_graph.transposedadjlist.weights], [np.float32, np.float64]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, input_graph.transposedadjlist.indices], [np.int32]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.weights], [np.float32, np.float64]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -74,21 +74,21 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. cdef GraphCSCView[int,int,float] graph_float cdef GraphCSCView[int,int,double] graph_double - + if personalization is not None: sz = personalization['vertex'].shape[0] personalization['vertex'] = personalization['vertex'].astype(np.int32) personalization['values'] = personalization['values'].astype(df['pagerank'].dtype) c_pers_vtx = personalization['vertex'].__cuda_array_interface__['data'][0] c_pers_val = personalization['values'].__cuda_array_interface__['data'][0] - - if (df['pagerank'].dtype == np.float32): + + if (df['pagerank'].dtype == np.float32): graph_float = GraphCSCView[int,int,float](c_offsets, c_indices, c_weights, num_verts, num_edges) c_pagerank[int,int,float](handle_ptr.get()[0], graph_float, c_pagerank_val, sz, c_pers_vtx, c_pers_val, alpha, tol, max_iter, has_guess) graph_float.get_vertex_identifiers(c_identifier) - else: + else: graph_double = GraphCSCView[int,int,double](c_offsets, c_indices, c_weights, num_verts, num_edges) c_pagerank[int,int,double](handle_ptr.get()[0], graph_double, c_pagerank_val, sz, c_pers_vtx, c_pers_val, alpha, tol, max_iter, has_guess) diff --git a/python/cugraph/link_prediction/__init__.py 
b/python/cugraph/link_prediction/__init__.py index d0912c73751..f787ae10dd9 100644 --- a/python/cugraph/link_prediction/__init__.py +++ b/python/cugraph/link_prediction/__init__.py @@ -12,6 +12,9 @@ # limitations under the License. from cugraph.link_prediction.jaccard import jaccard +from cugraph.link_prediction.jaccard import jaccard_coefficient from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.wjaccard import jaccard_w from cugraph.link_prediction.woverlap import overlap_w +from cugraph.link_prediction.jaccard import jaccard_coefficient +from cugraph.link_prediction.overlap import overlap_coefficient diff --git a/python/cugraph/link_prediction/jaccard.pxd b/python/cugraph/link_prediction/jaccard.pxd index 4cb5a46fe53..bc55bb2cdf0 100644 --- a/python/cugraph/link_prediction/jaccard.pxd +++ b/python/cugraph/link_prediction/jaccard.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": @@ -25,7 +25,7 @@ cdef extern from "algorithms.hpp" namespace "cugraph": const GraphCSRView[VT,ET,WT] &graph, const WT *weights, WT *result) except + - + cdef void jaccard_list[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const WT *weights, diff --git a/python/cugraph/link_prediction/jaccard.py b/python/cugraph/link_prediction/jaccard.py index e2160a0a803..71cf0925342 100644 --- a/python/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/link_prediction/jaccard.py @@ -11,10 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pandas as pd +import cudf from cugraph.structure.graph import Graph from cugraph.link_prediction import jaccard_wrapper from cugraph.structure.graph import null_check -import cudf +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_edge_score_to_dictionary def jaccard(input_graph, vertex_pair=None): @@ -127,3 +130,65 @@ def jaccard(input_graph, vertex_pair=None): df = input_graph.unrenumber(df, "destination") return df + + +def jaccard_coefficient(G, ebunch=None): + """ + For NetworkX Compatability. See `jaccard` + + Parameters + ---------- + graph : cugraph.Graph + cuGraph graph descriptor, should contain the connectivity information + as an edge list (edge weights are not used for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both direction. The adjacency list will be computed if + not already present. + ebunch : cudf.DataFrame + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the jaccard coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the jaccard coefficient for all + adjacent vertices in the graph. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Jaccard weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. 
+ + df['source'] : cudf.Series + The source vertex ID (will be identical to first if specified) + df['destination'] : cudf.Series + The destination vertex ID (will be identical to second if + specified) + df['jaccard_coeff'] : cudf.Series + The computed Jaccard coefficient between the source and destination + vertices + + Examples + -------- + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(gdf, source='0', destination='1') + >>> df = cugraph.jaccard_coefficient(G) + """ + vertex_pair = None + + G, isNx = check_nx_graph(G) + + if isNx is True and ebunch is not None: + vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch)) + + df = jaccard(G, vertex_pair) + + if isNx is True: + df = df_edge_score_to_dictionary(df, + k="jaccard_coeff", + src="source", + dst="destination") + + return df diff --git a/python/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/link_prediction/jaccard_wrapper.pyx index 24e2ca429f5..cacd13dec65 100644 --- a/python/cugraph/link_prediction/jaccard_wrapper.pyx +++ b/python/cugraph/link_prediction/jaccard_wrapper.pyx @@ -18,8 +18,8 @@ from cugraph.link_prediction.jaccard cimport jaccard as c_jaccard from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cython cimport floating @@ -35,21 +35,21 @@ def jaccard(input_graph, weights_arr=None, vertex_pair=None): indices = None if input_graph.adjlist: - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, + input_graph.adjlist.indices], [np.int32]) elif 
input_graph.transposedadjlist: # # NOTE: jaccard ONLY operates on an undirected graph, so CSR and CSC should be # equivalent. The undirected check has already happened, so we'll just use # the CSC as if it were CSR. # - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, - input_graph.transposedadjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, + input_graph.transposedadjlist.indices], [np.int32]) else: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, + input_graph.adjlist.indices], [np.int32]) + num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) @@ -71,7 +71,7 @@ def jaccard(input_graph, weights_arr=None, vertex_pair=None): weight_type = np.float32 if weights_arr is not None: - [weights] = graph_new_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) + [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) c_weights = weights.__cuda_array_interface__['data'][0] weight_type = weights.dtype @@ -111,7 +111,7 @@ def jaccard(input_graph, weights_arr=None, vertex_pair=None): c_first_col, c_second_col, c_result_col) - + return df else: # error check performed in jaccard.py @@ -153,5 +153,5 @@ def jaccard(input_graph, weights_arr=None, vertex_pair=None): c_result_col) graph_double.get_source_indices(c_src_index_col) - + return df diff --git a/python/cugraph/link_prediction/overlap.pxd b/python/cugraph/link_prediction/overlap.pxd index 5f8c8ee8449..970032b56eb 100644 --- a/python/cugraph/link_prediction/overlap.pxd +++ b/python/cugraph/link_prediction/overlap.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from 
cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": @@ -25,7 +25,7 @@ cdef extern from "algorithms.hpp" namespace "cugraph": const GraphCSRView[VT,ET,WT] &graph, const WT *weights, WT *result) except + - + cdef void overlap_list[VT,ET,WT]( const GraphCSRView[VT,ET,WT] &graph, const WT *weights, diff --git a/python/cugraph/link_prediction/overlap.py b/python/cugraph/link_prediction/overlap.py index c9aa216095e..a5ca1e22979 100644 --- a/python/cugraph/link_prediction/overlap.py +++ b/python/cugraph/link_prediction/overlap.py @@ -11,9 +11,35 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd from cugraph.link_prediction import overlap_wrapper from cugraph.structure.graph import null_check import cudf +from cugraph.utilities import check_nx_graph +from cugraph.utilities import df_edge_score_to_dictionary + + +def overlap_coefficient(G, ebunch=None): + """ + NetworkX similar API. 
See 'jaccard' for a description + + """ + vertex_pair = None + + G, isNx = check_nx_graph(G) + + if isNx is True and ebunch is not None: + vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch)) + + df = overlap(G, vertex_pair) + + if isNx is True: + df = df_edge_score_to_dictionary(df, + k="overlap_coeff", + src="source", + dst="destination") + + return df def overlap(input_graph, vertex_pair=None): diff --git a/python/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/link_prediction/overlap_wrapper.pyx index 61b04d0d315..9e2f3ba49d7 100644 --- a/python/cugraph/link_prediction/overlap_wrapper.pyx +++ b/python/cugraph/link_prediction/overlap_wrapper.pyx @@ -18,8 +18,8 @@ from cugraph.link_prediction.overlap cimport overlap as c_overlap from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cython cimport floating @@ -35,14 +35,14 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): if not input_graph.adjlist: input_graph.view_adj_list() - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges(directed_edges=True) first = None second = None - + cdef uintptr_t c_result_col = NULL cdef uintptr_t c_first_col = NULL cdef uintptr_t c_second_col = NULL @@ -58,7 +58,7 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): weight_type = np.float32 if weights_arr is not None: - [weights] = graph_new_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) + [weights] = 
graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) c_weights = weights.__cuda_array_interface__['data'][0] weight_type = weights.dtype @@ -69,7 +69,7 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): df = cudf.DataFrame() df['overlap_coeff'] = result - + first = vertex_pair['first'] second = vertex_pair['second'] @@ -97,7 +97,7 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): c_first_col, c_second_col, c_result_col) - + return df else: # error check performed in overlap.py @@ -139,5 +139,5 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None): c_result_col) graph_double.get_source_indices(c_src_index_col) - + return df diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py index b43f4f3ebfa..511e6773469 100644 --- a/python/cugraph/structure/__init__.py +++ b/python/cugraph/structure/__init__.py @@ -13,6 +13,7 @@ from cugraph.structure.graph import Graph, DiGraph from cugraph.structure.number_map import NumberMap -from cugraph.structure.symmetrize import symmetrize, symmetrize_df +from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf from cugraph.structure.convert_matrix import from_cudf_edgelist from cugraph.structure.hypergraph import hypergraph +from cugraph.structure.shuffle import shuffle diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index 0266a158bb1..56bb9086380 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -23,7 +23,8 @@ def from_cudf_edgelist(df, source='source', destination='destination', """ Return a new graph created from the edge list representaion. This function is added for NetworkX compatibility (this function is a RAPIDS version of - NetworkX's from_pandas_edge_list()). + NetworkX's from_pandas_edge_list()). This function does not support + multiple source or destination columns. 
But does support renumbering Parameters ---------- @@ -33,12 +34,17 @@ def from_cudf_edgelist(df, source='source', destination='destination', (optional) weights. source : string or integer This is used to index the source column. - target : string or integer + destination : string or integer This is used to index the destination (or target following NetworkX's terminology) column. - weight : string or integer, optional + edge_attr : string or integer, optional This pointer can be ``None``. If not, this is used to index the weight column. + create_using : cuGraph.Graph + Specify the type of Graph to create. Default is cugraph.Graph + renumber : bool + If source and destination indices are not in range 0 to V where V + is number of vertices, renumber argument should be True. Examples -------- diff --git a/python/cugraph/structure/graph.pxd b/python/cugraph/structure/graph.pxd deleted file mode 100644 index 2343a0604dc..00000000000 --- a/python/cugraph/structure/graph.pxd +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from rmm._lib.device_buffer cimport device_buffer - -cdef extern from "raft/handle.hpp" namespace "raft": - cdef cppclass handle_t: - handle_t() except + - -cdef extern from "graph.hpp" namespace "cugraph": - - ctypedef enum PropType: - PROP_UNDEF "cugraph::PROP_UNDEF" - PROP_FALSE "cugraph::PROP_FALSE" - PROP_TRUE "cugraph::PROP_TRUE" - - ctypedef enum DegreeDirection: - DIRECTION_IN_PLUS_OUT "cugraph::DegreeDirection::IN_PLUS_OUT" - DIRECTION_IN "cugraph::DegreeDirection::IN" - DIRECTION_OUT "cugraph::DegreeDirection::OUT" - - struct GraphProperties: - bool directed - bool weighted - bool multigraph - bool bipartite - bool tree - PropType has_negative_edges - - cdef cppclass GraphViewBase[VT,ET,WT]: - WT *edge_data - handle_t *handle; - GraphProperties prop - VT number_of_vertices - ET number_of_edges - VT* local_vertices - ET* local_edges - VT* local_offsets - void set_handle(handle_t*) - void set_local_data(VT* local_vertices_, ET* local_edges_, VT* local_offsets_) - void get_vertex_identifiers(VT *) const - - GraphViewBase(WT*,VT,ET) - - cdef cppclass GraphCOOView[VT,ET,WT](GraphViewBase[VT,ET,WT]): - VT *src_indices - VT *dst_indices - - void degree(ET *,DegreeDirection) const - - GraphCOOView() - GraphCOOView(const VT *, const ET *, const WT *, size_t, size_t) - - cdef cppclass GraphCompressedSparseBaseView[VT,ET,WT](GraphViewBase[VT,ET,WT]): - ET *offsets - VT *indices - - void get_source_indices(VT *) const - void degree(ET *,DegreeDirection) const - - GraphCompressedSparseBaseView(const VT *, const ET *, const WT *, size_t, size_t) - - cdef cppclass GraphCSRView[VT,ET,WT](GraphCompressedSparseBaseView[VT,ET,WT]): - GraphCSRView() - GraphCSRView(const VT *, const ET *, const WT *, size_t, size_t) - - cdef cppclass 
GraphCSCView[VT,ET,WT](GraphCompressedSparseBaseView[VT,ET,WT]): - GraphCSCView() - GraphCSCView(const VT *, const ET *, const WT *, size_t, size_t) - - cdef cppclass GraphCOOContents[VT,ET,WT]: - VT number_of_vertices - ET number_of_edges - unique_ptr[device_buffer] src_indices - unique_ptr[device_buffer] dst_indices - unique_ptr[device_buffer] edge_data - - cdef cppclass GraphCOO[VT,ET,WT]: - GraphCOO( - VT nv, - ET ne, - bool has_data) except+ - GraphCOOContents[VT,ET,WT] release() - GraphCOOView[VT,ET,WT] view() - - cdef cppclass GraphSparseContents[VT,ET,WT]: - VT number_of_vertices - ET number_of_edges - unique_ptr[device_buffer] offsets - unique_ptr[device_buffer] indices - unique_ptr[device_buffer] edge_data - - cdef cppclass GraphCSC[VT,ET,WT]: - GraphCSC( - VT nv, - ET ne, - bool has_data) except+ - GraphSparseContents[VT,ET,WT] release() - GraphCSCView[VT,ET,WT] view() - - cdef cppclass GraphCSR[VT,ET,WT]: - GraphCSR( - VT nv, - ET ne, - bool has_data) except+ - GraphSparseContents[VT,ET,WT] release() - GraphCSRView[VT,ET,WT] view() - - - -cdef extern from "algorithms.hpp" namespace "cugraph": - - cdef unique_ptr[GraphCOO[VT, ET, WT]] get_two_hop_neighbors[VT,ET,WT]( - const GraphCSRView[VT, ET, WT] &graph) except + - -cdef extern from "functions.hpp" namespace "cugraph": - - cdef unique_ptr[device_buffer] renumber_vertices[VT_IN,VT_OUT,ET]( - ET number_of_edges, - const VT_IN *src, - const VT_IN *dst, - VT_OUT *src_renumbered, - VT_OUT *dst_renumbered, - ET *map_size) except + - - -cdef extern from "" namespace "std" nogil: - cdef unique_ptr[GraphCOO[int,int,float]] move(unique_ptr[GraphCOO[int,int,float]]) - cdef unique_ptr[GraphCOO[int,int,double]] move(unique_ptr[GraphCOO[int,int,double]]) - cdef GraphCOOContents[int,int,float] move(GraphCOOContents[int,int,float]) - cdef GraphCOOContents[int,int,double] move(GraphCOOContents[int,int,double]) - cdef device_buffer move(device_buffer) - cdef unique_ptr[device_buffer] move(unique_ptr[device_buffer]) - 
cdef unique_ptr[GraphCSR[int,int,float]] move(unique_ptr[GraphCSR[int,int,float]]) - cdef unique_ptr[GraphCSR[int,int,double]] move(unique_ptr[GraphCSR[int,int,double]]) - cdef GraphSparseContents[int,int,float] move(GraphSparseContents[int,int,float]) - cdef GraphSparseContents[int,int,double] move(GraphSparseContents[int,int,double]) - -ctypedef unique_ptr[GraphCOO[int,int,float]] GraphCOOPtrFloat -ctypedef unique_ptr[GraphCOO[int,int,double]] GraphCOOPtrDouble - -ctypedef fused GraphCOOPtrType: - GraphCOOPtrFloat - GraphCOOPtrDouble - -ctypedef unique_ptr[GraphCSR[int,int,float]] GraphCSRPtrFloat -ctypedef unique_ptr[GraphCSR[int,int,double]] GraphCSRPtrDouble - -ctypedef fused GraphCSRPtrType: - GraphCSRPtrFloat - GraphCSRPtrDouble - -ctypedef GraphCOOView[int,int,float] GraphCOOViewFloat -ctypedef GraphCOOView[int,int,double] GraphCOOViewDouble -ctypedef GraphCSRView[int,int,float] GraphCSRViewFloat -ctypedef GraphCSRView[int,int,double] GraphCSRViewDouble - -ctypedef fused GraphCOOViewType: - GraphCOOViewFloat - GraphCOOViewDouble - -ctypedef fused GraphCSRViewType: - GraphCSRViewFloat - GraphCSRViewDouble - -ctypedef fused GraphViewType: - GraphCOOViewFloat - GraphCOOViewDouble - GraphCSRViewFloat - GraphCSRViewDouble - -cdef coo_to_df(GraphCOOPtrType graph) -cdef csr_to_series(GraphCSRPtrType graph) -cdef GraphViewType get_graph_view(input_graph, bool weightless=*, GraphViewType* dummy=*) diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index c918cd44ae2..ce63eb52683 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cugraph.structure import graph_new_wrapper +from cugraph.structure import graph_primtypes_wrapper from cugraph.structure.symmetrize import symmetrize from cugraph.structure.number_map import NumberMap from cugraph.dask.common.input_utils import get_local_data @@ -133,8 +133,10 @@ def enable_batch(self): comms = Comms.get_comms() if client is None or comms is None: - msg = "MG Batch needs a Dask Client and the " \ + msg = ( + "MG Batch needs a Dask Client and the " "Communicator needs to be initialized." + ) raise Exception(msg) self.batch_enabled = True @@ -159,9 +161,8 @@ def _replicate_edgelist(self): if client is None: return work_futures = replication.replicate_cudf_dataframe( - self.edgelist.edgelist_df, - client=client, - comms=comms) + self.edgelist.edgelist_df, client=client, comms=comms + ) self.batch_edgelists = work_futures @@ -175,22 +176,25 @@ def _replicate_adjlist(self): weights = None offsets_futures = replication.replicate_cudf_series( - self.adjlist.offsets, - client=client, - comms=comms) + self.adjlist.offsets, client=client, comms=comms + ) indices_futures = replication.replicate_cudf_series( - self.adjlist.indices, - client=client, - comms=comms) + self.adjlist.indices, client=client, comms=comms + ) if self.adjlist.weights is not None: weights = replication.replicate_cudf_series(self.adjlist.weights) else: weights = {worker: None for worker in offsets_futures} - merged_futures = {worker: [offsets_futures[worker], - indices_futures[worker], weights[worker]] - for worker in offsets_futures} + merged_futures = { + worker: [ + offsets_futures[worker], + indices_futures[worker], + weights[worker], + ] + for worker in offsets_futures + } self.batch_adjlists = merged_futures # FIXME: Not implemented yet @@ -227,23 +231,29 @@ def add_nodes_from(self, nodes, bipartite=None, multipartite=None): nodes of the partition named as multipartite argument. 
""" if bipartite is None and multipartite is None: - self._nodes['all_nodes'] = cudf.Series(nodes) + self._nodes["all_nodes"] = cudf.Series(nodes) else: - set_names = [i for i in self._nodes.keys() if i != 'all_nodes'] + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] if multipartite is not None: if self.bipartite: - raise Exception("The Graph is already set as bipartite. " - "Use bipartite option instead.") + raise Exception( + "The Graph is already set as bipartite. " + "Use bipartite option instead." + ) self.multipartite = True elif bipartite is not None: if self.multipartite: - raise Exception("The Graph is set as multipartite. " - "Use multipartite option instead.") + raise Exception( + "The Graph is set as multipartite. " + "Use multipartite option instead." + ) self.bipartite = True multipartite = bipartite if multipartite not in set_names and len(set_names) == 2: - raise Exception("The Graph is set as bipartite and " - "already has two partitions initialized.") + raise Exception( + "The Graph is set as bipartite and " + "already has two partitions initialized." + ) self._nodes[multipartite] = cudf.Series(nodes) def is_bipartite(self): @@ -273,14 +283,15 @@ def sets(self): graph is not bipartite. 
""" # TO DO: Call coloring algorithm - set_names = [i for i in self._nodes.keys() if i != 'all_nodes'] + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] if self.bipartite: top = self._nodes[set_names[0]] if len(set_names) == 2: bottom = self._nodes[set_names[1]] else: - bottom = cudf.Series(set(self.nodes().values_host) - - set(top.values_host)) + bottom = cudf.Series( + set(self.nodes().values_host) - set(top.values_host) + ) return top, bottom else: return {k: self._nodes[k] for k in set_names} @@ -337,45 +348,65 @@ def from_cudf_edgelist( Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2', - renumber=False) + >>> G.from_cudf_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) """ if self.edgelist is not None or self.adjlist is not None: raise Exception("Graph already has values") + s_col = source + d_col = destination + if not isinstance(s_col, list): + s_col = [s_col] + if not isinstance(d_col, list): + d_col = [d_col] + if not ( + set(s_col).issubset(set(input_df.columns)) + and set(d_col).issubset(set(input_df.columns)) + ): + raise Exception( + "source column names and/or destination column \ +names not found in input. 
Recheck the source and destination parameters" + ) + # Consolidation if isinstance(input_df, cudf.DataFrame): if len(input_df[source]) > 2147483100: - raise Exception('cudf dataFrame edge list is too big \ - to fit in a single GPU') + raise Exception( + "cudf dataFrame edge list is too big \ + to fit in a single GPU" + ) elist = input_df elif isinstance(input_df, dask_cudf.DataFrame): if len(input_df[source]) > 2147483100: - raise Exception('dask_cudf dataFrame edge list is too big \ - to fit in a single GPU') + raise Exception( + "dask_cudf dataFrame edge list is too big \ + to fit in a single GPU" + ) elist = input_df.compute().reset_index(drop=True) else: - raise Exception('input should be a cudf.DataFrame or \ - a dask_cudf dataFrame') + raise Exception( + "input should be a cudf.DataFrame or \ + a dask_cudf dataFrame" + ) renumber_map = None if renumber: # FIXME: Should SG do lazy evaluation like MG? elist, renumber_map = NumberMap.renumber( - elist, source, destination, - store_transposed=False + elist, source, destination, store_transposed=False ) - source = 'src' - destination = 'dst' + source = "src" + destination = "dst" self.renumbered = True self.renumber_map = renumber_map else: if type(source) is list and type(destination) is list: - raise Exception('set renumber to True for multi column ids') + raise Exception("set renumber to True for multi column ids") source_col = elist[source] dest_col = elist[destination] @@ -399,18 +430,21 @@ def from_cudf_edgelist( else: source_col, dest_col = symmetrize(source_col, dest_col) - self.edgelist = Graph.EdgeList( - source_col, dest_col, value_col - ) + self.edgelist = Graph.EdgeList(source_col, dest_col, value_col) if self.batch_enabled: self._replicate_edgelist() self.renumber_map = renumber_map - def from_dask_cudf_edgelist(self, input_ddf, source='source', - destination='destination', - edge_attr=None, renumber=True): + def from_dask_cudf_edgelist( + self, + input_ddf, + source="source", + 
destination="destination", + edge_attr=None, + renumber=True, + ): """ Initializes the distributed graph from the dask_cudf.DataFrame edgelist. Undirected Graphs are not currently supported. @@ -439,12 +473,26 @@ def from_dask_cudf_edgelist(self, input_ddf, source='source', is number of vertices, renumber argument should be True. """ if self.edgelist is not None or self.adjlist is not None: - raise Exception('Graph already has values') + raise Exception("Graph already has values") if not isinstance(input_ddf, dask_cudf.DataFrame): - raise Exception('input should be a dask_cudf dataFrame') + raise Exception("input should be a dask_cudf dataFrame") if type(self) is Graph: - raise Exception('Undirected distributed graph not supported') - + raise Exception("Undirected distributed graph not supported") + + s_col = source + d_col = destination + if not isinstance(s_col, list): + s_col = [s_col] + if not isinstance(d_col, list): + d_col = [d_col] + if not ( + set(s_col).issubset(set(input_ddf.columns)) + and set(d_col).issubset(set(input_ddf.columns)) + ): + raise Exception( + "source column names and/or destination column \ +names not found in input. 
Recheck the source and destination parameters" + ) # # Keep all of the original parameters so we can lazily # evaluate this function @@ -482,10 +530,10 @@ def compute_local_data(self, by, load_balance=True): if self.distributed: data = get_local_data(self, by, load_balance) self.local_data = {} - self.local_data['data'] = data - self.local_data['by'] = by + self.local_data["data"] = data + self.local_data["by"] = by else: - raise Exception('Graph should be a distributed graph') + raise Exception("Graph should be a distributed graph") def view_edge_list(self): """ @@ -526,7 +574,7 @@ def view_edge_list(self): raise Exception("Graph has no Edgelist.") return self.edgelist.edgelist_df if self.edgelist is None: - src, dst, weights = graph_new_wrapper.view_edge_list(self) + src, dst, weights = graph_primtypes_wrapper.view_edge_list(self) self.edgelist = self.EdgeList(src, dst, weights) edgelist_df = self.edgelist.edgelist_df @@ -586,9 +634,9 @@ def from_cudf_adjlist(self, offset_col, index_col, value_col=None): Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> M = M.to_pandas() + >>> M = gdf.to_pandas() >>> M = scipy.sparse.coo_matrix((M['2'],(M['0'],M['1']))) >>> M = M.tocsr() >>> offsets = cudf.Series(M.indptr) @@ -653,9 +701,10 @@ def compute_renumber_edge_list(self, transposed=False): del self.edgelist renumbered_ddf, number_map = NumberMap.renumber( - self.input_df, self.source_columns, + self.input_df, + self.source_columns, self.destination_columns, - store_transposed=transposed + store_transposed=transposed, ) self.edgelist = self.EdgeList(renumbered_ddf) self.renumber_map = number_map @@ -696,7 +745,7 @@ def view_adj_list(self): self.transposedadjlist.weights, ) else: - off, ind, vals = graph_new_wrapper.view_adj_list(self) + off, ind, vals = graph_primtypes_wrapper.view_adj_list(self) self.adjlist = 
self.AdjList(off, ind, vals) if self.batch_enabled: @@ -739,9 +788,11 @@ def view_transposed_adj_list(self): self.adjlist.weights, ) else: - off, ind, vals = graph_new_wrapper.view_transposed_adj_list( - self - ) + ( + off, + ind, + vals, + ) = graph_primtypes_wrapper.view_transposed_adj_list(self) self.transposedadjlist = self.transposedAdjList(off, ind, vals) if self.batch_enabled: @@ -776,7 +827,7 @@ def get_two_hop_neighbors(self): """ if self.distributed: raise Exception("Not supported for distributed graph") - df = graph_new_wrapper.get_two_hop_neighbors(self) + df = graph_primtypes_wrapper.get_two_hop_neighbors(self) if self.renumbered is True: df = self.unrenumber(df, "first") df = self.unrenumber(df, "second") @@ -791,12 +842,12 @@ def number_of_vertices(self): if self.node_count is None: if self.distributed: if self.edgelist is not None: - ddf = self.edgelist.edgelist_df[['src', 'dst']] + ddf = self.edgelist.edgelist_df[["src", "dst"]] self.node_count = ddf.max().max().compute() + 1 else: raise Exception("Graph is Empty") elif self.adjlist is not None: - self.node_count = len(self.adjlist.offsets)-1 + self.node_count = len(self.adjlist.offsets) - 1 elif self.transposedadjlist is not None: self.node_count = len(self.transposedadjlist.offsets) - 1 elif self.edgelist is not None: @@ -823,7 +874,7 @@ def number_of_edges(self, directed_edges=False): if self.edgelist is not None: return len(self.edgelist.edgelist_df) else: - raise ValueError('Graph is Empty') + raise ValueError("Graph is Empty") if directed_edges and self.edgelist is not None: return len(self.edgelist.edgelist_df) if self.edge_count is None: @@ -984,6 +1035,11 @@ def degrees(self, vertex_subset=None): Returns ------- df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degrees. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. 
+ df['vertex'] : cudf.Series The vertex IDs (will be identical to vertex_subset if specified). @@ -1003,9 +1059,11 @@ def degrees(self, vertex_subset=None): """ if self.distributed: raise Exception("Not supported for distributed graph") - vertex_col, in_degree_col, out_degree_col = graph_new_wrapper._degrees( - self - ) + ( + vertex_col, + in_degree_col, + out_degree_col, + ) = graph_primtypes_wrapper._degrees(self) df = cudf.DataFrame() df["vertex"] = vertex_col @@ -1021,7 +1079,7 @@ def degrees(self, vertex_subset=None): return df def _degree(self, vertex_subset, x=0): - vertex_col, degree_col = graph_new_wrapper._degree(self, x) + vertex_col, degree_col = graph_primtypes_wrapper._degree(self, x) df = cudf.DataFrame() df["vertex"] = vertex_col df["degree"] = degree_col @@ -1087,8 +1145,7 @@ def to_undirected(self): >>> G = DiG.to_undirected() """ - if self.distributed: - raise Exception("Not supported for distributed graph") + if type(self) is Graph: return self if type(self) is DiGraph: @@ -1096,6 +1153,7 @@ def to_undirected(self): df = self.edgelist.edgelist_df G.renumbered = self.renumbered G.renumber_map = self.renumber_map + G.multi = self.multi if self.edgelist.weights: source_col, dest_col, value_col = symmetrize( df["src"], df["dst"], df["weights"] @@ -1103,9 +1161,7 @@ def to_undirected(self): else: source_col, dest_col = symmetrize(df["src"], df["dst"]) value_col = None - G.edgelist = Graph.EdgeList( - source_col, dest_col, value_col - ) + G.edgelist = Graph.EdgeList(source_col, dest_col, value_col) return G @@ -1122,7 +1178,7 @@ def has_node(self, n): if self.edgelist is None: raise Exception("Graph has no Edgelist.") if self.distributed: - ddf = self.edgelist.edgelist_df[['src', 'dst']] + ddf = self.edgelist.edgelist_df[["src", "dst"]] return (ddf == n).any().any().compute() if self.renumbered: tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) @@ -1138,19 +1194,19 @@ def has_edge(self, u, v): if self.edgelist is None: raise 
Exception("Graph has no Edgelist.") if self.renumbered: - tmp = cudf.DataFrame({'src': [u, v]}) - tmp = tmp.astype({'src': 'int'}) + tmp = cudf.DataFrame({"src": [u, v]}) + tmp = tmp.astype({"src": "int"}) tmp = self.add_internal_vertex_id( - tmp, 'id', 'src', preserve_order=True + tmp, "id", "src", preserve_order=True ) - u = tmp['id'][0] - v = tmp['id'][1] + u = tmp["id"][0] + v = tmp["id"][1] df = self.edgelist.edgelist_df if self.distributed: - return ((df['src'] == u) & (df['dst'] == v)).any().compute() - return ((df['src'] == u) & (df['dst'] == v)).any() + return ((df["src"] == u) & (df["dst"] == v)).any().compute() + return ((df["src"] == u) & (df["dst"] == v)).any() def edges(self): """ @@ -1177,11 +1233,11 @@ def nodes(self): return self.renumber_map.implementation.df["0"] else: return cudf.concat([df["src"], df["dst"]]).unique() - if 'all_nodes' in self._nodes.keys(): - return self._nodes['all_nodes'] + if "all_nodes" in self._nodes.keys(): + return self._nodes["all_nodes"] else: - n = cudf.Series(dtype='int') - set_names = [i for i in self._nodes.keys() if i != 'all_nodes'] + n = cudf.Series(dtype="int") + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] for k in set_names: n = n.append(self._nodes[k]) return n @@ -1191,7 +1247,7 @@ def neighbors(self, n): raise Exception("Graph has no Edgelist.") if self.distributed: ddf = self.edgelist.edgelist_df - return ddf[ddf['src'] == n]['dst'].reset_index(drop=True) + return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) if self.renumbered: node = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) if len(node) == 0: @@ -1267,9 +1323,14 @@ def lookup_internal_vertex_id(self, df, column_name=None): """ return self.renumber_map.to_internal_vertex_id(df, column_name) - def add_internal_vertex_id(self, df, internal_column_name, - external_column_name, - drop=True, preserve_order=False): + def add_internal_vertex_id( + self, + df, + internal_column_name, + external_column_name, + drop=True, + 
preserve_order=False, + ): """ Given a DataFrame containing external vertex ids in the identified columns, return a DataFrame containing the internal vertex ids as the @@ -1301,8 +1362,12 @@ def add_internal_vertex_id(self, df, internal_column_name, id """ return self.renumber_map.add_internal_vertex_id( - df, internal_column_name, external_column_name, - drop, preserve_order) + df, + internal_column_name, + external_column_name, + drop, + preserve_order, + ) class DiGraph(Graph): diff --git a/python/cugraph/structure/graph_new.pxd b/python/cugraph/structure/graph_primtypes.pxd similarity index 78% rename from python/cugraph/structure/graph_new.pxd rename to python/cugraph/structure/graph_primtypes.pxd index 2343a0604dc..2879436690f 100644 --- a/python/cugraph/structure/graph_new.pxd +++ b/python/cugraph/structure/graph_primtypes.pxd @@ -190,3 +190,56 @@ ctypedef fused GraphViewType: cdef coo_to_df(GraphCOOPtrType graph) cdef csr_to_series(GraphCSRPtrType graph) cdef GraphViewType get_graph_view(input_graph, bool weightless=*, GraphViewType* dummy=*) + + +# C++ utilities specifically for Cython +cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": + + ctypedef enum numberTypeEnum: + int32Type "cugraph::cython::numberTypeEnum::int32Type" + int64Type "cugraph::cython::numberTypeEnum::int64Type" + floatType "cugraph::cython::numberTypeEnum::floatType" + doubleType "cugraph::cython::numberTypeEnum::doubleType" + + cdef cppclass graph_container_t: + pass + + cdef void populate_graph_container( + graph_container_t &graph_container, + handle_t &handle, + void *src_vertices, + void *dst_vertices, + void *weights, + void *vertex_partition_offsets, + numberTypeEnum vertexType, + numberTypeEnum edgeType, + numberTypeEnum weightType, + size_t num_partition_edges, + size_t num_global_vertices, + size_t num_global_edges, + size_t row_comm_size, + size_t col_comm_size, + bool sorted_by_degree, + bool transposed, + bool multi_gpu) except + + + ctypedef enum 
graphTypeEnum: + LegacyCSR "cugraph::cython::graphTypeEnum::LegacyCSR" + LegacyCSC "cugraph::cython::graphTypeEnum::LegacyCSC" + LegacyCOO "cugraph::cython::graphTypeEnum::LegacyCOO" + + cdef void populate_graph_container_legacy( + graph_container_t &graph_container, + graphTypeEnum legacyType, + const handle_t &handle, + void *offsets, + void *indices, + void *weights, + numberTypeEnum offsetType, + numberTypeEnum indexType, + numberTypeEnum weightType, + size_t num_global_vertices, + size_t num_global_edges, + int *local_vertices, + int *local_edges, + int *local_offsets) except + diff --git a/python/cugraph/structure/graph_new.pyx b/python/cugraph/structure/graph_primtypes.pyx similarity index 100% rename from python/cugraph/structure/graph_new.pyx rename to python/cugraph/structure/graph_primtypes.pyx diff --git a/python/cugraph/structure/graph_new_wrapper.pyx b/python/cugraph/structure/graph_primtypes_wrapper.pyx similarity index 96% rename from python/cugraph/structure/graph_new_wrapper.pyx rename to python/cugraph/structure/graph_primtypes_wrapper.pyx index da596bc988f..7bc62b9a1af 100644 --- a/python/cugraph/structure/graph_new_wrapper.pyx +++ b/python/cugraph/structure/graph_primtypes_wrapper.pyx @@ -16,9 +16,9 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * -from cugraph.structure.graph_new cimport get_two_hop_neighbors as c_get_two_hop_neighbors -from cugraph.structure.graph_new cimport renumber_vertices as c_renumber_vertices +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_primtypes cimport get_two_hop_neighbors as c_get_two_hop_neighbors +from cugraph.structure.graph_primtypes cimport renumber_vertices as c_renumber_vertices from cugraph.structure.utils_wrapper import * from libcpp cimport bool from libc.stdint cimport uintptr_t @@ -47,7 +47,7 @@ def datatype_cast(cols, dtypes): def renumber(source_col, dest_col): num_edges = len(source_col) - + src_renumbered = 
cudf.Series(np.zeros(num_edges), dtype=np.int32) dst_renumbered = cudf.Series(np.zeros(num_edges), dtype=np.int32) @@ -74,15 +74,15 @@ def renumber(source_col, dest_col): c_src_renumbered, c_dst_renumbered, &map_size)) - - + + map = DeviceBuffer.c_from_unique_ptr(move(numbering_map)) map = Buffer(map) - + output_map = cudf.Series(data=map, dtype=source_col.dtype) return src_renumbered, dst_renumbered, output_map - + def view_adj_list(input_graph): @@ -213,7 +213,7 @@ def _degree_csr(offsets, indices, x=0): cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] graph = GraphCSRView[int,int,float](c_offsets, c_indices, NULL, num_verts, num_edges) - + graph.degree( c_degree, dir) graph.get_vertex_identifiers(c_vertex) @@ -224,7 +224,7 @@ def _degree(input_graph, x=0): transpose_x = { 0: 0, 2: 1, 1: 2 } - + if input_graph.adjlist is not None: return _degree_csr(input_graph.adjlist.offsets, input_graph.adjlist.indices, @@ -248,17 +248,17 @@ def _degree(input_graph, x=0): data.calculate_parts_to_sizes(comms) degree_ddf = [client.submit(_degree_coo, wf[1][0], 'src', 'dst', x, num_verts, comms.sessionId, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())] wait(degree_ddf) - return degree_ddf[0].result() + return degree_ddf[0].result() return _degree_coo(input_graph.edgelist.edgelist_df, 'src', 'dst', x) - + raise Exception("input_graph not COO, CSR or CSC") - + def _degrees(input_graph): verts, indegrees = _degree(input_graph,1) verts, outdegrees = _degree(input_graph, 2) - + return verts, indegrees, outdegrees diff --git a/python/cugraph/structure/hypergraph.py b/python/cugraph/structure/hypergraph.py index 9b1c4b55e61..a11c937d83d 100644 --- a/python/cugraph/structure/hypergraph.py +++ b/python/cugraph/structure/hypergraph.py @@ -311,11 +311,11 @@ def _create_hyper_nodes( ): nodes = events.copy(deep=False) if NODEID in nodes: - nodes.drop([NODEID], inplace=True) + nodes.drop(columns=[NODEID], inplace=True) if NODETYPE in nodes: - 
nodes.drop([NODETYPE], inplace=True) + nodes.drop(columns=[NODETYPE], inplace=True) if CATEGORY in nodes: - nodes.drop([CATEGORY], inplace=True) + nodes.drop(columns=[CATEGORY], inplace=True) nodes[NODETYPE] = EVENTID if not categorical_metadata \ else _str_scalar_to_category(len(nodes), EVENTID) nodes[CATEGORY] = "event" if not categorical_metadata \ diff --git a/python/cugraph/structure/shuffle.py b/python/cugraph/structure/shuffle.py new file mode 100644 index 00000000000..ea3c28463d7 --- /dev/null +++ b/python/cugraph/structure/shuffle.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from dask.dataframe.shuffle import rearrange_by_column +import cudf + + +def get_n_workers(): + from dask.distributed import default_client + client = default_client() + return len(client.scheduler_info()['workers']) + + +def get_2D_div(ngpus): + pcols = int(math.sqrt(ngpus)) + while ngpus % pcols != 0: + pcols = pcols - 1 + return int(ngpus/pcols), pcols + + +def _set_partitions_pre(df, vertex_row_partitions, vertex_col_partitions, + prows, pcols, transposed, partition_type): + if transposed: + r = df['dst'] + c = df['src'] + else: + r = df['src'] + c = df['dst'] + r_div = vertex_row_partitions.searchsorted(r, side='right')-1 + c_div = vertex_col_partitions.searchsorted(c, side='right')-1 + + if partition_type == 1: + partitions = r_div * pcols + c_div + else: + partitions = r_div % prows + c_div * prows + return partitions + + +def shuffle(dg, transposed=False, prows=None, pcols=None, partition_type=1): + """ + Shuffles the renumbered input distributed graph edgelist into ngpu + partitions. The number of processes/gpus P = prows*pcols. The 2D + partitioning divides the matrix into P*pcols rectangular partitions + as per vertex partitioning performed in renumbering, and then shuffles + these partitions into P gpus. 
+ """ + + ddf = dg.edgelist.edgelist_df + ngpus = get_n_workers() + if prows is None and pcols is None: + if partition_type == 1: + pcols, prows = get_2D_div(ngpus) + else: + prows, pcols = get_2D_div(ngpus) + else: + if prows is not None and pcols is not None: + if ngpus != prows*pcols: + raise Exception('prows*pcols should be equal to the\ + number of processes') + elif prows is not None: + if ngpus % prows != 0: + raise Exception('prows must be a factor of the number\ + of processes') + pcols = int(ngpus/prows) + elif pcols is not None: + if ngpus % pcols != 0: + raise Exception('pcols must be a factor of the number\ + of processes') + prows = int(ngpus/pcols) + + renumber_vertex_count = dg.renumber_map.implementation.\ + ddf.map_partitions(len).compute() + renumber_vertex_cumsum = renumber_vertex_count.cumsum() + + if transposed: + row_dtype = ddf['dst'].dtype + col_dtype = ddf['src'].dtype + else: + row_dtype = ddf['src'].dtype + col_dtype = ddf['dst'].dtype + + vertex_partition_offsets = cudf.Series([0], dtype=row_dtype) + vertex_partition_offsets = vertex_partition_offsets.append(cudf.Series( + renumber_vertex_cumsum, dtype=row_dtype)) + num_verts = vertex_partition_offsets.iloc[-1] + if partition_type == 1: + vertex_row_partitions = [] + for i in range(prows + 1): + vertex_row_partitions.append( + vertex_partition_offsets.iloc[i*pcols]) + vertex_row_partitions = cudf.Series( + vertex_row_partitions, dtype=row_dtype) + else: + vertex_row_partitions = vertex_partition_offsets + vertex_col_partitions = [] + for i in range(pcols + 1): + vertex_col_partitions.append(vertex_partition_offsets.iloc[i*prows]) + vertex_col_partitions = cudf.Series(vertex_col_partitions, dtype=col_dtype) + + meta = ddf._meta._constructor_sliced([0]) + partitions = ddf.map_partitions( + _set_partitions_pre, + vertex_row_partitions=vertex_row_partitions, + vertex_col_partitions=vertex_col_partitions, prows=prows, + pcols=pcols, transposed=transposed, partition_type=partition_type, + 
meta=meta) + ddf2 = ddf.assign(_partitions=partitions) + ddf3 = rearrange_by_column( + ddf2, + "_partitions", + max_branch=None, + npartitions=ngpus, + shuffle="tasks", + ignore_index=True, + ).drop(columns=["_partitions"]) + + partition_row_size = pcols + partition_col_size = prows + + return (ddf3, + num_verts, + partition_row_size, + partition_col_size, + vertex_partition_offsets) diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py index cf3a823ca27..6ab34f6687e 100644 --- a/python/cugraph/structure/symmetrize.py +++ b/python/cugraph/structure/symmetrize.py @@ -13,6 +13,7 @@ from cugraph.structure import graph as csg import cudf +import dask_cudf def symmetrize_df(df, src_name, dst_name): @@ -21,19 +22,16 @@ def symmetrize_df(df, src_name, dst_name): the source and destination columns and create a new data frame using the same column names that symmetrize the graph so that all edges appear in both directions. - Note that if other columns exist in the data frame (e.g. edge weights) the other columns will also be replicated. That is, if (u,v,data) represents the source value (u), destination value (v) and some set of other columns (data) in the input data, then the output data will contain both (u,v,data) and (v,u,data) with matching data. - If (u,v,data1) and (v,u,data2) exist in the input data where data1 != data2 then this code will arbitrarily pick the smaller data element to keep, if this is not desired then the caller should should correct the data prior to calling symmetrize. 
- Parameters ---------- df : cudf.DataFrame @@ -44,14 +42,17 @@ def symmetrize_df(df, src_name, dst_name): Name of the column in the data frame containing the source ids dst_name : string Name of the column in the data frame containing the destination ids - Examples -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> sym_df = cugraph.symmetrize(M, '0', '1') - >>> G = cugraph.Graph() - >>> G.add_edge_list(sym_df['0]', sym_df['1'], sym_df['2']) + >>> import cugraph.dask as dcg + >>> Comms.initialize() + >>> chunksize = dcg.get_chunksize(input_data_path) + >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, + delimiter=' ', + names=['src', 'dst', 'weight'], + dtype=['int32', 'int32', 'float32']) + >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight") + >>> Comms.destroy() """ gdf = cudf.DataFrame() @@ -75,28 +76,82 @@ def symmetrize_df(df, src_name, dst_name): return gdf.groupby(by=[src_name, dst_name], as_index=False).min() +def symmetrize_ddf(df, src_name, dst_name, weight_name=None): + """ + Take a COO stored in a distributed DataFrame, and the column names of + the source and destination columns and create a new data frame + using the same column names that symmetrize the graph so that all + edges appear in both directions. + + Note that if other columns exist in the data frame (e.g. edge weights) + the other columns will also be replicated. That is, if (u,v,data) + represents the source value (u), destination value (v) and some + set of other columns (data) in the input data, then the output + data will contain both (u,v,data) and (v,u,data) with matching + data. + + If (u,v,data1) and (v,u,data2) exist in the input data where data1 + != data2 then this code will arbitrarily pick the smaller data + element to keep, if this is not desired then the caller should + should correct the data prior to calling symmetrize. 
+ + Parameters + ---------- + df : dask_cudf.DataFrame + Input data frame containing COO. Columns should contain source + ids, destination ids and any properties associated with the + edges. + src_name : string + Name of the column in the data frame containing the source ids + dst_name : string + Name of the column in the data frame containing the destination ids + + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> sym_df = cugraph.symmetrize(M, '0', '1') + """ + if weight_name: + ddf2 = df[[dst_name, src_name, weight_name]] + ddf2.columns = [src_name, dst_name, weight_name] + else: + ddf2 = df[[dst_name, src_name]] + ddf2.columns = [src_name, dst_name] + + ddf = df.append(ddf2).reset_index(drop=True) + result = ( + ddf.groupby(by=[src_name, dst_name], as_index=False) + .min() + .reset_index() + ) + return result + + def symmetrize(source_col, dest_col, value_col=None): """ - Take a COO set of source destination pairs along with associated values and + Take a COO set of source destination pairs along with associated values + stored in a single GPU or distributed create a new COO set of source destination pairs along with values where all edges exist in both directions. - Return from this call will be a COO stored as two cudf Series - the - symmetrized source column and the symmetrized dest column, along with + Return from this call will be a COO stored as two cudf Series or + dask_cudf.Series -the symmetrized source column and the symmetrized dest + column, along with an optional cudf Series containing the associated values (only if the values are passed in). Parameters ---------- - source_col : cudf.Series + source_col : cudf.Series or dask_cudf.Series This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains the source index for each edge. Source indices must be an integer type. 
- dest_col : cudf.Series + dest_col : cudf.Series or dask_cudf.Series This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains the destination index for each edge. Destination indices must be an integer type. - value_col : cudf.Series (optional) + value_col : cudf.Series or dask_cudf.Series (optional) This cudf.Series wraps a gdf_column of size E (E: number of edges). The gdf column contains values associated with this edge. For this function the values can be any type, they are not @@ -110,19 +165,31 @@ def symmetrize(source_col, dest_col, value_col=None): >>> destinations = cudf.Series(M['1']) >>> values = cudf.Series(M['2']) >>> src, dst, val = cugraph.symmetrize(sources, destinations, values) - >>> G = cugraph.Graph() - >>> G.add_edge_list(src, dst, val) """ - csg.null_check(source_col) - csg.null_check(dest_col) - - input_df = cudf.DataFrame({"source": source_col, "destination": dest_col}) + input_df = None + weight_name = None + if type(source_col) is dask_cudf.Series: + # FIXME convoluted way of just wrapping dask cudf Series in a ddf + input_df = source_col.to_frame() + input_df = input_df.rename(columns={source_col.name: "source"}) + input_df["destination"] = dest_col + else: + input_df = cudf.DataFrame( + {"source": source_col, "destination": dest_col} + ) + csg.null_check(source_col) + csg.null_check(dest_col) if value_col is not None: - csg.null_check(value_col) + weight_name = "value" input_df.insert(len(input_df.columns), "value", value_col) - - output_df = symmetrize_df(input_df, "source", "destination") + output_df = None + if type(source_col) is dask_cudf.Series: + output_df = symmetrize_ddf( + input_df, "source", "destination", weight_name + ) + else: + output_df = symmetrize_df(input_df, "source", "destination") if value_col is not None: return ( @@ -130,5 +197,4 @@ def symmetrize(source_col, dest_col, value_col=None): output_df["destination"], output_df["value"], ) - return output_df["source"], 
output_df["destination"] diff --git a/python/cugraph/structure/utils.pxd b/python/cugraph/structure/utils.pxd index 3f48e0fdd2d..0ec9c914347 100644 --- a/python/cugraph/structure/utils.pxd +++ b/python/cugraph/structure/utils.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp.memory cimport unique_ptr cdef extern from "raft/handle.hpp" namespace "raft": diff --git a/python/cugraph/structure/utils_wrapper.pyx b/python/cugraph/structure/utils_wrapper.pyx index a847f74d73c..00af5813056 100644 --- a/python/cugraph/structure/utils_wrapper.pyx +++ b/python/cugraph/structure/utils_wrapper.pyx @@ -18,7 +18,7 @@ from libc.stdint cimport uintptr_t from cugraph.structure cimport utils as c_utils -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t import cudf diff --git a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py index ccb0c94b020..53942a277c2 100644 --- a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py @@ -14,8 +14,8 @@ import pytest import numpy as np -from cugraph.tests.dask.mg_context import (MGContext, - skip_if_not_enough_devices) +from cugraph.tests.dask.mg_context import MGContext, skip_if_not_enough_devices +from cugraph.dask.common.mg_utils import is_single_gpu # Get parameters from standard betwenness_centrality_test from cugraph.tests.test_betweenness_centrality import ( @@ -30,48 +30,59 @@ from cugraph.tests.test_betweenness_centrality import ( prepare_test, calc_betweenness_centrality, - compare_scores + compare_scores, ) # ============================================================================= # Parameters # 
============================================================================= -DATASETS = ['../datasets/karate.csv'] +DATASETS = ["../datasets/karate.csv"] MG_DEVICE_COUNT_OPTIONS = [1, 2, 3, 4] RESULT_DTYPE_OPTIONS = [np.float64] # FIXME: The following creates and destroys Comms at every call making the # testsuite quite slow -@pytest.mark.parametrize('graph_file', DATASETS) -@pytest.mark.parametrize('directed', DIRECTED_GRAPH_OPTIONS) -@pytest.mark.parametrize('subset_size', SUBSET_SIZE_OPTIONS) -@pytest.mark.parametrize('normalized', NORMALIZED_OPTIONS) -@pytest.mark.parametrize('weight', [None]) -@pytest.mark.parametrize('endpoints', ENDPOINTS_OPTIONS) -@pytest.mark.parametrize('subset_seed', SUBSET_SEED_OPTIONS) -@pytest.mark.parametrize('result_dtype', RESULT_DTYPE_OPTIONS) -@pytest.mark.parametrize('mg_device_count', MG_DEVICE_COUNT_OPTIONS) -def test_mg_betweenness_centrality(graph_file, - directed, - subset_size, - normalized, - weight, - endpoints, - subset_seed, - result_dtype, - mg_device_count): +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) +@pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) +@pytest.mark.parametrize("weight", [None]) +@pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS) +@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) +@pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) +@pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) +def test_mg_betweenness_centrality( + graph_file, + directed, + subset_size, + normalized, + weight, + endpoints, + subset_seed, + result_dtype, + mg_device_count, +): prepare_test() skip_if_not_enough_devices(mg_device_count) with MGContext(mg_device_count): - sorted_df = calc_betweenness_centrality(graph_file, - directed=directed, - 
normalized=normalized, - k=subset_size, - weight=weight, - endpoints=endpoints, - seed=subset_seed, - result_dtype=result_dtype, - multi_gpu_batch=True) - compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc", - epsilon=DEFAULT_EPSILON) + sorted_df = calc_betweenness_centrality( + graph_file, + directed=directed, + normalized=normalized, + k=subset_size, + weight=weight, + endpoints=endpoints, + seed=subset_seed, + result_dtype=result_dtype, + multi_gpu_batch=True, + ) + compare_scores( + sorted_df, + first_key="cu_bc", + second_key="ref_bc", + epsilon=DEFAULT_EPSILON, + ) diff --git a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py index 01023839d06..7778f7bf421 100644 --- a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py @@ -13,10 +13,10 @@ import pytest import numpy as np +from cugraph.dask.common.mg_utils import is_single_gpu -from cugraph.tests.dask.mg_context import (MGContext, - skip_if_not_enough_devices) +from cugraph.tests.dask.mg_context import MGContext, skip_if_not_enough_devices # Get parameters from standard betwenness_centrality_test from cugraph.tests.test_edge_betweenness_centrality import ( @@ -30,43 +30,54 @@ from cugraph.tests.test_edge_betweenness_centrality import ( prepare_test, calc_edge_betweenness_centrality, - compare_scores + compare_scores, ) # ============================================================================= # Parameters # ============================================================================= -DATASETS = ['../datasets/karate.csv'] +DATASETS = ["../datasets/karate.csv"] MG_DEVICE_COUNT_OPTIONS = [1, 2, 4] RESULT_DTYPE_OPTIONS = [np.float64] -@pytest.mark.parametrize('graph_file', DATASETS) -@pytest.mark.parametrize('directed', DIRECTED_GRAPH_OPTIONS) -@pytest.mark.parametrize('subset_size', SUBSET_SIZE_OPTIONS) 
-@pytest.mark.parametrize('normalized', NORMALIZED_OPTIONS) -@pytest.mark.parametrize('weight', [None]) -@pytest.mark.parametrize('subset_seed', SUBSET_SEED_OPTIONS) -@pytest.mark.parametrize('result_dtype', RESULT_DTYPE_OPTIONS) -@pytest.mark.parametrize('mg_device_count', MG_DEVICE_COUNT_OPTIONS) -def test_mg_edge_betweenness_centrality(graph_file, - directed, - subset_size, - normalized, - weight, - subset_seed, - result_dtype, - mg_device_count): +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) +@pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) +@pytest.mark.parametrize("weight", [None]) +@pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS) +@pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS) +@pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) +def test_mg_edge_betweenness_centrality( + graph_file, + directed, + subset_size, + normalized, + weight, + subset_seed, + result_dtype, + mg_device_count, +): prepare_test() skip_if_not_enough_devices(mg_device_count) with MGContext(mg_device_count): - sorted_df = calc_edge_betweenness_centrality(graph_file, - directed=directed, - normalized=normalized, - k=subset_size, - weight=weight, - seed=subset_seed, - result_dtype=result_dtype, - multi_gpu_batch=True) - compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc", - epsilon=DEFAULT_EPSILON) + sorted_df = calc_edge_betweenness_centrality( + graph_file, + directed=directed, + normalized=normalized, + k=subset_size, + weight=weight, + seed=subset_seed, + result_dtype=result_dtype, + multi_gpu_batch=True, + ) + compare_scores( + sorted_df, + first_key="cu_bc", + second_key="ref_bc", + epsilon=DEFAULT_EPSILON, + ) diff --git a/python/cugraph/tests/dask/test_mg_bfs.py 
b/python/cugraph/tests/dask/test_mg_bfs.py index a22f280e9b2..94bed827fd0 100644 --- a/python/cugraph/tests/dask/test_mg_bfs.py +++ b/python/cugraph/tests/dask/test_mg_bfs.py @@ -20,6 +20,7 @@ import dask_cudf import cudf from dask_cuda import LocalCUDACluster +from cugraph.dask.common.mg_utils import is_single_gpu @pytest.fixture @@ -35,39 +36,49 @@ def client_connection(): cluster.close() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) def test_dask_bfs(client_connection): gc.collect() input_data_path = r"../datasets/netscience.csv" chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) g = cugraph.DiGraph() - g.from_cudf_edgelist(df, 'src', 'dst', renumber=True) + g.from_cudf_edgelist(df, "src", "dst", renumber=True) dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist(ddf, 'src', 'dst') + dg.from_dask_cudf_edgelist(ddf, "src", "dst") expected_dist = cugraph.bfs(g, 0) result_dist = dcg.bfs(dg, 0, True) compare_dist = expected_dist.merge( - result_dist, on="vertex", suffixes=['_local', '_dask'] + result_dist, on="vertex", suffixes=["_local", "_dask"] ) err = 0 for i in range(len(compare_dist)): - if (compare_dist['distance_local'].iloc[i] != - compare_dist['distance_dask'].iloc[i]): + if ( + compare_dist["distance_local"].iloc[i] + != compare_dist["distance_dask"].iloc[i] + ): err = err + 1 assert err == 0 diff --git 
a/python/cugraph/tests/dask/test_mg_comms.py b/python/cugraph/tests/dask/test_mg_comms.py index 214dc76b9be..cd94f945f93 100644 --- a/python/cugraph/tests/dask/test_mg_comms.py +++ b/python/cugraph/tests/dask/test_mg_comms.py @@ -20,6 +20,7 @@ import dask_cudf import cudf from dask_cuda import LocalCUDACluster +from cugraph.dask.common.mg_utils import is_single_gpu @pytest.fixture @@ -35,6 +36,9 @@ def client_connection(): cluster.close() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) def test_dask_pagerank(client_connection): gc.collect() @@ -47,43 +51,53 @@ def test_dask_pagerank(client_connection): input_data_path2 = r"../datasets/dolphins.csv" chunksize2 = dcg.get_chunksize(input_data_path2) - ddf1 = dask_cudf.read_csv(input_data_path1, chunksize=chunksize1, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf1 = dask_cudf.read_csv( + input_data_path1, + chunksize=chunksize1, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) dg1 = cugraph.DiGraph() - dg1.from_dask_cudf_edgelist(ddf1, 'src', 'dst') + dg1.from_dask_cudf_edgelist(ddf1, "src", "dst") result_pr1 = dcg.pagerank(dg1) - ddf2 = dask_cudf.read_csv(input_data_path2, chunksize=chunksize2, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf2 = dask_cudf.read_csv( + input_data_path2, + chunksize=chunksize2, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) dg2 = cugraph.DiGraph() - dg2.from_dask_cudf_edgelist(ddf2, 'src', 'dst') + dg2.from_dask_cudf_edgelist(ddf2, "src", "dst") result_pr2 = dcg.pagerank(dg2) # Calculate single GPU pagerank for verification of results - df1 = cudf.read_csv(input_data_path1, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df1 = cudf.read_csv( + input_data_path1, + delimiter=" ", + names=["src", "dst", "value"], + 
dtype=["int32", "int32", "float32"], + ) g1 = cugraph.DiGraph() - g1.from_cudf_edgelist(df1, 'src', 'dst') + g1.from_cudf_edgelist(df1, "src", "dst") expected_pr1 = cugraph.pagerank(g1) - df2 = cudf.read_csv(input_data_path2, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df2 = cudf.read_csv( + input_data_path2, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) g2 = cugraph.DiGraph() - g2.from_cudf_edgelist(df2, 'src', 'dst') + g2.from_cudf_edgelist(df2, "src", "dst") expected_pr2 = cugraph.pagerank(g2) # Compare and verify pagerank results @@ -93,14 +107,16 @@ def test_dask_pagerank(client_connection): tol = 1.0e-05 compare_pr1 = expected_pr1.merge( - result_pr1, on="vertex", suffixes=['_local', '_dask'] + result_pr1, on="vertex", suffixes=["_local", "_dask"] ) assert len(expected_pr1) == len(result_pr1) for i in range(len(compare_pr1)): - diff = abs(compare_pr1['pagerank_local'].iloc[i] - - compare_pr1['pagerank_dask'].iloc[i]) + diff = abs( + compare_pr1["pagerank_local"].iloc[i] + - compare_pr1["pagerank_dask"].iloc[i] + ) if diff > tol * 1.1: err1 = err1 + 1 print("Mismatches in ", input_data_path1, ": ", err1) @@ -108,12 +124,14 @@ def test_dask_pagerank(client_connection): assert len(expected_pr2) == len(result_pr2) compare_pr2 = expected_pr2.merge( - result_pr2, on="vertex", suffixes=['_local', '_dask'] + result_pr2, on="vertex", suffixes=["_local", "_dask"] ) for i in range(len(compare_pr2)): - diff = abs(compare_pr2['pagerank_local'].iloc[i] - - compare_pr2['pagerank_dask'].iloc[i]) + diff = abs( + compare_pr2["pagerank_local"].iloc[i] + - compare_pr2["pagerank_dask"].iloc[i] + ) if diff > tol * 1.1: err2 = err2 + 1 print("Mismatches in ", input_data_path2, ": ", err2) diff --git a/python/cugraph/tests/dask/test_mg_degree.py b/python/cugraph/tests/dask/test_mg_degree.py index f7e206b8e75..a903f69d05a 100644 --- a/python/cugraph/tests/dask/test_mg_degree.py +++ 
b/python/cugraph/tests/dask/test_mg_degree.py @@ -18,6 +18,7 @@ import cugraph.comms as Comms import cugraph import dask_cudf +from cugraph.dask.common.mg_utils import is_single_gpu # Move to conftest from dask_cuda import LocalCUDACluster @@ -36,6 +37,9 @@ def client_connection(): cluster.close() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) def test_dask_mg_degree(client_connection): gc.collect() @@ -43,23 +47,31 @@ def test_dask_mg_degree(client_connection): chunksize = cugraph.dask.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) - - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist(ddf, 'src', 'dst') + dg.from_dask_cudf_edgelist(ddf, "src", "dst") g = cugraph.DiGraph() - g.from_cudf_edgelist(df, 'src', 'dst') + g.from_cudf_edgelist(df, "src", "dst") - merge_df = dg.in_degree().merge( - g.in_degree(), on="vertex", suffixes=['_dg', '_g']).compute() + merge_df = ( + dg.in_degree() + .merge(g.in_degree(), on="vertex", suffixes=["_dg", "_g"]) + .compute() + ) - assert merge_df['degree_dg'].equals(merge_df['degree_g']) + assert merge_df["degree_dg"].equals(merge_df["degree_g"]) diff --git a/python/cugraph/tests/dask/test_mg_louvain.py b/python/cugraph/tests/dask/test_mg_louvain.py new file mode 100644 index 00000000000..56401e338a4 --- /dev/null +++ b/python/cugraph/tests/dask/test_mg_louvain.py @@ -0,0 +1,103 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import cugraph.dask as dcg +import cugraph.comms as Comms +from dask.distributed import Client +import cugraph +import dask_cudf +from dask_cuda import LocalCUDACluster +from cugraph.tests import utils +from cugraph.dask.common.mg_utils import is_single_gpu + +try: + from rapids_pytest_benchmark import setFixtureParamNames +except ImportError: + print( + "\n\nWARNING: rapids_pytest_benchmark is not installed, " + "falling back to pytest_benchmark fixtures.\n" + ) + + # if rapids_pytest_benchmark is not available, just perform time-only + # benchmarking and replace the util functions with nops + import pytest_benchmark + + gpubenchmark = pytest_benchmark.plugin.benchmark + + def setFixtureParamNames(*args, **kwargs): + pass + + +############################################################################### +# Fixtures +@pytest.fixture(scope="module") +def client_connection(): + # setup + cluster = LocalCUDACluster() + client = Client(cluster) + Comms.initialize(p2p=True) + + yield client + + # teardown + Comms.destroy() + client.close() + cluster.close() + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +def daskGraphFromDataset(request, client_connection): + """ + Returns a new dask dataframe created from the dataset file param. 
+def test_mg_louvain_with_edgevals(daskGraphFromDataset):
+    # FIXME: daskGraphFromDataset returns a DiGraph, which Louvain is currently
+    # accepting. In the future, an MNMG symmetrize will need to be called to
+    # create a Graph for Louvain.
+    parts, mod = dcg.louvain(daskGraphFromDataset)
+
+    # FIXME: either call Nx with the same dataset and compare results, or
+    # hardcode golden results to compare to.
+ print() + print(parts.compute()) + print(mod) + print() diff --git a/python/cugraph/tests/dask/test_mg_pagerank.py b/python/cugraph/tests/dask/test_mg_pagerank.py index aad164a45c5..a2340e139d1 100644 --- a/python/cugraph/tests/dask/test_mg_pagerank.py +++ b/python/cugraph/tests/dask/test_mg_pagerank.py @@ -20,6 +20,7 @@ import dask_cudf import cudf from dask_cuda import LocalCUDACluster +from cugraph.dask.common.mg_utils import is_single_gpu # The function selects personalization_perc% of accessible vertices in graph M # and randomly assigns them personalization values @@ -30,19 +31,20 @@ def personalize(v, personalization_perc): if personalization_perc != 0: personalization = {} nnz_vtx = np.arange(0, v) - personalization_count = int((nnz_vtx.size * - personalization_perc)/100.0) - nnz_vtx = np.random.choice(nnz_vtx, - min(nnz_vtx.size, personalization_count), - replace=False) + personalization_count = int( + (nnz_vtx.size * personalization_perc) / 100.0 + ) + nnz_vtx = np.random.choice( + nnz_vtx, min(nnz_vtx.size, personalization_count), replace=False + ) nnz_val = np.random.random(nnz_vtx.size) - nnz_val = nnz_val/sum(nnz_val) + nnz_val = nnz_val / sum(nnz_val) for vtx, val in zip(nnz_vtx, nnz_val): personalization[vtx] = val - k = np.fromiter(personalization.keys(), dtype='int32') - v = np.fromiter(personalization.values(), dtype='float32') - cu_personalization = cudf.DataFrame({'vertex': k, 'values': v}) + k = np.fromiter(personalization.keys(), dtype="int32") + v = np.fromiter(personalization.values(), dtype="float32") + cu_personalization = cudf.DataFrame({"vertex": k, "values": v}) return cu_personalization @@ -63,39 +65,48 @@ def client_connection(): cluster.close() -@pytest.mark.parametrize('personalization_perc', PERSONALIZATION_PERC) +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.parametrize("personalization_perc", PERSONALIZATION_PERC) def test_dask_pagerank(client_connection, 
personalization_perc): gc.collect() input_data_path = r"../datasets/karate.csv" chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) g = cugraph.DiGraph() - g.from_cudf_edgelist(df, 'src', 'dst') + g.from_cudf_edgelist(df, "src", "dst") dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist(ddf, 'src', 'dst') + dg.from_dask_cudf_edgelist(ddf, "src", "dst") # Pre compute local data and personalize personalization = None if personalization_perc != 0: - dg.compute_local_data(by='dst') - personalization = personalize(dg.number_of_vertices(), - personalization_perc) + dg.compute_local_data(by="dst") + personalization = personalize( + dg.number_of_vertices(), personalization_perc + ) - expected_pr = cugraph.pagerank(g, - personalization=personalization, - tol=1e-6) + expected_pr = cugraph.pagerank( + g, personalization=personalization, tol=1e-6 + ) result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6) err = 0 @@ -104,12 +115,14 @@ def test_dask_pagerank(client_connection, personalization_perc): assert len(expected_pr) == len(result_pr) compare_pr = expected_pr.merge( - result_pr, on="vertex", suffixes=['_local', '_dask'] + result_pr, on="vertex", suffixes=["_local", "_dask"] ) for i in range(len(compare_pr)): - diff = abs(compare_pr['pagerank_local'].iloc[i] - - compare_pr['pagerank_dask'].iloc[i]) + diff = abs( + compare_pr["pagerank_local"].iloc[i] + - compare_pr["pagerank_dask"].iloc[i] + ) 
if diff > tol * 1.1: err = err + 1 assert err == 0 diff --git a/python/cugraph/tests/dask/test_mg_renumber.py b/python/cugraph/tests/dask/test_mg_renumber.py index ceeeeb77a5a..b981a49a0de 100644 --- a/python/cugraph/tests/dask/test_mg_renumber.py +++ b/python/cugraph/tests/dask/test_mg_renumber.py @@ -29,6 +29,7 @@ from dask_cuda import LocalCUDACluster from cugraph.tests import utils from cugraph.structure.number_map import NumberMap +from cugraph.dask.common.mg_utils import is_single_gpu @pytest.fixture @@ -45,6 +46,9 @@ def client_connection(): # Test all combinations of default/managed and pooled/non-pooled allocation +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) def test_mg_renumber(graph_file, client_connection): gc.collect() @@ -85,6 +89,9 @@ def test_mg_renumber(graph_file, client_connection): # Test all combinations of default/managed and pooled/non-pooled allocation +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) def test_mg_renumber2(graph_file, client_connection): gc.collect() @@ -109,9 +116,9 @@ def test_mg_renumber2(graph_file, client_connection): ) check_src = num2.from_internal_vertex_id(ren2, "src").compute() - check_src = check_src.sort_values('weight').reset_index(drop=True) + check_src = check_src.sort_values("weight").reset_index(drop=True) check_dst = num2.from_internal_vertex_id(ren2, "dst").compute() - check_dst = check_dst.sort_values('weight').reset_index(drop=True) + check_dst = check_dst.sort_values("weight").reset_index(drop=True) assert check_src["0"].to_pandas().equals(gdf["src"].to_pandas()) assert check_src["1"].to_pandas().equals(gdf["src_old"].to_pandas()) @@ -120,6 +127,9 @@ def test_mg_renumber2(graph_file, client_connection): # Test all combinations of default/managed and pooled/non-pooled allocation 
+@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) def test_mg_renumber3(graph_file, client_connection): gc.collect() @@ -143,39 +153,47 @@ def test_mg_renumber3(graph_file, client_connection): ddf, ["src", "src_old"], ["dst", "dst_old"] ) - test_df = gdf[['src', 'src_old']].head() + test_df = gdf[["src", "src_old"]].head() # # This call raises an exception in branch-0.15 # prior to this PR # - test_df = num2.add_internal_vertex_id(test_df, 'src', ['src', 'src_old']) - assert(True) + test_df = num2.add_internal_vertex_id(test_df, "src", ["src", "src_old"]) + assert True +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) def test_dask_pagerank(client_connection): gc.collect() - pandas.set_option('display.max_rows', 10000) + pandas.set_option("display.max_rows", 10000) input_data_path = r"../datasets/karate.csv" chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) g = cugraph.DiGraph() - g.from_cudf_edgelist(df, 'src', 'dst') + g.from_cudf_edgelist(df, "src", "dst") dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist(ddf, 'src', 'dst') + dg.from_dask_cudf_edgelist(ddf, "src", "dst") # Pre compute local data # dg.compute_local_data(by='dst') @@ -189,12 +207,14 @@ def test_dask_pagerank(client_connection): assert len(expected_pr) == len(result_pr) 
compare_pr = expected_pr.merge( - result_pr, on="vertex", suffixes=['_local', '_dask'] + result_pr, on="vertex", suffixes=["_local", "_dask"] ) for i in range(len(compare_pr)): - diff = abs(compare_pr['pagerank_local'].iloc[i] - - compare_pr['pagerank_dask'].iloc[i]) + diff = abs( + compare_pr["pagerank_local"].iloc[i] + - compare_pr["pagerank_dask"].iloc[i] + ) if diff > tol * 1.1: err = err + 1 print("Mismatches:", err) diff --git a/python/cugraph/tests/dask/test_mg_replication.py b/python/cugraph/tests/dask/test_mg_replication.py index 061bcf83f20..d8a2676b32b 100644 --- a/python/cugraph/tests/dask/test_mg_replication.py +++ b/python/cugraph/tests/dask/test_mg_replication.py @@ -12,79 +12,100 @@ # limitations under the License. import cugraph -from cugraph.tests.dask.mg_context import (MGContext, - skip_if_not_enough_devices) +from cugraph.tests.dask.mg_context import MGContext, skip_if_not_enough_devices import cudf import cugraph.dask.structure.replication as replication +from cugraph.dask.common.mg_utils import is_single_gpu import cugraph.tests.utils as utils import pytest import gc DATASETS_OPTIONS = utils.DATASETS_SMALL DIRECTED_GRAPH_OPTIONS = [False, True] -MG_DEVICE_COUNT_OPTIONS = [1, 2, 3, 4] +# MG_DEVICE_COUNT_OPTIONS = [1, 2, 3, 4] +MG_DEVICE_COUNT_OPTIONS = [1] +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_replicate_cudf_dataframe_with_weights(input_data_path, - mg_device_count): +def test_replicate_cudf_dataframe_with_weights( + input_data_path, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", 
"float32"], + ) with MGContext(mg_device_count): worker_to_futures = replication.replicate_cudf_dataframe(df) for worker in worker_to_futures: replicated_df = worker_to_futures[worker].result() - assert df.equals(replicated_df), "There is a mismatch in one " \ - "of the replications" + assert df.equals(replicated_df), ( + "There is a mismatch in one " "of the replications" + ) +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_replicate_cudf_dataframe_no_weights(input_data_path, - mg_device_count): +def test_replicate_cudf_dataframe_no_weights(input_data_path, mg_device_count): gc.collect() skip_if_not_enough_devices(mg_device_count) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst'], - dtype=['int32', 'int32']) + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst"], + dtype=["int32", "int32"], + ) with MGContext(mg_device_count): worker_to_futures = replication.replicate_cudf_dataframe(df) for worker in worker_to_futures: replicated_df = worker_to_futures[worker].result() - assert df.equals(replicated_df), "There is a mismatch in one " \ - "of the replications" + assert df.equals(replicated_df), ( + "There is a mismatch in one " "of the replications" + ) +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_replicate_cudf_series(input_data_path, - mg_device_count): +def test_replicate_cudf_series(input_data_path, mg_device_count): gc.collect() skip_if_not_enough_devices(mg_device_count) - df = cudf.read_csv(input_data_path, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + input_data_path, + 
delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) with MGContext(mg_device_count): for column in df.columns.values: series = df[column] worker_to_futures = replication.replicate_cudf_series(series) for worker in worker_to_futures: replicated_series = worker_to_futures[worker].result() - assert series.equals(replicated_series), "There is a " \ - "mismatch in one of the replications" + assert series.equals(replicated_series), ( + "There is a " "mismatch in one of the replications" + ) # FIXME: If we do not clear this dictionary, when comparing # results for the 2nd column, one of the workers still # has a value from the 1st column worker_to_futures = {} +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) @@ -97,11 +118,15 @@ def test_enable_batch_no_context(graph_file, directed, mg_device_count): G.enable_batch() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_no_context_view_adj(graph_file, directed, - mg_device_count): +def test_enable_batch_no_context_view_adj( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) G = utils.generate_cugraph_graph_from_file(graph_file, directed) @@ -109,11 +134,15 @@ def test_enable_batch_no_context_view_adj(graph_file, directed, G.view_adj_list() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) 
@pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_context_then_views(graph_file, directed, - mg_device_count): +def test_enable_batch_context_then_views( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) G = utils.generate_cugraph_graph_from_file(graph_file, directed) @@ -121,9 +150,9 @@ def test_enable_batch_context_then_views(graph_file, directed, assert G.batch_enabled is False, "Internal property should be False" G.enable_batch() assert G.batch_enabled is True, "Internal property should be True" - assert G.batch_edgelists is not None, "The graph should have " \ - "been created with an " \ - "edgelist" + assert G.batch_edgelists is not None, ( + "The graph should have " "been created with an " "edgelist" + ) assert G.batch_adjlists is None G.view_adj_list() assert G.batch_adjlists is not None @@ -133,11 +162,13 @@ def test_enable_batch_context_then_views(graph_file, directed, assert G.batch_transposed_adjlists is not None +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_view_then_context(graph_file, directed, - mg_device_count): +def test_enable_batch_view_then_context(graph_file, directed, mg_device_count): gc.collect() skip_if_not_enough_devices(mg_device_count) G = utils.generate_cugraph_graph_from_file(graph_file, directed) @@ -154,18 +185,22 @@ def test_enable_batch_view_then_context(graph_file, directed, assert G.batch_enabled is False, "Internal property should be False" G.enable_batch() assert G.batch_enabled is True, "Internal property should be True" - assert G.batch_edgelists is not None, "The graph should have " \ - "been created with an " \ - "edgelist" + assert G.batch_edgelists is not None, ( + "The graph 
should have " "been created with an " "edgelist" + ) assert G.batch_adjlists is not None assert G.batch_transposed_adjlists is not None +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_context_no_context_views(graph_file, directed, - mg_device_count): +def test_enable_batch_context_no_context_views( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) G = utils.generate_cugraph_graph_from_file(graph_file, directed) @@ -173,19 +208,23 @@ def test_enable_batch_context_no_context_views(graph_file, directed, assert G.batch_enabled is False, "Internal property should be False" G.enable_batch() assert G.batch_enabled is True, "Internal property should be True" - assert G.batch_edgelists is not None, "The graph should have " \ - "been created with an " \ - "edgelist" + assert G.batch_edgelists is not None, ( + "The graph should have " "been created with an " "edgelist" + ) G.view_edge_list() G.view_adj_list() G.view_transposed_adj_list() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_edgelist_replication(graph_file, directed, - mg_device_count): +def test_enable_batch_edgelist_replication( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) G = utils.generate_cugraph_graph_from_file(graph_file, directed) @@ -197,20 +236,27 @@ def test_enable_batch_edgelist_replication(graph_file, directed, assert df.equals(replicated_df), "Replication of edgelist failed" 
+@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_adjlist_replication_weights(graph_file, directed, - mg_device_count): +def test_enable_batch_adjlist_replication_weights( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) - df = cudf.read_csv(graph_file, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + df = cudf.read_csv( + graph_file, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) G = cugraph.DiGraph() if directed else cugraph.Graph() - G.from_cudf_edgelist(df, source='src', destination='dst', - edge_attr='value') + G.from_cudf_edgelist( + df, source="src", destination="dst", edge_attr="value" + ) with MGContext(mg_device_count): G.enable_batch() G.view_adj_list() @@ -219,30 +265,37 @@ def test_enable_batch_adjlist_replication_weights(graph_file, directed, indices = adjlist.indices weights = adjlist.weights for worker in G.batch_adjlists: - (rep_offsets, - rep_indices, - rep_weights) = G.batch_adjlists[worker] - assert offsets.equals(rep_offsets.result()), "Replication of " \ - "adjlist offsets failed" - assert indices.equals(rep_indices.result()), "Replication of " \ - "adjlist indices failed" - assert weights.equals(rep_weights.result()), "Replication of " \ - "adjlist weights failed" + (rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker] + assert offsets.equals(rep_offsets.result()), ( + "Replication of " "adjlist offsets failed" + ) + assert indices.equals(rep_indices.result()), ( + "Replication of " "adjlist indices failed" + ) + assert weights.equals(rep_weights.result()), ( + "Replication of " "adjlist weights failed" + ) +@pytest.mark.skipif( + 
is_single_gpu(), reason="skipping MG testing on Single GPU system" +) @pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) -def test_enable_batch_adjlist_replication_no_weights(graph_file, directed, - mg_device_count): +def test_enable_batch_adjlist_replication_no_weights( + graph_file, directed, mg_device_count +): gc.collect() skip_if_not_enough_devices(mg_device_count) - df = cudf.read_csv(graph_file, - delimiter=' ', - names=['src', 'dst'], - dtype=['int32', 'int32']) + df = cudf.read_csv( + graph_file, + delimiter=" ", + names=["src", "dst"], + dtype=["int32", "int32"], + ) G = cugraph.DiGraph() if directed else cugraph.Graph() - G.from_cudf_edgelist(df, source='src', destination='dst') + G.from_cudf_edgelist(df, source="src", destination="dst") with MGContext(mg_device_count): G.enable_batch() G.view_adj_list() @@ -251,11 +304,11 @@ def test_enable_batch_adjlist_replication_no_weights(graph_file, directed, indices = adjlist.indices weights = adjlist.weights for worker in G.batch_adjlists: - (rep_offsets, - rep_indices, - rep_weights) = G.batch_adjlists[worker] - assert offsets.equals(rep_offsets.result()), "Replication of " \ - "adjlist offsets failed" - assert indices.equals(rep_indices.result()), "Replication of " \ - "adjlist indices failed" + (rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker] + assert offsets.equals(rep_offsets.result()), ( + "Replication of " "adjlist offsets failed" + ) + assert indices.equals(rep_indices.result()), ( + "Replication of " "adjlist indices failed" + ) assert weights is None and rep_weights is None diff --git a/python/cugraph/tests/dask/test_mg_utility.py b/python/cugraph/tests/dask/test_mg_utility.py index 704b1db849c..f1becb051ad 100644 --- a/python/cugraph/tests/dask/test_mg_utility.py +++ b/python/cugraph/tests/dask/test_mg_utility.py @@ -12,13 +12,20 @@ # limitations 
under the License. import cugraph.dask as dcg -from dask.distributed import Client +from dask.distributed import Client, default_client, futures_of, wait import gc import cugraph import dask_cudf import cugraph.comms as Comms from dask_cuda import LocalCUDACluster import pytest +from cugraph.dask.common.part_utils import concat_within_workers +from cugraph.dask.common.read_utils import get_n_workers +from cugraph.dask.common.mg_utils import is_single_gpu +import os +import time +import numpy as np +from cugraph.tests import utils @pytest.fixture @@ -34,30 +41,96 @@ def client_connection(): cluster.close() +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) def test_compute_local_data(client_connection): gc.collect() input_data_path = r"../datasets/karate.csv" chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize, - delimiter=' ', - names=['src', 'dst', 'value'], - dtype=['int32', 'int32', 'float32']) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) dg = cugraph.DiGraph() - dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst', - edge_attr='value') + dg.from_dask_cudf_edgelist( + ddf, source="src", destination="dst", edge_attr="value" + ) # Compute_local_data - dg.compute_local_data(by='dst') - data = dg.local_data['data'] - by = dg.local_data['by'] + dg.compute_local_data(by="dst") + data = dg.local_data["data"] + by = dg.local_data["by"] - assert by == 'dst' + assert by == "dst" assert Comms.is_initialized() - global_num_edges = data.local_data['edges'].sum() + global_num_edges = data.local_data["edges"].sum() assert global_num_edges == dg.number_of_edges() - global_num_verts = data.local_data['verts'].sum() + global_num_verts = data.local_data["verts"].sum() assert global_num_verts == dg.number_of_nodes() + + +@pytest.mark.skipif( + 
is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.skip(reason="MG not supported on CI") +def test_parquet_concat_within_workers(client_connection): + if not os.path.exists("test_files_parquet"): + print("Generate data... ") + os.mkdir("test_files_parquet") + for x in range(10): + if not os.path.exists("test_files_parquet/df" + str(x)): + df = utils.random_edgelist( + e=100, ef=16, dtypes={"src": np.int32, "dst": np.int32}, seed=x + ) + df.to_parquet("test_files_parquet/df" + str(x), index=False) + + n_gpu = get_n_workers() + + print("Read_parquet... ") + t1 = time.time() + ddf = dask_cudf.read_parquet( + "test_files_parquet/*", dtype=["int32", "int32"] + ) + ddf = ddf.persist() + futures_of(ddf) + wait(ddf) + t1 = time.time() - t1 + print("*** Read Time: ", t1, "s") + print(ddf) + + assert ddf.npartitions > n_gpu + + print("Drop_duplicates... ") + t2 = time.time() + ddf.drop_duplicates(inplace=True) + ddf = ddf.persist() + futures_of(ddf) + wait(ddf) + t2 = time.time() - t2 + print("*** Drop duplicate time: ", t2, "s") + assert t2 < t1 + + print("Repartition... 
") + t3 = time.time() + # Notice that ideally we would use : + # ddf = ddf.repartition(npartitions=n_gpu) + # However this is slower than reading and requires more memory + # Using custom concat instead + client = default_client() + ddf = concat_within_workers(client, ddf) + ddf = ddf.persist() + futures_of(ddf) + wait(ddf) + t3 = time.time() - t3 + print("*** repartition Time: ", t3, "s") + print(ddf) + + assert t3 < t1 diff --git a/python/cugraph/tests/test_balanced_cut.py b/python/cugraph/tests/test_balanced_cut.py index 9bee231d99e..f0fc7152e56 100644 --- a/python/cugraph/tests/test_balanced_cut.py +++ b/python/cugraph/tests/test_balanced_cut.py @@ -15,7 +15,8 @@ import random import pytest - +import networkx as nx +import pandas as pd import cudf import cugraph from cugraph.tests import utils @@ -120,3 +121,40 @@ def test_digraph_rejected(): with pytest.raises(Exception): cugraph_call(G, 2) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("partitions", PARTITIONS) +def test_edge_cut_clustering_with_edgevals_nx(graph_file, partitions): + gc.collect() + + # Read in the graph and create a NetworkX Graph + # FIXME: replace with utils.generate_nx_graph_from_file() + NM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + G = nx.from_pandas_edgelist( + NM, create_using=nx.Graph(), source="0", target="1", + edge_attr="weight" + ) + + # Get the edge_cut score for partitioning versus random assignment + df = cugraph.spectralBalancedCutClustering( + G, partitions, num_eigen_vects=partitions + ) + + pdf = pd.DataFrame.from_dict(df, orient='index').reset_index() + pdf.columns = ["vertex", "cluster"] + gdf = cudf.from_pandas(pdf) + + cu_score = cugraph.analyzeClustering_edge_cut( + G, partitions, gdf, 'vertex', 'cluster' + ) + + df = set(gdf["vertex"].to_array()) + + Gcu = cugraph.utilities.convert_from_nx(G) + rand_vid, rand_score = random_call(Gcu, partitions) + + # Assert that the partitioning has better edge_cut than the 
random + # assignment + print(cu_score, rand_score) + assert cu_score < rand_score diff --git a/python/cugraph/tests/test_betweenness_centrality.py b/python/cugraph/tests/test_betweenness_centrality.py index 1ef1601edd5..73a706f877d 100644 --- a/python/cugraph/tests/test_betweenness_centrality.py +++ b/python/cugraph/tests/test_betweenness_centrality.py @@ -64,6 +64,7 @@ def calc_betweenness_centrality( result_dtype=np.float64, use_k_full=False, multi_gpu_batch=False, + ): """ Generate both cugraph and networkx betweenness centrality @@ -474,3 +475,28 @@ def test_betweenness_invalid_dtype( result_dtype=result_dtype, ) compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +def test_betweenness_centrality_nx(graph_file): + prepare_test() + + Gnx = utils.generate_nx_graph_from_file(graph_file) + + nx_bc = nx.betweenness_centrality(Gnx) + cu_bc = cugraph.betweenness_centrality(Gnx) + + # Calculating mismatch + networkx_bc = sorted(nx_bc.items(), key=lambda x: x[0]) + cugraph_bc = sorted(cu_bc.items(), key=lambda x: x[0]) + err = 0 + assert len(cugraph_bc) == len(networkx_bc) + for i in range(len(cugraph_bc)): + if ( + abs(cugraph_bc[i][1] - networkx_bc[i][1]) > 0.01 + and cugraph_bc[i][0] == networkx_bc[i][0] + ): + err = err + 1 + print(f"{cugraph_bc[i][1]} and {cugraph_bc[i][1]}") + print("Mismatches:", err) + assert err < (0.01 * len(cugraph_bc)) diff --git a/python/cugraph/tests/test_bfs.py b/python/cugraph/tests/test_bfs.py index 8eb175ad66d..5b5f7cf3737 100644 --- a/python/cugraph/tests/test_bfs.py +++ b/python/cugraph/tests/test_bfs.py @@ -12,9 +12,10 @@ # limitations under the License. 
import gc - +import pandas import cupy import numpy as np +import cudf import pytest import cugraph from cugraph.tests import utils @@ -110,7 +111,10 @@ def compare_bfs(graph_file, directed=True, return_sp_counter=False, seed=42): def _compare_bfs(G, Gnx, source): - df = cugraph.bfs(G, source, return_sp_counter=False) + df = cugraph.bfs_edges(G, source, return_sp_counter=False) + if isinstance(df, pandas.DataFrame): + df = cudf.from_pandas(df) + # This call should only contain 3 columns: # 'vertex', 'distance', 'predecessor' # It also confirms wether or not 'sp_counter' has been created by the call @@ -265,3 +269,23 @@ def test_bfs_spc_full(graph_file, directed): compare_bfs( graph_file, directed=directed, return_sp_counter=True, seed=None ) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("seed", SUBSET_SEED_OPTIONS) +def test_bfs_nx(graph_file, directed, seed): + """Test BFS traversal on random source with distance and predecessors""" + prepare_test() + + M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=False) + G = nx.from_pandas_edgelist( + M, source="0", target="1", + create_using=nx.Graph() + ) + + if isinstance(seed, int): + random.seed(seed) + start_vertex = random.sample(G.nodes(), 1)[0] + + _compare_bfs(G, G, start_vertex) diff --git a/python/cugraph/tests/test_connectivity.py b/python/cugraph/tests/test_connectivity.py index 508be9bb58d..fdc1ca6d8fd 100644 --- a/python/cugraph/tests/test_connectivity.py +++ b/python/cugraph/tests/test_connectivity.py @@ -15,6 +15,7 @@ import time from collections import defaultdict import pytest +import pandas as pd import cugraph from cugraph.tests import utils @@ -35,22 +36,12 @@ def networkx_weak_call(M): - """M = M.tocsr() - if M is None: - raise TypeError('Could not read the input graph') - if M.shape[0] != M.shape[1]: - raise TypeError('Shape is not square') - - Gnx = nx.DiGraph(M)""" Gnx = 
nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.DiGraph() ) # Weakly Connected components call: - print("Solving... ") t1 = time.time() - - # same parameters as in NVGRAPH result = nx.weakly_connected_components(Gnx) t2 = time.time() - t1 print("Time : " + str(t2)) @@ -60,7 +51,6 @@ def networkx_weak_call(M): def cugraph_weak_call(cu_M): - # cugraph Pagerank Call G = cugraph.DiGraph() G.from_cudf_edgelist(cu_M, source="0", destination="1") t1 = time.time() @@ -79,14 +69,9 @@ def networkx_strong_call(M): M, source="0", target="1", create_using=nx.DiGraph() ) - # Weakly Connected components call: - print("Solving... ") t1 = time.time() - - # same parameters as in NVGRAPH result = nx.strongly_connected_components(Gnx) t2 = time.time() - t1 - print("Time : " + str(t2)) labels = sorted(result) @@ -205,3 +190,44 @@ def test_strong_cc(graph_file): cg_vertices = sorted(lst_cg_components[idx]) assert nx_vertices == cg_vertices + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_weak_cc_nx(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + + nx_wcc = nx.weakly_connected_components(Gnx) + nx_result = sorted(nx_wcc) + + cu_wcc = cugraph.weakly_connected_components(Gnx) + pdf = pd.DataFrame.from_dict(cu_wcc, orient='index').reset_index() + pdf.columns = ["vertex", "labels"] + cu_result = pdf["labels"].nunique() + + assert len(nx_result) == cu_result + + +@pytest.mark.parametrize("graph_file", utils.STRONGDATASETS) +def test_strong_cc_nx(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + + nx_scc = nx.strongly_connected_components(Gnx) + nx_result = sorted(nx_scc) + + cu_scc = cugraph.strongly_connected_components(Gnx) + + pdf = pd.DataFrame.from_dict(cu_scc, orient='index').reset_index() + pdf.columns = ["vertex", 
"labels"] + cu_result = pdf["labels"].nunique() + + assert len(nx_result) == cu_result diff --git a/python/cugraph/tests/test_core_number.py b/python/cugraph/tests/test_core_number.py index c1b8702836f..edbc7b0597b 100644 --- a/python/cugraph/tests/test_core_number.py +++ b/python/cugraph/tests/test_core_number.py @@ -15,6 +15,7 @@ import pytest import cugraph from cugraph.tests import utils +from cugraph.utilities import df_score_to_dictionary # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -31,19 +32,38 @@ print("Networkx version : {} ".format(nx.__version__)) -def calc_core_number(graph_file): +def calc_nx_core_number(graph_file): + NM = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + NM, source="0", target="1", create_using=nx.Graph() + ) + nc = nx.core_number(Gnx) + return nc + + +def calc_cg_core_number(graph_file): M = utils.read_csv_file(graph_file) - G = cugraph.DiGraph() + G = cugraph.Graph() G.from_cudf_edgelist(M, source="0", destination="1") cn = cugraph.core_number(G) - cn = cn.sort_values("vertex").reset_index(drop=True) + return cn + +def calc_core_number(graph_file): NM = utils.read_csv_for_nx(graph_file) Gnx = nx.from_pandas_edgelist( NM, source="0", target="1", create_using=nx.Graph() ) nc = nx.core_number(Gnx) + + M = utils.read_csv_file(graph_file) + G = cugraph.Graph() + G.from_cudf_edgelist(M, source="0", destination="1") + + cn = cugraph.core_number(G) + cn = cn.sort_values("vertex").reset_index(drop=True) + pdf = [nc[k] for k in sorted(nc.keys())] cn["nx_core_number"] = pdf cn = cn.rename(columns={"core_number": "cu_core_number"}, copy=False) @@ -62,6 +82,24 @@ def calc_core_number(graph_file): def test_core_number(graph_file): gc.collect() - cn = calc_core_number(graph_file) + nx_num = calc_nx_core_number(graph_file) + cg_num = calc_cg_core_number(graph_file) + + # convert cugraph dataframe to a dictionary + cg_num_dic = 
df_score_to_dictionary(cg_num, k="core_number") + + assert cg_num_dic == nx_num + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_core_number_nx(graph_file): + gc.collect() + + NM = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + NM, source="0", target="1", create_using=nx.Graph() + ) + nc = nx.core_number(Gnx) + cc = cugraph.core_number(Gnx) - assert cn["cu_core_number"].equals(cn["nx_core_number"]) + assert nc == cc diff --git a/python/cugraph/tests/test_ecg.py b/python/cugraph/tests/test_ecg.py index b5c590a689a..4dc01c389cc 100644 --- a/python/cugraph/tests/test_ecg.py +++ b/python/cugraph/tests/test_ecg.py @@ -15,6 +15,7 @@ import pytest +import networkx as nx import cugraph from cugraph.tests import utils @@ -66,3 +67,20 @@ def test_ecg_clustering(graph_file, min_weight, ensemble_size): # Assert that the partitioning has better modularity than the random # assignment assert cu_score > (0.95 * golden_score) + + +@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("min_weight", MIN_WEIGHTS) +@pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) +def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size): + gc.collect() + + # Read in the graph and get a NetworkX graph + M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + G = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.Graph() + ) + + # Get the modularity score for partitioning versus random assignment + _ = cugraph.ecg(G, min_weight, ensemble_size, "weight") diff --git a/python/cugraph/tests/test_edge_betweenness_centrality.py b/python/cugraph/tests/test_edge_betweenness_centrality.py index e23fdc210ff..6165705a9b0 100644 --- a/python/cugraph/tests/test_edge_betweenness_centrality.py +++ b/python/cugraph/tests/test_edge_betweenness_centrality.py @@ -458,3 +458,28 @@ def test_edge_betweenness_invalid_dtype( result_dtype=result_dtype, ) compare_scores(sorted_df, 
first_key="cu_bc", second_key="ref_bc") + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +def test_edge_betweenness_centrality_nx(graph_file): + prepare_test() + + Gnx = utils.generate_nx_graph_from_file(graph_file) + + nx_bc = nx.edge_betweenness_centrality(Gnx) + cu_bc = cugraph.edge_betweenness_centrality(Gnx) + + # Calculating mismatch + networkx_bc = sorted(nx_bc.items(), key=lambda x: x[0]) + cugraph_bc = sorted(cu_bc.items(), key=lambda x: x[0]) + err = 0 + assert len(cugraph_bc) == len(networkx_bc) + for i in range(len(cugraph_bc)): + if ( + abs(cugraph_bc[i][1] - networkx_bc[i][1]) > 0.01 + and cugraph_bc[i][0] == networkx_bc[i][0] + ): + err = err + 1 + print(f"{cugraph_bc[i][1]} and {cugraph_bc[i][1]}") + print("Mismatches:", err) + assert err < (0.01 * len(cugraph_bc)) diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index 44c856cf3dc..59d0d5c4e09 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -98,12 +98,13 @@ def compare_graphs(nx_graph, cu_graph): if len(edgelist_df.columns) > 2: df0 = cudf.from_pandas(nx.to_pandas_edgelist(nx_graph)) - df0 = df0.sort_values(by=["source", "target"]).reset_index(drop=True) - df1 = df.sort_values(by=["source", "target"]).reset_index(drop=True) - if not df0["weight"].equals(df1["weight"]): + merge = df.merge(df0, on=["source", "target"], + suffixes=("_cugraph", "_nx")) + print("merge = \n", merge) + print(merge[merge.weight_cugraph != merge.weight_nx]) + if not merge["weight_cugraph"].equals(merge["weight_nx"]): print('weights different') - print('df0 = \n', df0) - print('df1 = \n', df1) + print(merge[merge.weight_cugraph != merge.weight_nx]) return False return True @@ -383,65 +384,6 @@ def test_view_edge_list_for_Graph(graph_file): ).all() -# Test -@pytest.mark.parametrize("graph_file", utils.DATASETS) -def test_networkx_compatibility(graph_file): - gc.collect() - - # test from_cudf_edgelist() - - M = 
utils.read_csv_for_nx(graph_file) - - df = pd.DataFrame() - df["source"] = pd.Series(M["0"]) - df["target"] = pd.Series(M["1"]) - df["weight"] = pd.Series(M.weight) - gdf = cudf.from_pandas(df) - - Gnx = nx.from_pandas_edgelist( - df, - source="source", - target="target", - edge_attr="weight", - create_using=nx.DiGraph, - ) - G = cugraph.from_cudf_edgelist( - gdf, - source="source", - destination="target", - edge_attr="weight", - create_using=cugraph.DiGraph, - ) - - print('g from gdf = \n', gdf) - print('nx from df = \n', df) - - t1 = time.time() - assert compare_graphs(Gnx, G) - t2 = time.time() - t1 - print('compare_graphs time: ', t2) - - Gnx.clear() - G.clear() - Gnx = nx.from_pandas_edgelist( - df, source="source", target="target", create_using=nx.DiGraph - ) - G = cugraph.from_cudf_edgelist( - gdf, - source="source", - destination="target", - create_using=cugraph.DiGraph, - ) - - t1 = time.time() - assert compare_graphs(Gnx, G) - t2 = time.time() - t1 - print('compare_graphs time: ', t2) - - Gnx.clear() - G.clear() - - # Test @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_consolidation(graph_file): diff --git a/python/cugraph/tests/test_hits.py b/python/cugraph/tests/test_hits.py index c8a9274e078..30b6f20f478 100644 --- a/python/cugraph/tests/test_hits.py +++ b/python/cugraph/tests/test_hits.py @@ -137,3 +137,20 @@ def test_hits(graph_file, max_iter, tol): assert cugraph_hits["authorities"].is_monotonic_decreasing assert cugraph_hits["nx_authorities"].is_monotonic_decreasing + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +@pytest.mark.parametrize("tol", TOLERANCE) +def test_hits_nx(graph_file, max_iter, tol): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + nx_hubs, nx_authorities = nx.hits(Gnx, max_iter, tol, normalized=True) + cg_hubs, cg_authorities = 
cugraph.hits(Gnx, max_iter, tol, normalized=True) + + # assert nx_hubs == cg_hubs + # assert nx_authorities == cg_authorities diff --git a/python/cugraph/tests/test_jaccard.py b/python/cugraph/tests/test_jaccard.py index 7cb7b274434..d5de073189d 100644 --- a/python/cugraph/tests/test_jaccard.py +++ b/python/cugraph/tests/test_jaccard.py @@ -197,3 +197,24 @@ def test_jaccard_two_hop_edge_vals(graph_file): for i in range(len(df)): diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) assert diff < 1.0e-6 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_jaccard_nx(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.Graph() + ) + + nx_j = nx.jaccard_coefficient(Gnx) + nv_js = sorted(nx_j, key=len, reverse=True) + + cg_j = cugraph.jaccard_coefficient(Gnx) + + assert len(nv_js) > len(cg_j) + + # FIXME: Nx does a full all-pair Jaccard. + # cuGraph does a limited 1-hop Jaccard + # assert nx_j == cg_j diff --git a/python/cugraph/tests/test_k_core.py b/python/cugraph/tests/test_k_core.py index 59f0b3fb301..c05cb1dd86e 100644 --- a/python/cugraph/tests/test_k_core.py +++ b/python/cugraph/tests/test_k_core.py @@ -89,3 +89,17 @@ def test_core_number_Graph(graph_file): cu_kcore, nx_kcore = calc_k_cores(graph_file, False) assert compare_edges(cu_kcore, nx_kcore) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_core_number_Graph_nx(graph_file): + gc.collect() + + NM = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + NM, source="0", target="1", create_using=nx.Graph() + ) + nc = nx.k_core(Gnx) + cc = cugraph.k_core(Gnx) + + assert nx.is_isomorphic(nc, cc) diff --git a/python/cugraph/tests/test_k_truss_subgraph.py b/python/cugraph/tests/test_k_truss_subgraph.py index 314a4f62618..e9ccac81cf6 100644 --- a/python/cugraph/tests/test_k_truss_subgraph.py +++ b/python/cugraph/tests/test_k_truss_subgraph.py @@ 
-46,16 +46,7 @@ def ktruss_ground_truth(graph_file): return df -def cugraph_k_truss_subgraph(graph_file, k): - cu_M = utils.read_csv_file(graph_file) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - k_subgraph = cugraph.ktruss_subgraph(G, k) - return k_subgraph - - -def compare_k_truss(graph_file, k, ground_truth_file): - k_truss_cugraph = cugraph_k_truss_subgraph(graph_file, k) +def compare_k_truss(k_truss_cugraph, k, ground_truth_file): k_truss_nx = ktruss_ground_truth(ground_truth_file) edgelist_df = k_truss_cugraph.view_edge_list() @@ -82,4 +73,29 @@ def compare_k_truss(graph_file, k, ground_truth_file): def test_ktruss_subgraph_Graph(graph_file, nx_ground_truth): gc.collect() - compare_k_truss(graph_file, 5, nx_ground_truth) + k = 5 + cu_M = utils.read_csv_file(graph_file) + G = cugraph.Graph() + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + k_subgraph = cugraph.ktruss_subgraph(G, k) + + compare_k_truss(k_subgraph, k, nx_ground_truth) + + +@pytest.mark.parametrize("graph_file, nx_ground_truth", utils.DATASETS_KTRUSS) +def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth): + gc.collect() + + k = 5 + M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + G = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.Graph() + ) + k_subgraph = cugraph.k_truss(G, k) + df = nx.to_pandas_edgelist(k_subgraph) + + k_truss_nx = nx.k_truss(G, k) + nx_df = nx.to_pandas_edgelist(k_truss_nx) + + assert len(df) == len(nx_df) diff --git a/python/cugraph/tests/test_katz_centrality.py b/python/cugraph/tests/test_katz_centrality.py index 62f30e22a57..a2a03c1518b 100644 --- a/python/cugraph/tests/test_katz_centrality.py +++ b/python/cugraph/tests/test_katz_centrality.py @@ -80,3 +80,35 @@ def test_katz_centrality(graph_file): topKCU = topKVertices(katz_scores, "cu_katz", 10) assert topKNX.equals(topKCU) + + +@pytest.mark.parametrize("graph_file", 
utils.DATASETS_UNDIRECTED) +def test_katz_centrality_nx(graph_file): + gc.collect() + + NM = utils.read_csv_for_nx(graph_file) + Gnx = nx.from_pandas_edgelist( + NM, create_using=nx.DiGraph(), source="0", target="1" + ) + + G = cugraph.utilities.convert_from_nx(Gnx) + largest_out_degree = G.degrees().nlargest(n=1, columns="out_degree") + largest_out_degree = largest_out_degree["out_degree"].iloc[0] + katz_alpha = 1 / (largest_out_degree + 1) + + nk = nx.katz_centrality(Gnx, alpha=katz_alpha) + ck = cugraph.katz_centrality(Gnx, alpha=None, max_iter=1000) + + # Calculating mismatch + nk = sorted(nk.items(), key=lambda x: x[0]) + ck = sorted(ck.items(), key=lambda x: x[0]) + err = 0 + assert len(ck) == len(nk) + for i in range(len(ck)): + if ( + abs(ck[i][1] - nk[i][1]) > 0.1 + and ck[i][0] == nk[i][0] + ): + err = err + 1 + print("Mismatches:", err) + assert err < (0.1 * len(ck)) diff --git a/python/cugraph/tests/test_leiden.py b/python/cugraph/tests/test_leiden.py index 7f7b4b577fe..d6a7f86b5c5 100644 --- a/python/cugraph/tests/test_leiden.py +++ b/python/cugraph/tests/test_leiden.py @@ -16,6 +16,7 @@ import pytest +import networkx as nx import cugraph from cugraph.tests import utils @@ -30,34 +31,24 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) -def cugraph_leiden(cu_M, edgevals=False): +def cugraph_leiden(G, edgevals=False): - G = cugraph.Graph() - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") # cugraph Louvain Call t1 = time.time() parts, mod = cugraph.leiden(G) t2 = time.time() - t1 - print("Cugraph Time : " + str(t2)) + print("Cugraph Leiden Time : " + str(t2)) return parts, mod -def cugraph_louvain(cu_M, edgevals=False): +def cugraph_louvain(G, edgevals=False): - G = cugraph.Graph() - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") # 
cugraph Louvain Call t1 = time.time() parts, mod = cugraph.louvain(G) t2 = time.time() - t1 - print("Cugraph Time : " + str(t2)) + print("Cugraph Louvain Time : " + str(t2)) return parts, mod @@ -65,10 +56,46 @@ def cugraph_louvain(cu_M, edgevals=False): @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_leiden(graph_file): gc.collect() + edgevals = True cu_M = utils.read_csv_file(graph_file) - leiden_parts, leiden_mod = cugraph_leiden(cu_M, edgevals=True) - louvain_parts, louvain_mod = cugraph_louvain(cu_M, edgevals=True) + + G = cugraph.Graph() + if edgevals: + G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + else: + G.from_cudf_edgelist(cu_M, source="0", destination="1") + + leiden_parts, leiden_mod = cugraph_leiden(G, edgevals=True) + louvain_parts, louvain_mod = cugraph_louvain(G, edgevals=True) + + # Calculating modularity scores for comparison + assert leiden_mod >= (0.99 * louvain_mod) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_leiden_nx(graph_file): + gc.collect() + edgevals = True + + NM = utils.read_csv_for_nx(graph_file) + + if edgevals: + G = nx.from_pandas_edgelist(NM, + create_using=nx.Graph(), + source="0", + target="1" + ) + else: + G = nx.from_pandas_edgelist(NM, + create_using=nx.Graph(), + source="0", + target="1", + edge_attr="2" + ) + + leiden_parts, leiden_mod = cugraph_leiden(G, edgevals=True) + louvain_parts, louvain_mod = cugraph_louvain(G, edgevals=True) # Calculating modularity scores for comparison assert leiden_mod >= (0.99 * louvain_mod) diff --git a/python/cugraph/tests/test_louvain.py b/python/cugraph/tests/test_louvain.py index b4b0b515899..d6b0030eb73 100644 --- a/python/cugraph/tests/test_louvain.py +++ b/python/cugraph/tests/test_louvain.py @@ -74,17 +74,21 @@ def test_louvain_with_edgevals(graph_file): M = utils.read_csv_for_nx(graph_file) cu_M = utils.read_csv_file(graph_file) cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True) + nx_parts = networkx_call(M) # 
Calculating modularity scores for comparison Gnx = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) - cu_map = {0: 0} - for i in range(len(cu_parts)): - cu_map[cu_parts["vertex"][i]] = cu_parts["partition"][i] + + cu_parts = cu_parts.to_pandas() + cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + assert set(nx_parts.keys()) == set(cu_map.keys()) + cu_mod_nx = community.modularity(cu_map, Gnx) nx_mod = community.modularity(nx_parts, Gnx) + assert len(cu_parts) == len(nx_parts) assert cu_mod > (0.82 * nx_mod) assert abs(cu_mod - cu_mod_nx) < 0.0001 @@ -103,9 +107,10 @@ def test_louvain(graph_file): Gnx = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) - cu_map = {0: 0} - for i in range(len(cu_parts)): - cu_map[cu_parts["vertex"][i]] = cu_parts["partition"][i] + + cu_parts = cu_parts.to_pandas() + cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition'])) + assert set(nx_parts.keys()) == set(cu_map.keys()) cu_mod_nx = community.modularity(cu_map, Gnx) diff --git a/python/cugraph/tests/test_nx_convert.py b/python/cugraph/tests/test_nx_convert.py new file mode 100644 index 00000000000..08a96a801e2 --- /dev/null +++ b/python/cugraph/tests/test_nx_convert.py @@ -0,0 +1,113 @@ +# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import pytest +import cudf +import cugraph +from cugraph.tests import utils + +# Temporarily suppress warnings till networkX fixes deprecation warnings +# (Using or importing the ABCs from 'collections' instead of from +# 'collections.abc' is deprecated, and in 3.8 it will stop working) for +# python 3.7. Also, this import networkx needs to be relocated in the +# third-party group once this gets fixed. +import warnings + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + import networkx as nx + + +def _compare_graphs(nxG, cuG, has_wt=True): + assert nxG.number_of_nodes() == cuG.number_of_nodes() + assert nxG.number_of_edges() == cuG.number_of_edges() + + cu_df = cuG.view_edge_list().to_pandas() + if has_wt is True: + cu_df = cu_df.drop(columns=["weights"]) + cu_df = cu_df.sort_values(by=["src", "dst"]).reset_index(drop=True) + + nx_df = nx.to_pandas_edgelist(nxG) + if has_wt is True: + nx_df = nx_df.drop(columns=["weight"]) + nx_df = nx_df.rename(columns={"source": "src", "target": "dst"}) + nx_df = nx_df.astype('int32') + nx_df = nx_df.sort_values(by=["src", "dst"]).reset_index(drop=True) + + assert cu_df.to_dict() == nx_df.to_dict() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_networkx_compatibility(graph_file): + # test to make sure cuGraph and Nx build similar Graphs + + gc.collect() + + # Read in the graph + M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + + # create a NetworkX DiGraph + nxG = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr="weight", + create_using=nx.DiGraph() + ) + + # create a cuGraph DiGraph + gdf = cudf.from_pandas(M) + gdf = gdf.rename(columns={"weight": "weights"}) + cuG = cugraph.from_cudf_edgelist( + gdf, + source="0", + destination="1", + edge_attr="weights", + create_using=cugraph.DiGraph, + ) + + _compare_graphs(nxG, cuG) + + +# Test +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def 
test_nx_convert(graph_file): + gc.collect() + + # read data and create a Nx Graph + nx_df = utils.read_csv_for_nx(graph_file) + nxG = nx.from_pandas_edgelist(nx_df, "0", "1", create_using=nx.DiGraph) + + cuG = cugraph.utilities.convert_from_nx(nxG) + + _compare_graphs(nxG, cuG, has_wt=False) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_nx_convert_multicol(graph_file): + gc.collect() + + # read data and create a Nx Graph + nx_df = utils.read_csv_for_nx(graph_file) + + G = nx.DiGraph() + + for row in nx_df.iterrows(): + G.add_edge( + row[1]["0"], row[1]["1"], count=[row[1]["0"], row[1]["1"]] + ) + + nxG = nx.from_pandas_edgelist(nx_df, "0", "1") + + cuG = cugraph.utilities.convert_from_nx(nxG) + + assert nxG.number_of_nodes() == cuG.number_of_nodes() + assert nxG.number_of_edges() == cuG.number_of_edges() diff --git a/python/cugraph/tests/test_pagerank.py b/python/cugraph/tests/test_pagerank.py index b58ec2d9bc9..5a5de379f9d 100644 --- a/python/cugraph/tests/test_pagerank.py +++ b/python/cugraph/tests/test_pagerank.py @@ -46,10 +46,8 @@ def cudify(d): return cuD -def cugraph_call(cu_M, max_iter, tol, alpha, personalization, nstart): +def cugraph_call(G, max_iter, tol, alpha, personalization, nstart): # cugraph Pagerank Call - G = cugraph.DiGraph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") t1 = time.time() df = cugraph.pagerank( G, @@ -74,26 +72,32 @@ def cugraph_call(cu_M, max_iter, tol, alpha, personalization, nstart): return sorted_pr +# need a different function since the Nx version returns a dictionary +def cugraph_nx_call(G, max_iter, tol, alpha, personalization, nstart): + # cugraph Pagerank Call + t1 = time.time() + pr = cugraph.pagerank( + G, + alpha=alpha, + max_iter=max_iter, + tol=tol, + personalization=personalization, + nstart=nstart, + ) + t2 = time.time() - t1 + print("Cugraph Time : " + str(t2)) + + return pr + + # The function selects personalization_perc% of accessible vertices in graph M # and randomly 
assigns them personalization values -def networkx_call(M, max_iter, tol, alpha, personalization_perc): - """nnz_per_row = {r: 0 for r in range(M.get_shape()[0])} - for nnz in range(M.getnnz()): - nnz_per_row[M.row[nnz]] = 1 + nnz_per_row[M.row[nnz]] - for nnz in range(M.getnnz()): - M.data[nnz] = 1.0/float(nnz_per_row[M.row[nnz]]) - - M = M.tocsr() - if M is None: - raise TypeError('Could not read the input graph') - if M.shape[0] != M.shape[1]: - raise TypeError('Shape is not square') - """ +def networkx_call(Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx): + personalization = None if personalization_perc != 0: personalization = {} - nnz_vtx = np.unique(M) - print(nnz_vtx) + # print(nnz_vtx) personalization_count = int( (nnz_vtx.size * personalization_perc) / 100.0 ) @@ -101,34 +105,18 @@ def networkx_call(M, max_iter, tol, alpha, personalization_perc): nnz_vtx = np.random.choice( nnz_vtx, min(nnz_vtx.size, personalization_count), replace=False ) - print(nnz_vtx) + # print(nnz_vtx) nnz_val = np.random.random(nnz_vtx.size) nnz_val = nnz_val / sum(nnz_val) - print(nnz_val) + # print(nnz_val) for vtx, val in zip(nnz_vtx, nnz_val): personalization[vtx] = val - # should be autosorted, but check just to make sure - """if not M.has_sorted_indices: - print('sort_indices ... ') - M.sort_indices() - """ - # in NVGRAPH tests we read as CSR and feed as CSC, - # so here we do this explicitly - print("Format conversion ... ") - - # Directed NetworkX graph - Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", create_using=nx.DiGraph() - ) - z = {k: 1.0 / Gnx.number_of_nodes() for k in range(Gnx.number_of_nodes())} # Networkx Pagerank Call - print("Solving... 
") t1 = time.time() - # same parameters as in NVGRAPH pr = nx.pagerank( Gnx, alpha=alpha, @@ -170,9 +158,15 @@ def test_pagerank( ): gc.collect() + # NetworkX PageRank M = utils.read_csv_for_nx(graph_file) + nnz_vtx = np.unique(M) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + networkx_pr, networkx_prsn = networkx_call( - M, max_iter, tol, alpha, personalization_perc + Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx ) cu_nstart = None @@ -180,12 +174,62 @@ def test_pagerank( cu_nstart = cudify(networkx_pr) max_iter = 5 cu_prsn = cudify(networkx_prsn) + + # cuGraph PageRank cu_M = utils.read_csv_file(graph_file) - cugraph_pr = cugraph_call(cu_M, max_iter, tol, alpha, cu_prsn, cu_nstart) + G = cugraph.DiGraph() + G.from_cudf_edgelist(cu_M, source="0", destination="1") + + cugraph_pr = cugraph_call(G, max_iter, tol, alpha, cu_prsn, cu_nstart) # Calculating mismatch + networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0]) + err = 0 + assert len(cugraph_pr) == len(networkx_pr) + for i in range(len(cugraph_pr)): + if ( + abs(cugraph_pr[i][1] - networkx_pr[i][1]) > tol * 1.1 + and cugraph_pr[i][0] == networkx_pr[i][0] + ): + err = err + 1 + print("Mismatches:", err) + assert err < (0.01 * len(cugraph_pr)) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) +@pytest.mark.parametrize("tol", TOLERANCE) +@pytest.mark.parametrize("alpha", ALPHA) +@pytest.mark.parametrize("personalization_perc", PERSONALIZATION_PERC) +@pytest.mark.parametrize("has_guess", HAS_GUESS) +def test_pagerank_nx( + graph_file, max_iter, tol, alpha, personalization_perc, has_guess +): + gc.collect() + + # NetworkX PageRank + M = utils.read_csv_for_nx(graph_file) + nnz_vtx = np.unique(M) + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + + networkx_pr, networkx_prsn = networkx_call( + Gnx, max_iter, tol, alpha, 
personalization_perc, nnz_vtx + ) + cu_nstart = None + if has_guess == 1: + cu_nstart = cudify(networkx_pr) + max_iter = 5 + cu_prsn = cudify(networkx_prsn) + + # cuGraph PageRank with Nx Graph + cugraph_pr = cugraph_nx_call(Gnx, max_iter, tol, alpha, cu_prsn, cu_nstart) + + # Calculating mismatch networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0]) + cugraph_pr = sorted(cugraph_pr.items(), key=lambda x: x[0]) err = 0 assert len(cugraph_pr) == len(networkx_pr) for i in range(len(cugraph_pr)): @@ -194,5 +238,6 @@ def test_pagerank( and cugraph_pr[i][0] == networkx_pr[i][0] ): err = err + 1 + print(f"{cugraph_pr[i][1]} and {cugraph_pr[i][1]}") print("Mismatches:", err) assert err < (0.01 * len(cugraph_pr)) diff --git a/python/cugraph/tests/test_sssp.py b/python/cugraph/tests/test_sssp.py index 3c3b575fdb5..8dfcc60da3c 100644 --- a/python/cugraph/tests/test_sssp.py +++ b/python/cugraph/tests/test_sssp.py @@ -16,7 +16,7 @@ import numpy as np import pytest - +import cudf import cugraph from cugraph.tests import utils @@ -213,3 +213,46 @@ def test_sssp_data_type_conversion(graph_file, source): err = err + 1 assert err == 0 + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("source", SOURCES) +def test_sssp_nx(graph_file, source): + print("DOING test_sssp : " + graph_file + "\n\n\n") + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + nx_paths, Gnx = networkx_call(M, source) + + df = cugraph.shortest_path(Gnx, source) + df = cudf.from_pandas(df) + + if np.issubdtype(df["distance"].dtype, np.integer): + max_val = np.iinfo(df["distance"].dtype).max + else: + max_val = np.finfo(df["distance"].dtype).max + + verts_np = df["vertex"].to_array() + dist_np = df["distance"].to_array() + pred_np = df["predecessor"].to_array() + cu_paths = dict(zip(verts_np, zip(dist_np, pred_np))) + + # Calculating mismatch + err = 0 + for vid in cu_paths: + # Validate vertices that are reachable + # NOTE : If distance type is float64 then 
cu_paths[vid][0] + # should be compared against np.finfo(np.float64).max) + if cu_paths[vid][0] != max_val: + if cu_paths[vid][0] != nx_paths[vid]: + err = err + 1 + # check pred dist + 1 = current dist (since unweighted) + pred = cu_paths[vid][1] + if vid != source and cu_paths[pred][0] + 1 != cu_paths[vid][0]: + err = err + 1 + else: + if vid in nx_paths.keys(): + err = err + 1 + + assert err == 0 + print("DONE test_sssp : " + graph_file + "\n\n\n") diff --git a/python/cugraph/tests/test_subgraph_extraction.py b/python/cugraph/tests/test_subgraph_extraction.py index 9192495c6b2..a4f36af994a 100644 --- a/python/cugraph/tests/test_subgraph_extraction.py +++ b/python/cugraph/tests/test_subgraph_extraction.py @@ -100,3 +100,33 @@ def test_subgraph_extraction_Graph(graph_file): cu_sg = cugraph_call(M, verts, False) nx_sg = nx_call(M, verts, False) assert compare_edges(cu_sg, nx_sg) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_subgraph_extraction_Graph_nx(graph_file): + gc.collect() + directed = False + verts = np.zeros(3, dtype=np.int32) + verts[0] = 0 + verts[1] = 1 + verts[2] = 17 + + M = utils.read_csv_for_nx(graph_file) + + if directed: + G = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.DiGraph() + ) + else: + G = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.Graph() + ) + + nx_sub = nx.subgraph(G, verts) + nx_df = nx.to_pandas_edgelist(nx_sub).to_dict() + + cu_verts = cudf.Series(verts) + cu_sub = cugraph.subgraph(G, cu_verts) + cu_df = nx.to_pandas_edgelist(cu_sub).to_dict() + + assert nx_df == cu_df diff --git a/python/cugraph/tests/test_symmetrize.py b/python/cugraph/tests/test_symmetrize.py index 4a49eddb70b..4a71dca5e96 100644 --- a/python/cugraph/tests/test_symmetrize.py +++ b/python/cugraph/tests/test_symmetrize.py @@ -19,6 +19,10 @@ import cudf import cugraph from cugraph.tests import utils +import cugraph.comms as Comms +from dask.distributed import Client +from dask_cuda import 
LocalCUDACluster +from cugraph.dask.common.mg_utils import is_single_gpu def test_version(): @@ -198,13 +202,68 @@ def test_symmetrize_weighted(graph_file): compare(cu_M["0"], cu_M["1"], cu_M["2"], sym_src, sym_dst, sym_w) +@pytest.fixture +def client_connection(): + cluster = LocalCUDACluster() + client = Client(cluster) + Comms.initialize() + + yield client + + Comms.destroy() + client.close() + cluster.close() + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_mg_symmetrize(graph_file, client_connection): + gc.collect() + + ddf = utils.read_dask_cudf_csv_file(graph_file) + sym_src, sym_dst = cugraph.symmetrize(ddf["src"], ddf["dst"]) + + # convert to regular cudf to facilitate comparison + df = ddf.compute() + + compare( + df["src"], df["dst"], None, sym_src.compute(), sym_dst.compute(), None + ) + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_mg_symmetrize_df(graph_file, client_connection): + gc.collect() + + ddf = utils.read_dask_cudf_csv_file(graph_file) + sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight") + + # convert to regular cudf to facilitate comparison + df = ddf.compute() + sym_df = sym_ddf.compute() + + compare( + df["src"], + df["dst"], + df["weight"], + sym_df["src"], + sym_df["dst"], + sym_df["weight"], + ) + + # Test # NOTE: see https://github.com/rapidsai/cudf/issues/2636 # drop_duplicates doesn't work well with the pool allocator # list(product([False, True], [False, True]))) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) def test_symmetrize_df(graph_file): gc.collect() diff --git a/python/cugraph/tests/test_triangle_count.py b/python/cugraph/tests/test_triangle_count.py index 975ddd82470..ff28f55838d 100644 --- 
a/python/cugraph/tests/test_triangle_count.py +++ b/python/cugraph/tests/test_triangle_count.py @@ -84,3 +84,21 @@ def test_triangles_edge_vals(graph_file): cu_count = cugraph_call(M, edgevals=True) nx_count = networkx_call(M) assert cu_count == nx_count + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +def test_triangles_nx(graph_file): + gc.collect() + + M = utils.read_csv_for_nx(graph_file) + G = nx.from_pandas_edgelist( + M, source="0", target="1", create_using=nx.Graph() + ) + + cu_count = cugraph.triangles(G) + dic = nx.triangles(G) + nx_count = 0 + for i in dic.keys(): + nx_count += dic[i] + + assert cu_count == nx_count diff --git a/python/cugraph/tests/utils.py b/python/cugraph/tests/utils.py index e68f934c619..88f79f65b4d 100644 --- a/python/cugraph/tests/utils.py +++ b/python/cugraph/tests/utils.py @@ -15,6 +15,7 @@ import cugraph import pandas as pd import networkx as nx +import numpy as np import dask_cudf import os from cugraph.dask.common.mg_utils import (get_client) @@ -139,3 +140,63 @@ def build_mg_batch_cu_and_nx_graphs(graph_file, directed=True): directed=directed) Gnx = generate_nx_graph_from_file(graph_file, directed=directed) return G, Gnx + + +def random_edgelist(e=1024, ef=16, + dtypes={"src": np.int32, "dst": np.int32, "val": float}, + drop_duplicates=True, seed=None): + """ Create a random edge list + + Parameters + ---------- + e : int + Number of edges + ef : int + Edge factor (average number of edges per vertex) + dtypes : dict + Mapping of column names to types. 
+ Supported type is {"src": int, "dst": int, "val": float} + drop_duplicates + Drop duplicates + seed : int (optional) + Randomstate seed + + Examples + -------- + >>> from cugraph.tests import utils + >>> # genrates 20 df with 100M edges each and write to disk + >>> for x in range(20): + >>> df = utils.random_edgelist(e=100000000, ef=64, + >>> dtypes={'src':np.int32, 'dst':np.int32}, + >>> seed=x) + >>> df.to_csv('df'+str(x), header=False, index=False) + >>> #df.to_parquet('files_parquet/df'+str(x), index=False) + """ + state = np.random.RandomState(seed) + columns = dict((k, make[dt](e // ef, e, state)) + for k, dt in dtypes.items()) + + df = pd.DataFrame(columns) + if drop_duplicates: + df = df.drop_duplicates() + print("Generated "+str(df.shape[0])+" edges") + return cudf.from_pandas(df) + + +def make_int32(v, e, rstate): + return rstate.randint(low=0, high=v, size=e, dtype=np.int32) + + +def make_int64(v, e, rstate): + return rstate.randint(low=0, high=v, size=e, dtype=np.int64) + + +def make_float(v, e, rstate): + return rstate.rand(e) * 2 - 1 + + +make = { + float: make_float, + np.int32: make_int32, + np.int64: make_int64 +} diff --git a/python/cugraph/traversal/__init__.py b/python/cugraph/traversal/__init__.py index 288c4edd2e3..52a1b9e2cfb 100644 --- a/python/cugraph/traversal/__init__.py +++ b/python/cugraph/traversal/__init__.py @@ -12,4 +12,7 @@ # limitations under the License. 
from cugraph.traversal.bfs import bfs -from cugraph.traversal.sssp import sssp, filter_unreachable +from cugraph.traversal.bfs import bfs_edges +from cugraph.traversal.sssp import sssp +from cugraph.traversal.sssp import shortest_path +from cugraph.traversal.sssp import filter_unreachable diff --git a/python/cugraph/traversal/bfs.pxd b/python/cugraph/traversal/bfs.pxd index ea9f3e4a0e4..0502754c161 100644 --- a/python/cugraph/traversal/bfs.pxd +++ b/python/cugraph/traversal/bfs.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * from libcpp cimport bool diff --git a/python/cugraph/traversal/bfs.py b/python/cugraph/traversal/bfs.py index 3a977a06baf..7e03d8ab016 100644 --- a/python/cugraph/traversal/bfs.py +++ b/python/cugraph/traversal/bfs.py @@ -15,6 +15,7 @@ from cugraph.traversal import bfs_wrapper from cugraph.structure.graph import Graph +from cugraph.utilities import check_nx_graph def bfs(G, start, return_sp_counter=False): @@ -72,3 +73,71 @@ def bfs(G, start, return_sp_counter=False): df["predecessor"].fillna(-1, inplace=True) return df + + +def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None, + return_sp_counter=False): + """ + Find the distances and predecessors for a breadth first traversal of a + graph. 
+ + Parameters + ---------- + G : cugraph.graph or NetworkX.Graph + graph descriptor that contains connectivity information + source : Integer + The starting vertex index + reverse : boolean + If a directed graph, then process edges in a reverse direction + Currently not implemented + depth_limit : Int or None + Limit the depth of the search + Currently not implemented + sort_neighbors : None or Function + Currently not implemented + return_sp_counter : bool, optional, default=False + Indicates if shortest path counters should be returned + + Returns + ------- + df : cudf.DataFrame or Pandas.DataFrame + df['vertex'][i] gives the vertex id of the i'th vertex + + df['distance'][i] gives the path distance for the i'th vertex from the + starting vertex + + df['predecessor'][i] gives for the i'th vertex the vertex it was + reached from in the traversal + + df['sp_counter'][i] gives for the i'th vertex the number of shortest + path leading to it during traversal (Only if retrun_sp_counter is True) + + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> df = cugraph.bfs_edges(G, 0) + """ + + if reverse is True: + raise NotImplementedError( + "reverse processing of graph is " + "currently not supported" + ) + + if depth_limit is not None: + raise NotImplementedError( + "depth limit implementation of BFS " + "is not currently supported" + ) + + G, isNx = check_nx_graph(G) + + df = bfs(G, source, return_sp_counter) + + if isNx is True: + df = df.to_pandas() + + return df diff --git a/python/cugraph/traversal/bfs_wrapper.pyx b/python/cugraph/traversal/bfs_wrapper.pyx index dbbda90b17e..c13e1eb58ee 100644 --- a/python/cugraph/traversal/bfs_wrapper.pyx +++ b/python/cugraph/traversal/bfs_wrapper.pyx @@ -17,8 +17,8 @@ # cython: language_level = 3 cimport cugraph.traversal.bfs as c_bfs -from 
cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.float cimport FLT_MAX_EXP @@ -56,7 +56,7 @@ def bfs(input_graph, start, directed=True, # Step 3: Extract CSR offsets, indices, weights are not expected # - offsets: int (signed, 32-bit) # - indices: int (signed, 32-bit) - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) c_offsets_ptr = offsets.__cuda_array_interface__['data'][0] c_indices_ptr = indices.__cuda_array_interface__['data'][0] diff --git a/python/cugraph/traversal/sssp.pxd b/python/cugraph/traversal/sssp.pxd index 7067a5e983f..8f36ff12ae8 100644 --- a/python/cugraph/traversal/sssp.pxd +++ b/python/cugraph/traversal/sssp.pxd @@ -16,7 +16,7 @@ # cython: embedsignature = True # cython: language_level = 3 -from cugraph.structure.graph_new cimport * +from cugraph.structure.graph_primtypes cimport * cdef extern from "algorithms.hpp" namespace "cugraph": diff --git a/python/cugraph/traversal/sssp.py b/python/cugraph/traversal/sssp.py index 546407af2b6..a40755c6602 100644 --- a/python/cugraph/traversal/sssp.py +++ b/python/cugraph/traversal/sssp.py @@ -14,6 +14,7 @@ from cugraph.traversal import sssp_wrapper import numpy as np import cudf +from cugraph.utilities import check_nx_graph def sssp(G, source): @@ -38,11 +39,14 @@ def sssp(G, source): Returns ------- df : cudf.DataFrame - df['vertex'][i] gives the vertex id of the i'th vertex. - df['distance'][i] gives the path distance for the i'th vertex from the - starting vertex. - df['predecessor'][i] gives the vertex id of the vertex that was reached - before the i'th vertex in the traversal. 
+ df['vertex'] + vertex id + + df['distance'] + gives the path distance from the starting vertex + + df['predecessor'] + the vertex it was reached from Examples -------- @@ -94,3 +98,52 @@ def filter_unreachable(df): return df[df.distance != max_val] else: raise TypeError("distance type unsupported") + + +def shortest_path(G, source): + """ + Compute the distance and predecessors for shortest paths from the specified + source to all the vertices in the graph. The distances column will store + the distance from the source to each vertex. The predecessors column will + store each vertex's predecessor in the shortest path. Vertices that are + unreachable will have a distance of infinity denoted by the maximum value + of the data type and the predecessor set as -1. The source vertex's + predecessor is also set to -1. Graphs with negative weight cycles are not + supported. + + Parameters + ---------- + graph : cuGraph.Graph or NetworkX.Graph + cuGraph graph descriptor with connectivity information. Edge weights, + if present, should be single or double precision floating point values. + source : int + Index of the source vertex. 
+ + Returns + ------- + df : cudf.DataFrame or pandas.DataFrame + df['vertex'] + vertex id + + df['distance'] + gives the path distance from the starting vertex + + df['predecessor'] + the vertex it was reached from + + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, source='0', destination='1') + >>> distances = cugraph.shortest_path(G, 0) + """ + G, isNx = check_nx_graph(G) + + df = sssp(G, source) + + if isNx is True: + df = df.to_pandas() + + return df diff --git a/python/cugraph/traversal/sssp_wrapper.pyx b/python/cugraph/traversal/sssp_wrapper.pyx index ab844819291..1504eee53e1 100644 --- a/python/cugraph/traversal/sssp_wrapper.pyx +++ b/python/cugraph/traversal/sssp_wrapper.pyx @@ -18,8 +18,8 @@ cimport cugraph.traversal.sssp as c_sssp cimport cugraph.traversal.bfs as c_bfs -from cugraph.structure.graph_new cimport * -from cugraph.structure import graph_new_wrapper +from cugraph.structure.graph_primtypes cimport * +from cugraph.structure import graph_primtypes_wrapper from libcpp cimport bool from libc.stdint cimport uintptr_t @@ -60,8 +60,8 @@ def sssp(input_graph, source): # - indices: int (signed, 32-bit) # - weights: float / double # Extract data_type from weights (not None: float / double, None: signed int 32-bit) - [offsets, indices] = graph_new_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - [weights] = graph_new_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) c_offsets_ptr = offsets.__cuda_array_interface__['data'][0] c_indices_ptr = indices.__cuda_array_interface__['data'][0] diff --git 
a/python/cugraph/utilities/__init__.py b/python/cugraph/utilities/__init__.py index 19b7c347420..c42e28dd2cd 100644 --- a/python/cugraph/utilities/__init__.py +++ b/python/cugraph/utilities/__init__.py @@ -13,3 +13,9 @@ # from cugraph.utilities.grmat import grmat_gen # from cugraph.utilities.pointer_utils import device_of_gpu_pointer +from cugraph.utilities.nx_factory import convert_from_nx +from cugraph.utilities.nx_factory import check_nx_graph +from cugraph.utilities.nx_factory import df_score_to_dictionary +from cugraph.utilities.nx_factory import df_edge_score_to_dictionary +from cugraph.utilities.nx_factory import cugraph_to_nx +from cugraph.utilities.nx_factory import is_networkx_graph diff --git a/python/cugraph/utilities/nx_factory.py b/python/cugraph/utilities/nx_factory.py new file mode 100644 index 00000000000..e880df5f32e --- /dev/null +++ b/python/cugraph/utilities/nx_factory.py @@ -0,0 +1,164 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import networkx as nx +import cugraph +from cudf import from_pandas + + +def convert_from_nx(nxG, weight=None): + if type(nxG) == nx.classes.graph.Graph: + G = cugraph.Graph() + elif type(nxG) == nx.classes.digraph.DiGraph: + G = cugraph.DiGraph() + else: + raise ValueError("nxG does not appear to be a NetworkX graph type") + + pdf = nx.to_pandas_edgelist(nxG) + num_col = len(pdf.columns) + + if num_col < 2: + raise ValueError("NetworkX graph did not contain edges") + + if weight is None: + num_col == 2 + pdf = pdf[["source", "target"]] + + if num_col >= 3 and weight is not None: + pdf = pdf[["source", "target", weight]] + num_col = 3 + + gdf = from_pandas(pdf) + + if num_col == 2: + G.from_cudf_edgelist(gdf, "source", "target") + else: + G.from_cudf_edgelist(gdf, "source", "target", weight) + + del gdf + del pdf + + return G + + +def is_networkx_graph(G): + return isinstance(G, nx.classes.graph.Graph) + + +def check_nx_graph(G, weight=None): + """ + This is a convenience function that will ensure the proper graph type + + Parameters + ---------- + G : cudf.Graph or networkx.Graph + weight : str or None + which column to use for weight. Default is None + + Returns + ------- + G : cudf.Graph + returns a cugraph.Graph that is either the orginal input or + a conversion from NetworkX + + is_nx : Boolean + indicates rather or not the Graph was converted + """ + + if isinstance(G, nx.classes.graph.Graph): + return convert_from_nx(G, weight), True + else: + return G, False + + +def df_score_to_dictionary(df, k, v="vertex"): + """ + Convert a dataframe to a dictionary + + Parameters + ---------- + df : cudf.DataFrame + GPU data frame containing two cudf.Series of size V: the vertex + identifiers and the corresponding score values. + Please note that the resulting the 'vertex' column might not be + in ascending order. + + df['vertex'] : cudf.Series + Contains the vertex identifiers + df[..] 
: cudf.Series + Contains the scores of the vertices + + k : str + score column name + v : str + the vertex column name. Default is "vertex" + + + Returns + ------- + dict : Dictionary of vertices and score + + """ + df = df.sort_values(by=v) + return df.to_pandas().set_index(v).to_dict()[k] + + +def df_edge_score_to_dictionary(df, k, src="src", dst="dst"): + """ + Convert a dataframe to a dictionary + + Parameters + ---------- + df : cudf.DataFrame + GPU data frame containing two cudf.Series of size V: the vertex + identifiers and the corresponding score values. + Please note that the resulting the 'vertex' column might not be + in ascending order. + + df['vertex'] : cudf.Series + Contains the vertex identifiers + df[X] : cudf.Series + Contains the scores of the vertices + + k : str + score column name + + src : str + source column name + dst : str + destination column name + + + Returns + ------- + dict : Dictionary of vertices and score + + """ + pdf = df.sort_values(by=[src, dst]).to_pandas() + d = {} + for i in range(len(pdf)): + d[(pdf[src][i], pdf[dst][i])] = pdf[k][i] + + return d + + +def cugraph_to_nx(G): + pdf = G.view_edge_list().to_pandas() + num_col = len(pdf.columns) + + if num_col == 2: + Gnx = nx.from_pandas_edgelist(pdf, source="src", target="dst") + else: + Gnx = nx.from_pandas_edgelist(pdf, source="src", target="dst", + edge_attr="weights") + + return Gnx