diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d35c5c02218..ebffd18ca5a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,3 +1,17 @@
+# https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners
+# Order matters - match of highest importance goes last (last match wins)
+
+#doc code owners
+datasets/          @rapidsai/cugraph-doc-codeowners
+notebooks/         @rapidsai/cugraph-doc-codeowners
+docs/              @rapidsai/cugraph-doc-codeowners
+**/*.txt           @rapidsai/cugraph-doc-codeowners
+**/*.md            @rapidsai/cugraph-doc-codeowners
+**/*.rst           @rapidsai/cugraph-doc-codeowners
+**/*.ipynb         @rapidsai/cugraph-doc-codeowners
+**/*.pdf           @rapidsai/cugraph-doc-codeowners
+**/*.png           @rapidsai/cugraph-doc-codeowners
+
 #cpp code owners
 cpp/               @rapidsai/cugraph-cpp-codeowners
 
@@ -9,7 +23,7 @@ python/            @rapidsai/cugraph-python-codeowners
 **/cmake/          @rapidsai/cugraph-cmake-codeowners
 
 #build/ops code owners
-.github/           @rapidsai/ops-codeowners 
+.github/           @rapidsai/ops-codeowners
 ci/                @rapidsai/ops-codeowners
 conda/             @rapidsai/ops-codeowners
 **/Dockerfile      @rapidsai/ops-codeowners
diff --git a/.github/labeler.yml b/.github/labeler.yml
index 621d0fde833..9c3af6de64b 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -5,26 +5,33 @@
 python:
   - 'python/**'
   - 'notebooks/**'
+
+benchmarks:
   - 'benchmarks/**'
   
 doc:
   - 'docs/**'
   - '**/*.md'
+  - 'datasets/**'
+  - 'notebooks/**'
+  - '**/*.txt'
+  - '**/*.rst'
+  - '**/*.ipynb'
+  - '**/*.pdf'
+  - '**/*.png'
   
 datasets:
   - 'datasets/**'
   
 cuGraph:
   - 'cpp/**'
-
+  
 CMake:
   - '**/CMakeLists.txt'
   - '**/cmake/**'
-  
-Ops:
-  - '.github/**'
-  - 'ci/**'
+
+gpuCI:
+   - 'ci/**'
+
+conda:
   - 'conda/**'
-  - '**/Dockerfile'
-  - '**/.dockerignore'
-  - 'docker/**'
diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml
new file mode 100644
index 00000000000..8b65da69aa2
--- /dev/null
+++ b/.github/workflows/stale.yaml
@@ -0,0 +1,57 @@
+name: Mark inactive issues and pull requests
+
+on:
+  schedule:
+    - cron: "0 * * * *"
+
+jobs:
+  mark-inactive-30d:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark 30 day inactive issues and pull requests
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-issue-message: >
+            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            Please close this issue if no further response or action is needed.
+            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
+            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-issue-label: "inactive-30d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-issue-stale: 30
+          days-before-issue-close: -1
+          stale-pr-message: >
+            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
+            Please close this PR if it is no longer required.
+            Otherwise, please respond with a comment indicating any updates.
+            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
+          stale-pr-label: "inactive-30d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-pr-stale: 30
+          days-before-pr-close: -1
+          operations-per-run: 50
+  mark-inactive-90d:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Mark 90 day inactive issues and pull requests
+        uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-issue-message: >
+            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            Please close this issue if no further response or action is needed.
+            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
+          stale-issue-label: "inactive-90d"
+          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-issue-stale: 90
+          days-before-issue-close: -1
+          stale-pr-message: >
+            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
+            Please close this PR if it is no longer required.
+            Otherwise, please respond with a comment indicating any updates.
+          stale-pr-label: "inactive-90d"
+          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
+          days-before-pr-stale: 90
+          days-before-pr-close: -1
+          operations-per-run: 50
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b22d92c902f..2b8247f02d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,14 @@
-
-# cuGraph 0.18.0 (Date TBD)
+# cuGraph 0.19.0 (Date TBD)
 
 ## New Features
 
 ## Improvements
 
 ## Bug Fixes
-- PR #1321 Fix benchmark script trap setup to come after the PATH variable update
+
+# 0.18.0
+
+Please see https://github.com/rapidsai/cugraph/releases/tag/branch-0.18-latest for the latest changes to this development branch.
 
 # cuGraph 0.17.0 (10 Dec 2020)
 ## New Features
@@ -19,6 +21,7 @@
 - PR #1279 Add self loop check variable in graph
 - PR #1277 SciPy sparse matrix input support for WCC, SCC, SSSP, and BFS
 - PR #1278 Add support for shortest_path_length and fix graph vertex checks
+- PR #1280 Add Multi(Di)Graph support
 
 ## Improvements
 - PR #1227 Pin cmake policies to cmake 3.17 version
diff --git a/README.md b/README.md
index 8fee5451ac3..62059e9c7b6 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerat
 
 **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cugraph/blob/main/README.md) ensure you are on the latest branch.
 
-
+As an example, the following Python snippet loads graph data and computes PageRank:
 
 ```python
 import cugraph
@@ -30,6 +30,17 @@ for i in range(len(df_page)):
 		" PageRank is " + str(df_page['pagerank'].iloc[i]))
 ```
 
+## Getting cuGraph
+There are 3 ways to get cuGraph :
+1. [Quick start with Docker Repo](#quick)
+2. [Conda Installation](#conda)
+3. [Build from Source](#source)
+<br/><br/>
+
+---
+# Currently Supported Features
+As of Release 0.18 - including 0.18 nightly
+
 
 ## Supported Algorithms
 
@@ -40,6 +51,7 @@ for i in range(len(df_page)):
 |              | Betweenness Centrality                 | Single-GPU   |                     |
 |              | Edge Betweenness Centrality            | Single-GPU   |                     |
 | Community    |                                        |              |                     |
+|              | EgoNet                                 | Single-GPU   |                     |
 |              | Leiden                                 | Single-GPU   |                     |
 |              | Louvain                                | Multi-GPU    |                     |
 |              | Ensemble Clustering for Graphs         | Single-GPU   |                     |
@@ -56,17 +68,20 @@ for i in range(len(df_page)):
 |              | Core Number                            | Single-GPU   |                     |
 | Layout       |                                        |              |                     |
 |              | Force Atlas 2                          | Single-GPU   |                     |
+| Linear Assignment|                                    |              |                     |
+|              | Hungarian                              | Single-GPU   | [README](cpp/src/linear_assignment/README-hungarian.md) |
 | Link Analysis|                                        |              |                     |
 |              | Pagerank                               | Multi-GPU    |                     |
 |              | Personal Pagerank                      | Multi-GPU    |                     |
-|              | HITS                      				| Single-GPU   | leverages Gunrock   |
+|              | HITS                                   | Single-GPU   | leverages Gunrock   |
 | Link Prediction |                                     |              |                     |
 |              | Jaccard Similarity                     | Single-GPU   |                     |
 |              | Weighted Jaccard Similarity            | Single-GPU   |                     |
 |              | Overlap Similarity                     | Single-GPU   |                     |
 | Traversal    |                                        |              |                     |
-|              | Breadth First Search (BFS)             | Multi-GPU    |                     |
+|              | Breadth First Search (BFS)             | Multi-GPU    | with cutoff support |
 |              | Single Source Shortest Path (SSSP)     | Multi-GPU    |                     |
+|              | Traveling Salesperson Problem (TSP)    | Single-GPU   |                     |
 | Structure    |                                        |              |                     |
 |              | Renumbering                            | Single-GPU   | multiple columns, any data type  |
 |              | Symmetrize                             | Multi-GPU    |                     |
@@ -74,7 +89,6 @@ for i in range(len(df_page)):
 |              | Hungarian Algorithm                    | Single-GPU   |                     |
 |              | Minimum Spanning Tree                  | Single-GPU   |                     |
 |              | Maximum Spanning Tree                  | Single-GPU   |                     |
-
 |  |  |
 
 </br></br>
@@ -83,13 +97,13 @@ for i in range(len(df_page)):
 | --------------- | --------------------------------------------------- |
 | Graph           | An undirected Graph                                 |
 | DiGraph         | A Directed Graph                                    |
-| _Multigraph_      | _coming in 0.18_                                      |
-| _MultiDigraph_    | _coming in 0.18_                                      |
+| Multigraph      | A Graph with multiple edges between a vertex pair   |
+| MultiDigraph    | A Directed Graph with multiple edges between a vertex pair   |
 |  |  |
 
 </br></br>
 ## Supported Data Types
-cuGraph supports the creation of a graph several data types:
+cuGraph supports graph creation with Source and Destination being expressed as:
 * cuDF DataFrame
 * Pandas DataFrame
 
@@ -123,22 +137,14 @@ The amount of memory required is dependent on the graph structure and the analyt
 
 The use of managed memory for oversubscription can also be used to exceed the above memory limitations.  See the recent blog on _Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs_:  https://medium.com/rapids-ai/tackling-large-graphs-with-rapids-cugraph-and-unified-virtual-memory-b5b69a065d4
 
+</br></br>
 
-## Getting cuGraph
-### Intro
-There are 3 ways to get cuGraph :
-1. [Quick start with Docker Demo Repo](#quick)
-2. [Conda Installation](#conda)
-3. [Build from Source](#source)
-
-
-
-
+---
 ## Quick Start <a name="quick"></a>
-Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph.
+Please see the [Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize all of the RAPIDS libraries: cuDF, cuML, and cuGraph.
 
 
-### Conda <a name="conda"></a>
+## Conda <a name="conda"></a>
 It is easy to install cuGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download).
 
 Install and update cuGraph using the conda command:
@@ -158,7 +164,7 @@ conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cugraph
 Note: This conda installation only applies to Linux and Python versions 3.7/3.8.
 
 
-### Build from Source and Contributing <a name="source"></a>
+## Build from Source and Contributing <a name="source"></a>
 
 Please see our [guide for building cuGraph from source](SOURCEBUILD.md)</pr>
 
diff --git a/SOURCEBUILD.md b/SOURCEBUILD.md
index 8acd90c4f7f..0cbf6ccdaa3 100644
--- a/SOURCEBUILD.md
+++ b/SOURCEBUILD.md
@@ -1,6 +1,6 @@
 # Building from Source
 
-The following instructions are for users wishing to build cuGraph from source code.  These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments.  Other operating systems _might be_ compatible, but are not currently tested. 
+The following instructions are for users wishing to build cuGraph from source code.  These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments.  Other operating systems _might be_ compatible, but are not currently tested.
 
 The cuGraph package include both a C/C++ CUDA portion and a python portion.  Both libraries need to be installed in order for cuGraph to operate correctly.
 
@@ -9,7 +9,7 @@ The cuGraph package include both a C/C++ CUDA portion and a python portion.  Bot
 __Compiler__:
 * `gcc`         version 5.4+
 * `nvcc`        version 10.0+
-* `cmake`       version 3.12+
+* `cmake`       version 3.18+
 
 __CUDA:__
 * CUDA 10.1+
@@ -97,17 +97,21 @@ There are several other options available on the build script for advanced users
 `build.sh` options:
 ```bash
 build.sh [<target> ...] [<flag> ...]
-   clean            - remove all existing build artifacts and configuration (start over)
-   libcugraph       - build the cugraph C++ code
-   cugraph          - build the cugraph Python package
-
+ where <target> is:
+    clean            - remove all existing build artifacts and configuration (start over)
+    libcugraph       - build the cugraph C++ code
+    cugraph          - build the cugraph Python package
+    docs             - build the docs
  and <flag> is:
    -v               - verbose build mode
    -g               - build for debug
    -n               - no install step
+   --allgpuarch     - build for all supported GPU architectures
    --show_depr_warn - show cmake deprecation warnings
    -h               - print this text
 
+ default action (no args) is to build and install 'libcugraph' then 'cugraph' then 'docs' targets
+
 examples:
 $ ./build.sh clean                        # remove prior build artifacts (start over)
 $ ./build.sh libcugraph -v                # compile and install libcugraph with verbose output
@@ -189,7 +193,7 @@ Run either the C++ or the Python tests with datasets
 
    ```bash
    cd $CUGRAPH_HOME/datasets
-   source get_test_data.sh #This takes about 10 minutes and download 1GB data (>5 GB uncompressed)
+   source get_test_data.sh #This takes about 10 minutes and downloads 1GB data (>5 GB uncompressed)
    ```
 
    Run the C++ tests on large input:
diff --git a/benchmarks/bench_algos.py b/benchmarks/bench_algos.py
index 9be636ca480..f9f8bf9cf53 100644
--- a/benchmarks/bench_algos.py
+++ b/benchmarks/bench_algos.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -32,6 +32,7 @@ def setFixtureParamNames(*args, **kwargs):
 import cugraph
 from cugraph.structure.number_map import NumberMap
 from cugraph.tests import utils
+from cugraph.utilities.utils import is_device_version_less_than
 import rmm
 
 from .params import FIXTURE_PARAMS
@@ -212,6 +213,8 @@ def bench_jaccard(gpubenchmark, graphWithAdjListComputed):
     gpubenchmark(cugraph.jaccard, graphWithAdjListComputed)
 
 
+@pytest.mark.skipif(
+    is_device_version_less_than((7, 0)), reason="Not supported on Pascal")
 def bench_louvain(gpubenchmark, graphWithAdjListComputed):
     gpubenchmark(cugraph.louvain, graphWithAdjListComputed)
 
diff --git a/build.sh b/build.sh
index b3d3463ed4e..54634e2ca6e 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 
 # cugraph build script
 
@@ -19,18 +19,20 @@ ARGS=$*
 REPODIR=$(cd $(dirname $0); pwd)
 LIBCUGRAPH_BUILD_DIR=${LIBCUGRAPH_BUILD_DIR:=${REPODIR}/cpp/build}
 
-VALIDARGS="clean libcugraph cugraph docs -v -g -n --allgpuarch --show_depr_warn -h --help"
+VALIDARGS="clean libcugraph cugraph docs -v -g -n --allgpuarch --buildfaiss --show_depr_warn -h --help"
 HELP="$0 [<target> ...] [<flag> ...]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
    libcugraph       - build the cugraph C++ code
    cugraph          - build the cugraph Python package
+   cpp-mgtests      - build libcugraph mnmg tests. Builds MPI communicator, adding MPI as a dependency.
    docs             - build the docs
  and <flag> is:
    -v               - verbose build mode
    -g               - build for debug
    -n               - no install step
    --allgpuarch     - build for all supported GPU architectures
+   --buildfaiss     - build faiss statically into cugraph
    --show_depr_warn - show cmake deprecation warnings
    -h               - print this text
 
@@ -44,10 +46,12 @@ CUGRAPH_BUILD_DIR=${REPODIR}/python/build
 BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR} ${CUGRAPH_BUILD_DIR}"
 
 # Set defaults for vars modified by flags to this script
-VERBOSE=""
+VERBOSE_FLAG=""
 BUILD_TYPE=Release
 INSTALL_TARGET=install
 BUILD_DISABLE_DEPRECATION_WARNING=ON
+BUILD_CPP_MG_TESTS=OFF
+BUILD_STATIC_FAISS=OFF
 GPU_ARCH=""
 
 # Set defaults for vars that may not have been defined externally
@@ -82,7 +86,7 @@ fi
 
 # Process flags
 if hasArg -v; then
-    VERBOSE=1
+    VERBOSE_FLAG="-v"
 fi
 if hasArg -g; then
     BUILD_TYPE=Debug
@@ -93,9 +97,15 @@ fi
 if hasArg --allgpuarch; then
     GPU_ARCH="-DGPU_ARCHS=ALL"
 fi
+if hasArg --buildfaiss; then
+        BUILD_STATIC_FAISS=ON
+fi
 if hasArg --show_depr_warn; then
     BUILD_DISABLE_DEPRECATION_WARNING=OFF
 fi
+if hasArg cpp-mgtests; then
+    BUILD_CPP_MG_TESTS=ON
+fi
 
 # If clean given, run it prior to any other steps
 if hasArg clean; then
@@ -127,10 +137,13 @@ if buildAll || hasArg libcugraph; then
     mkdir -p ${LIBCUGRAPH_BUILD_DIR}
     cd ${LIBCUGRAPH_BUILD_DIR}
     cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
-        ${GPU_ARCH} \
-        -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
-        -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp
-    make -j${PARALLEL_LEVEL} VERBOSE=${VERBOSE} ${INSTALL_TARGET}
+          ${GPU_ARCH} \
+          -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
+          -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+          -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS} \
+          -DBUILD_CUGRAPH_MG_TESTS=${BUILD_CPP_MG_TESTS} \
+          ${REPODIR}/cpp
+    cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} --target ${INSTALL_TARGET} ${VERBOSE_FLAG}
 fi
 
 # Build and install the cugraph Python package
@@ -152,10 +165,11 @@ if buildAll || hasArg docs; then
         cd ${LIBCUGRAPH_BUILD_DIR}
         cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \
             -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \
-            -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp
+            -DCMAKE_BUILD_TYPE=${BUILD_TYPE} ${REPODIR}/cpp \
+            -DBUILD_STATIC_FAISS=${BUILD_STATIC_FAISS}
     fi
     cd ${LIBCUGRAPH_BUILD_DIR}
-    make -j${PARALLEL_LEVEL} VERBOSE=${VERBOSE} docs_cugraph
+    cmake --build "${LIBCUGRAPH_BUILD_DIR}" -j${PARALLEL_LEVEL} --target docs_cugraph ${VERBOSE_FLAG}
     cd ${REPODIR}/docs
     make html
 fi
diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh
index 2c6dc899be2..d69448cda4e 100755
--- a/ci/cpu/build.sh
+++ b/ci/cpu/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 #########################################
 # cuGraph CPU conda build script for CI #
 #########################################
@@ -24,6 +24,9 @@ fi
 export GPUCI_CONDA_RETRY_MAX=1
 export GPUCI_CONDA_RETRY_SLEEP=30
 
+# Use Ninja to build
+export CMAKE_GENERATOR="Ninja"
+
 ################################################################################
 # SETUP - Check environment
 ################################################################################
diff --git a/ci/getGTestTimes.sh b/ci/getGTestTimes.sh
deleted file mode 100755
index 8a3752d76e2..00000000000
--- a/ci/getGTestTimes.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script will print the gtest results sorted by runtime. This will print
-# the results two ways: first by printing all tests sorted by runtime, then by
-# printing all tests grouped by test binary with tests sorted by runtime within
-# the group.
-#
-# To use this script, capture the test run output to a file then run this script
-# with the file as the first arg, or just redirect test output to this script.
-
-awk '/^Running GoogleTest .+$/ {
-       testbinary = $3
-     }
-     /^\[       OK \].+$/ {
-        testtime = substr($(NF-1),2)
-        newtestdata = testbinary ":" substr($0,14)
-        alltestdata = alltestdata newtestdata "\n"
-        testdata[testbinary] = testdata[testbinary] newtestdata "\n"
-        totaltime = totaltime + testtime
-     }
-     END {
-        # Print all tests sorted by time
-        system("echo \"" alltestdata "\" | sort -r -t\\( -nk2")
-        print "\n================================================================================"
-        # Print test binaries with tests sorted by time
-        print "Tests grouped by test binary:"
-        for (testbinary in testdata) {
-           print testbinary
-           system("echo \"" testdata[testbinary] "\" | sort -r -t\\( -nk2")
-        }
-        print "\n================================================================================"
-        print totaltime " milliseconds = " totaltime/60000 " minutes"
-     }
-' $1
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 019d03e21da..0fef7b62f8d 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 ##########################################
 # cuGraph GPU build & testscript for CI  #
 ##########################################
-set -e
-set -o pipefail
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
 NUMARGS=$#
 ARGS=$*
 
@@ -98,10 +98,15 @@ fi
 # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph
 ################################################################################
 
-set +e -Eo pipefail
-EXITCODE=0
+# Switch to +e to allow failing commands to continue the script, which is needed
+# so all testing commands run regardless of pass/fail. This requires the desired
+# exit code to be managed using the ERR trap.
+set +e           # allow script to continue on error
+set -E           # ERR traps are inherited by subcommands
 trap "EXITCODE=1" ERR
 
+EXITCODE=0
+
 if hasArg --skip-tests; then
     gpuci_logger "Skipping Tests"
 else
@@ -117,18 +122,19 @@ else
         TEST_MODE_FLAG=""
     fi
 
+    gpuci_logger "Running cuGraph test.sh..."
     ${WORKSPACE}/ci/test.sh ${TEST_MODE_FLAG} | tee testoutput.txt
+    gpuci_logger "Ran cuGraph test.sh : return code was: $?, gpu/build.sh exit code is now: $EXITCODE"
 
-    echo -e "\nTOP 20 SLOWEST TESTS:\n"
-    # Wrap in echo to prevent non-zero exit since this command is non-essential
-    echo "$(${WORKSPACE}/ci/getGTestTimes.sh testoutput.txt | head -20)"
-
+    gpuci_logger "Running cuGraph notebook test script..."
     ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log
+    gpuci_logger "Ran cuGraph notebook test script : return code was: $?, gpu/build.sh exit code is now: $EXITCODE"
     python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log
 fi
 
-if [ -n "\${CODECOV_TOKEN}" ]; then
-    codecov -t \$CODECOV_TOKEN
+if [ -n "${CODECOV_TOKEN}" ]; then
+    codecov -t $CODECOV_TOKEN
 fi
 
+gpuci_logger "gpu/build.sh returning value: $EXITCODE"
 return ${EXITCODE}
diff --git a/ci/gpu/notebook_list.py b/ci/gpu/notebook_list.py
new file mode 100644
index 00000000000..bb54913ac8d
--- /dev/null
+++ b/ci/gpu/notebook_list.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import sys
+import glob
+
+from numba import cuda
+
+#
+# Not strictly true... however what we mean is
+# Pascal or earlier
+#
+pascal = False
+
+device = cuda.get_current_device()
+cc = getattr(device, 'COMPUTE_CAPABILITY')
+if (cc[0] < 7):
+    pascal = True
+
+for filename in glob.iglob('**/*.ipynb', recursive=True):
+    skip = False
+    for line in open(filename, 'r'):
+        if re.search('# Skip notebook test', line):
+            skip = True
+            print(f'SKIPPING {filename} (marked as skip)', file=sys.stderr)
+            break;
+        elif re.search('dask', line):
+            print(f'SKIPPING {filename} (suspected Dask usage, not currently automatable)', file=sys.stderr)
+            skip = True
+            break;
+        elif pascal and re.search('# Does not run on Pascal', line):
+            print(f'SKIPPING {filename} (does not run on Pascal)', file=sys.stderr)
+            skip = True
+            break;
+
+    if not skip:
+        print(filename)
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh
index 389d3be0bfd..650132f116d 100755
--- a/ci/gpu/test-notebooks.sh
+++ b/ci/gpu/test-notebooks.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,23 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#RAPIDS_DIR=/rapids
+# Any failing command will set EXITCODE to non-zero
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by subcommands
+trap "EXITCODE=1" ERR
+
 NOTEBOOKS_DIR=${WORKSPACE}/notebooks
 NBTEST=${WORKSPACE}/ci/utils/nbtest.sh
 LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache
+EXITCODE=0
 
 cd ${NOTEBOOKS_DIR}
 TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u)
 
-# Add notebooks that should be skipped here
-# (space-separated list of filenames without paths)
-
-SKIPNBS="uvm.ipynb bfs_benchmark.ipynb louvain_benchmark.ipynb pagerank_benchmark.ipynb sssp_benchmark.ipynb release.ipynb nx_cugraph_bc_benchmarking.ipynb"
-
 ## Check env
 env
 
-EXITCODE=0
+# Do not abort the script on error. This allows all tests to run regardless of
+# pass/fail but relies on the ERR trap above to manage the EXITCODE for the
+# script.
+set +e
 
 # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure
 # if any run fails
@@ -37,29 +41,20 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do
     echo "FOLDER: ${folder}"
     echo "========================================"
     cd ${NOTEBOOKS_DIR}/${folder}
-    for nb in $(find . -name "*.ipynb"); do
+    NBLIST=$(python ${WORKSPACE}/ci/gpu/notebook_list.py)
+    for nb in ${NBLIST}; do
         nbBasename=$(basename ${nb})
-        # Skip all NBs that use dask (in the code or even in their name)
-        if ((echo ${nb}|grep -qi dask) || \
-            (grep -q dask ${nb})); then
-            echo "--------------------------------------------------------------------------------"
-            echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)"
-            echo "--------------------------------------------------------------------------------"
-        elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then
-            echo "--------------------------------------------------------------------------------"
-            echo "SKIPPING: ${nb} (listed in skip list)"
-            echo "--------------------------------------------------------------------------------"
-        else
-            cd $(dirname ${nb})
-            nvidia-smi
-            ${NBTEST} ${nbBasename}
-            EXITCODE=$((EXITCODE | $?))
-            rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
-            cd ${NOTEBOOKS_DIR}/${folder}
-        fi
+        cd $(dirname ${nb})
+        nvidia-smi
+        ${NBTEST} ${nbBasename}
+        echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE"
+        echo
+        rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
+        cd ${NOTEBOOKS_DIR}/${folder}
     done
 done
 
 nvidia-smi
 
+echo "Notebook test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test.sh b/ci/test.sh
index db9390461c0..b0134e97246 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# note: do not use set -e in order to allow all gtest invocations to take place,
-# and instead keep track of exit status and exit with an overall exit status
-set -o pipefail
+# Any failing command will set EXITCODE to non-zero
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by subcommands
+trap "EXITCODE=1" ERR
 
 NUMARGS=$#
 ARGS=$*
@@ -22,7 +24,7 @@ THISDIR=$(cd $(dirname $0);pwd)
 CUGRAPH_ROOT=$(cd ${THISDIR}/..;pwd)
 GTEST_ARGS="--gtest_output=xml:${CUGRAPH_ROOT}/test-results/"
 DOWNLOAD_MODE=""
-ERRORCODE=0
+EXITCODE=0
 
 export RAPIDS_DATASET_ROOT_DIR=${CUGRAPH_ROOT}/datasets
 
@@ -50,47 +52,61 @@ else
     echo "Download datasets..."
     cd ${RAPIDS_DATASET_ROOT_DIR}
     bash ./get_test_data.sh ${DOWNLOAD_MODE}
-    ERRORCODE=$((ERRORCODE | $?))
-    # no need to run tests if dataset download fails
-    if (( ${ERRORCODE} != 0 )); then
-        exit ${ERRORCODE}
-    fi
 fi
 
 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
     cd ${CUGRAPH_ROOT}/cpp/build
 else
-    export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$LD_LIBRARY_PATH"
+    export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
     cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build
 fi
 
-for gt in gtests/*; do
-    test_name=$(basename $gt)
-    echo "Running GoogleTest $test_name"
-    ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
-    ERRORCODE=$((ERRORCODE | $?))
-done
-
+# FIXME: if possible, any install and build steps should be moved outside this
+# script since a failing install/build step is treated as a failing test command
+# and will not stop the script. This script is also only expected to run tests
+# in a preconfigured environment, and install/build steps are unexpected side
+# effects.
 if [[ "$PROJECT_FLASH" == "1" ]]; then
+    export LIBCUGRAPH_BUILD_DIR="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build"
+
+    # Faiss patch
+    echo "Update libcugraph.so"
+    cd $LIBCUGRAPH_BUILD_DIR
+    chrpath -d libcugraph.so
+    patchelf --replace-needed `patchelf --print-needed libcugraph.so | grep faiss` libfaiss.so libcugraph.so
+
     CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"`
     CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
     CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install
     echo "Installing $CONDA_FILE"
     conda install -c $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ "$CONDA_FILE"
 
-    export LIBCUGRAPH_BUILD_DIR="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build"
     echo "Build cugraph..."
     $WORKSPACE/build.sh cugraph
 fi
 
+# Do not abort the script on error from this point on. This allows all tests to
+# run regardless of pass/fail, but relies on the ERR trap above to manage the
+# EXITCODE for the script.
+set +e
+
+echo "C++ gtests for cuGraph..."
+for gt in tests/*_TEST; do
+    test_name=$(basename $gt)
+    echo "Running gtest $test_name"
+    ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
+    echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE"
+done
+
 echo "Python pytest for cuGraph..."
 cd ${CUGRAPH_ROOT}/python
 pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=cugraph/raft --benchmark-disable
-ERRORCODE=$((ERRORCODE | $?))
+echo "Ran Python pytest for cugraph : return code was: $?, test script exit code is now: $EXITCODE"
 
 echo "Python benchmarks for cuGraph (running as tests)..."
 cd ${CUGRAPH_ROOT}/benchmarks
 pytest -v -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable
-ERRORCODE=$((ERRORCODE | $?))
+echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE"
 
-exit ${ERRORCODE}
+echo "Test script exiting with value: $EXITCODE"
+exit ${EXITCODE}
diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh
index 8c86baeaa09..ae8b52df106 100755
--- a/ci/utils/nbtest.sh
+++ b/ci/utils/nbtest.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Any failing command will set EXITCODE to non-zero
+set +e           # do not abort the script on error
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by subcommands
+trap "EXITCODE=1" ERR
+
+# Prepend the following code to all scripts generated from nbconvert.  This
+# allows all cell and line magic code to run and update the namespace as if
+# running in jupyter, but will also tolerate failures due to running in a
+# non-jupyter env.
+# Note: depending on the assumptions of the notebook script, ignoring failures
+# may not be acceptable (meaning the converted notebook simply cannot run
+# outside of jupyter as-is), hence the warning.
 MAGIC_OVERRIDE_CODE="
 def my_run_line_magic(*args, **kwargs):
     g=globals()
@@ -58,7 +71,6 @@ for nb in $*; do
     NBEXITCODE=$?
     echo EXIT CODE: ${NBEXITCODE}
     echo
-    EXITCODE=$((EXITCODE | ${NBEXITCODE}))
 done
 
 exit ${EXITCODE}
diff --git a/conda/environments/builddocs.yml b/conda/environments/builddocs.yml
deleted file mode 100644
index 89bd44a5542..00000000000
--- a/conda/environments/builddocs.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-name: builddocs
-channels:
-- rapidsai
-- pytorch
-- conda-forge
-- numba
-- defaults
-dependencies:
-- python=3.6*
-- cugraph=0.8*
-- cudatoolkit=9.2
-- cudf=0.8*
-- pyarrow=0.12.1.*
-- cython=0.29*
-- pip:
-  - numpydoc
-  - sphinx
-  - sphinx-rtd-theme
-  - sphinxcontrib-websupport
diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml
index 067fd0bc4ba..255366b0a82 100644
--- a/conda/environments/cugraph_dev_cuda10.1.yml
+++ b/conda/environments/cugraph_dev_cuda10.1.yml
@@ -5,17 +5,17 @@ channels:
 - rapidsai-nightly
 - conda-forge
 dependencies:
-- cudf=0.18.*
-- libcudf=0.18.*
-- rmm=0.18.*
-- cuxfilter=0.18.*
-- librmm=0.18.*
+- cudf=0.19.*
+- libcudf=0.19.*
+- rmm=0.19.*
+- cuxfilter=0.19.*
+- librmm=0.19.*
 - dask>=2.12.0
 - distributed>=2.12.0
-- dask-cuda=0.18*
-- dask-cudf=0.18*
+- dask-cuda=0.19*
+- dask-cudf=0.19*
 - nccl>=2.7
-- ucx-py=0.18*
+- ucx-py=0.19*
 - ucx-proc=*=gpu
 - scipy
 - networkx
@@ -23,12 +23,14 @@ dependencies:
 - cudatoolkit=10.1
 - clang=8.0.1
 - clang-tools=8.0.1
-- cmake>=3.12
+- cmake>=3.18
 - python>=3.6,<3.9
 - notebook>=0.5.0
 - boost
 - cython>=0.29,<0.30
 - pytest
+- libfaiss=1.6.3
+- faiss-proc=*=cuda
 - scikit-learn>=0.23.1
 - colorcet
 - holoviews
@@ -44,3 +46,6 @@ dependencies:
 - libcypher-parser
 - rapids-pytest-benchmark
 - doxygen
+- pytest-cov
+- gtest
+- gmock
diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml
index 3371340d8bd..e64d7c77b7d 100644
--- a/conda/environments/cugraph_dev_cuda10.2.yml
+++ b/conda/environments/cugraph_dev_cuda10.2.yml
@@ -5,17 +5,17 @@ channels:
 - rapidsai-nightly
 - conda-forge
 dependencies:
-- cudf=0.18.*
-- libcudf=0.18.*
-- rmm=0.18.*
-- cuxfilter=0.18.*
-- librmm=0.18.*
+- cudf=0.19.*
+- libcudf=0.19.*
+- rmm=0.19.*
+- cuxfilter=0.19.*
+- librmm=0.19.*
 - dask>=2.12.0
 - distributed>=2.12.0
-- dask-cuda=0.18*
-- dask-cudf=0.18*
+- dask-cuda=0.19*
+- dask-cudf=0.19*
 - nccl>=2.7
-- ucx-py=0.18*
+- ucx-py=0.19*
 - ucx-proc=*=gpu
 - scipy
 - networkx
@@ -23,12 +23,14 @@ dependencies:
 - cudatoolkit=10.2
 - clang=8.0.1
 - clang-tools=8.0.1
-- cmake>=3.12
+- cmake>=3.18
 - python>=3.6,<3.9
 - notebook>=0.5.0
 - boost
 - cython>=0.29,<0.30
 - pytest
+- libfaiss=1.6.3
+- faiss-proc=*=cuda
 - scikit-learn>=0.23.1
 - colorcet
 - holoviews
@@ -44,3 +46,6 @@ dependencies:
 - libcypher-parser
 - rapids-pytest-benchmark
 - doxygen
+- pytest-cov
+- gtest
+- gmock
diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml
index ee3b57632a1..1f05e4762ef 100644
--- a/conda/environments/cugraph_dev_cuda11.0.yml
+++ b/conda/environments/cugraph_dev_cuda11.0.yml
@@ -5,17 +5,17 @@ channels:
 - rapidsai-nightly
 - conda-forge
 dependencies:
-- cudf=0.18.*
-- libcudf=0.18.*
-- rmm=0.18.*
-- cuxfilter=0.18.*
-- librmm=0.18.*
+- cudf=0.19.*
+- libcudf=0.19.*
+- rmm=0.19.*
+- cuxfilter=0.19.*
+- librmm=0.19.*
 - dask>=2.12.0
 - distributed>=2.12.0
-- dask-cuda=0.18*
-- dask-cudf=0.18*
+- dask-cuda=0.19*
+- dask-cudf=0.19*
 - nccl>=2.7
-- ucx-py=0.18*
+- ucx-py=0.19*
 - ucx-proc=*=gpu
 - scipy
 - networkx
@@ -23,12 +23,14 @@ dependencies:
 - cudatoolkit=11.0
 - clang=8.0.1
 - clang-tools=8.0.1
-- cmake>=3.12
+- cmake>=3.18
 - python>=3.6,<3.9
 - notebook>=0.5.0
 - boost
 - cython>=0.29,<0.30
 - pytest
+- libfaiss=1.6.3
+- faiss-proc=*=cuda
 - scikit-learn>=0.23.1
 - colorcet
 - holoviews
@@ -44,3 +46,6 @@ dependencies:
 - libcypher-parser
 - rapids-pytest-benchmark
 - doxygen
+- pytest-cov
+- gtest
+- gmock
diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml
index 211ec920d27..8f7495eab3c 100644
--- a/conda/recipes/libcugraph/meta.yaml
+++ b/conda/recipes/libcugraph/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 
 # Usage:
 #   conda build -c nvidia -c rapidsai -c conda-forge -c defaults .
@@ -21,6 +21,13 @@ build:
     - CUDAHOSTCXX
     - PARALLEL_LEVEL
     - VERSION_SUFFIX
+    - CCACHE_DIR
+    - CCACHE_NOHASHDIR
+    - CCACHE_COMPILERCHECK
+    - CMAKE_GENERATOR
+    - CMAKE_C_COMPILER_LAUNCHER
+    - CMAKE_CXX_COMPILER_LAUNCHER
+    - CMAKE_CUDA_COMPILER_LAUNCHER
 
 requirements:
   build:
@@ -32,12 +39,18 @@ requirements:
     - nccl>=2.7
     - ucx-py {{ minor_version }}
     - ucx-proc=*=gpu
+    - gtest
+    - faiss-proc=*=cuda
+    - libfaiss=1.6.3
+    - gmock
   run:
     - libcudf={{ minor_version }}
     - {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
     - nccl>=2.7
     - ucx-py {{ minor_version }}
     - ucx-proc=*=gpu
+    - faiss-proc=*=cuda
+    - libfaiss=1.6.3
 
 #test:
 #  commands:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index bd122fc1fb2..b2d537edaa2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
 # limitations under the License.
 #=============================================================================
 
-cmake_minimum_required(VERSION 3.12..3.17 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18...3.18 FATAL_ERROR)
 
-project(CUGRAPH VERSION 0.18.0 LANGUAGES C CXX CUDA)
+project(CUGRAPH VERSION 0.19.0 LANGUAGES C CXX CUDA)
 
 ###################################################################################################
 # - build type ------------------------------------------------------------------------------------
@@ -33,6 +33,18 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+##############################################################################
+# - User Options  ------------------------------------------------------------
+
+option(BUILD_CUGRAPH_MG_TESTS "Build cuGraph multigpu algorithm tests" OFF)
+
+###################################################################################################
+# - user options ------------------------------------------------------------------------------
+
+set(BLAS_LIBRARIES "" CACHE STRING
+    "Location of BLAS library for FAISS build.")
+option(BUILD_STATIC_FAISS "Build the FAISS library for nearest neighbors search on GPU" OFF)
+
 ###################################################################################################
 # - compiler options ------------------------------------------------------------------------------
 
@@ -90,10 +102,12 @@ message("-- Building for GPU_ARCHS = ${GPU_ARCHS}")
 foreach(arch ${GPU_ARCHS})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}")
   set(GUNROCK_GENCODE_SM${arch} "ON")
+  set(FAISS_GPU_ARCHS "${FAISS_GPU_ARCHS} -gencode arch=compute_${arch},code=sm_${arch}")
 endforeach()
 
 list(GET GPU_ARCHS -1 ptx)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}")
+set(FAISS_GPU_ARCHS "${FAISS_GPU_ARCHS} -gencode arch=compute_${ptx},code=compute_${ptx}")
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas --disable-warnings")
@@ -152,23 +166,24 @@ if(OpenMP_FOUND)
 endif(OpenMP_FOUND)
 
 
+###################################################################################################
+# - find blas -------------------------------------------------------------------------------------
+
+if(NOT DEFINED BLAS_LIBRARIES)
+  find_package( BLAS REQUIRED )
+else()
+  message(STATUS "Manually setting BLAS to ${BLAS_LIBRARIES}")
+endif()
+
 ###################################################################################################
 # - find gtest ------------------------------------------------------------------------------------
 
 if(BUILD_TESTS)
-    include(ConfigureGoogleTest)
-
-    if(GTEST_FOUND)
-        message(STATUS
-            "Google C++ Testing Framework (Google Test) found in ${GTEST_ROOT}")
-    else()
-        message(AUTHOR_WARNING
-            "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.")
-    endif(GTEST_FOUND)
+    find_package(GTest REQUIRED)
 endif(BUILD_TESTS)
 
 ###################################################################################################
-# - RMM -------------------------------------------------------------------------------------------
+# - find RMM --------------------------------------------------------------------------------------
 
 find_path(RMM_INCLUDE "rmm"
     HINTS
@@ -178,6 +193,24 @@ find_path(RMM_INCLUDE "rmm"
 
 message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}")
 
+###################################################################################################
+# - find NCCL -------------------------------------------------------------------------------------
+
+if(NOT NCCL_PATH)
+    find_package(NCCL REQUIRED)
+else()
+    message("-- Manually set NCCL PATH to ${NCCL_PATH}")
+    set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include)
+    set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so)
+endif(NOT NCCL_PATH)
+
+###################################################################################################
+# - find MPI - only enabled if MG tests are to be built
+
+if(BUILD_CUGRAPH_MG_TESTS)
+  find_package(MPI REQUIRED)
+endif(BUILD_CUGRAPH_MG_TESTS)
+
 ###################################################################################################
 # - Fetch Content ---------------------------------------------------------------------------------
 include(FetchContent)
@@ -205,7 +238,7 @@ message("Fetching cuco")
 FetchContent_Declare(
     cuco
     GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
-    GIT_TAG        d965ed8dea8f56da8e260a6130dddf3ca351c45f
+    GIT_TAG        2196040f0562a0280292eebef5295d914f615e63
 )
 
 FetchContent_GetProperties(cuco)
@@ -235,26 +268,60 @@ endif()
 set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include")
 message("set LIBCUDACXX_INCLUDE_DIR to: ${LIBCUDACXX_INCLUDE_DIR}")
 
+# - CUHORNET
+FetchContent_Declare(
+    cuhornet
+    GIT_REPOSITORY    https://github.com/rapidsai/cuhornet.git
+    GIT_TAG           9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad
+    GIT_SHALLOW       true
+    SOURCE_SUBDIR     hornet
+)
+
+FetchContent_GetProperties(cuhornet)
+if(NOT cuhornet_POPULATED)
+  message("populating cuhornet")
+  FetchContent_Populate(cuhornet)
+  # We are not using the cuhornet CMake targets, so no need to call `add_subdirectory()`.
+endif()
+set(CUHORNET_INCLUDE_DIR ${cuhornet_SOURCE_DIR} CACHE STRING "Path to cuhornet includes")
+
+# - raft - (header only)
+# Only cloned if RAFT_PATH env variable is not defined
+if(DEFINED ENV{RAFT_PATH})
+  message(STATUS "RAFT_PATH environment variable detected.")
+  message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}")
+  set(RAFT_DIR "$ENV{RAFT_PATH}")
+
+else(DEFINED ENV{RAFT_PATH})
+  message(STATUS "RAFT_PATH environment variable NOT detected, cloning RAFT")
+
+  FetchContent_Declare(
+    raft
+    GIT_REPOSITORY    https://github.com/rapidsai/raft.git
+    GIT_TAG           4a79adcb0c0e87964dcdc9b9122f242b5235b702
+    SOURCE_SUBDIR     raft
+  )
+
+  FetchContent_GetProperties(raft)
+  if(NOT raft_POPULATED)
+     message("populating raft")
+     FetchContent_Populate(raft)
+     # We are not using any raft CMake targets, so no need to call `add_subdirectory()`.
+  endif()
+
+  set(RAFT_DIR "${raft_SOURCE_DIR}")
+endif(DEFINED ENV{RAFT_PATH})
 
 ###################################################################################################
 # - External Projects -----------------------------------------------------------------------------
 
 # https://cmake.org/cmake/help/v3.0/module/ExternalProject.html
-include(ExternalProject)
-
-# - CUHORNET
-set(CUHORNET_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuhornet CACHE STRING "Path to cuhornet repo")
-set(CUHORNET_INCLUDE_DIR ${CUHORNET_DIR}/src/cuhornet CACHE STRING "Path to cuhornet includes")
 
+# FIXME: gunrock is the only external package still using ExternalProject
+# instead of FetchContent. Consider migrating to FetchContent soon (this may
+# require updates to the gunrock cmake files to support this).
 
-ExternalProject_Add(cuhornet
-  GIT_REPOSITORY    https://github.com/rapidsai/cuhornet.git
-  GIT_TAG           9cb8e8803852bd895a9c95c0fe778ad6eeefa7ad
-  PREFIX            ${CUHORNET_DIR}
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-)
+include(ExternalProject)
 
 # - GUNROCK
 set(GUNROCK_DIR ${CMAKE_CURRENT_BINARY_DIR}/gunrock CACHE STRING "Path to gunrock repo")
@@ -262,7 +329,7 @@ set(GUNROCK_INCLUDE_DIR ${GUNROCK_DIR}/src/gunrock_ext CACHE STRING "Path to gun
 
 ExternalProject_Add(gunrock_ext
   GIT_REPOSITORY    https://github.com/gunrock/gunrock.git
-  GIT_TAG           dev
+  GIT_TAG           v1.2
   PREFIX            ${GUNROCK_DIR}
   CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
                     -DGUNROCK_BUILD_SHARED_LIBS=OFF
@@ -280,74 +347,61 @@ ExternalProject_Add(gunrock_ext
 )
 
 add_library(gunrock STATIC IMPORTED)
-
 add_dependencies(gunrock gunrock_ext)
-
 set_property(TARGET gunrock PROPERTY IMPORTED_LOCATION ${GUNROCK_DIR}/src/gunrock_ext-build/lib/libgunrock.a)
 
-# - NCCL
-if(NOT NCCL_PATH)
-    find_package(NCCL REQUIRED)
+# - FAISS
+# FIXME: The commit currently being fetched from faiss is using autotools which
+# is more convenient to build with ExternalProjectAdd.
+# Consider migrating to FetchContent once the tagged commit is changed.
+
+if(BUILD_STATIC_FAISS)
+  set(FAISS_DIR ${CMAKE_CURRENT_BINARY_DIR}/faiss CACHE STRING
+    "Path to FAISS source directory")
+  ExternalProject_Add(faiss
+    GIT_REPOSITORY    https://github.com/facebookresearch/faiss.git
+    GIT_TAG           a5b850dec6f1cd6c88ab467bfd5e87b0cac2e41d
+    CONFIGURE_COMMAND LIBS=-pthread
+                      CPPFLAGS=-w
+                      LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib
+                              ${CMAKE_CURRENT_BINARY_DIR}/faiss/src/faiss/configure
+	                      --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss
+	                      --with-blas=${BLAS_LIBRARIES}
+	                      --with-cuda=${CUDA_TOOLKIT_ROOT_DIR}
+	                      --with-cuda-arch=${FAISS_GPU_ARCHS}
+	                      -v
+    PREFIX            ${FAISS_DIR}
+    BUILD_COMMAND     make -j${PARALLEL_LEVEL} VERBOSE=1
+    BUILD_BYPRODUCTS  ${FAISS_DIR}/lib/libfaiss.a
+    BUILD_ALWAYS      1
+    INSTALL_COMMAND   make -s install > /dev/null
+    UPDATE_COMMAND    ""
+    BUILD_IN_SOURCE   1
+    PATCH_COMMAND     patch -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/faiss_cuda11.patch || true)
+
+  ExternalProject_Get_Property(faiss install_dir)
+  add_library(FAISS::FAISS STATIC IMPORTED)
+  add_dependencies(FAISS::FAISS faiss)
+  set_property(TARGET FAISS::FAISS PROPERTY
+    IMPORTED_LOCATION ${FAISS_DIR}/lib/libfaiss.a)
+  set(FAISS_INCLUDE_DIRS "${FAISS_DIR}/src")
 else()
-    message("-- Manually set NCCL PATH to ${NCCL_PATH}")
-    set(NCCL_INCLUDE_DIRS ${NCCL_PATH}/include)
-    set(NCCL_LIBRARIES ${NCCL_PATH}/lib/libnccl.so)
-endif(NOT NCCL_PATH)
-
-# - raft - (header only) -----------------------------------------------------
-
-# Only cloned if RAFT_PATH env variable is not defined
-
-if(DEFINED ENV{RAFT_PATH})
-  message(STATUS "RAFT_PATH environment variable detected.")
-  message(STATUS "RAFT_DIR set to $ENV{RAFT_PATH}")
-  set(RAFT_DIR "$ENV{RAFT_PATH}")
-
-  ExternalProject_Add(raft
-    DOWNLOAD_COMMAND  ""
-    SOURCE_DIR        ${RAFT_DIR}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   "")
-
-else(DEFINED ENV{RAFT_PATH})
-  message(STATUS "RAFT_PATH environment variable NOT detected, cloning RAFT")
-  set(RAFT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo")
-
-  ExternalProject_Add(raft
-    GIT_REPOSITORY    https://github.com/rapidsai/raft.git
-    GIT_TAG           f75d7b437bf1da3df749108161b8a0505fb6b7b3
-    PREFIX            ${RAFT_DIR}
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   "")
-
-  # Redefining RAFT_DIR so it coincides with the one inferred by env variable.
-  set(RAFT_DIR "${RAFT_DIR}/src/raft/")
-endif(DEFINED ENV{RAFT_PATH})
-
+  set(FAISS_INSTALL_DIR ENV{FAISS_ROOT})
+  find_package(FAISS REQUIRED)
+endif(BUILD_STATIC_FAISS)
 
 ###################################################################################################
 # - library targets -------------------------------------------------------------------------------
 
-# target_link_directories is added in cmake 3.13, and cmake advises to use this instead of
-# link_directoires (we should switch to target_link_directories once 3.13 becomes the minimum
-# required version).
-link_directories(
-     # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the
-     # link directories for nvcc.
-    "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
-
 add_library(cugraph SHARED
     src/utilities/spmv_1D.cu
     src/utilities/cython.cu
     src/structure/graph.cu
     src/linear_assignment/hungarian.cu
-    src/link_analysis/pagerank.cu
-    src/link_analysis/pagerank_1D.cu
     src/link_analysis/gunrock_hits.cpp
     src/traversal/bfs.cu
     src/traversal/sssp.cu
+    src/traversal/tsp.cu
     src/link_prediction/jaccard.cu
     src/link_prediction/overlap.cu
     src/layout/force_atlas2.cu
@@ -357,9 +411,10 @@ add_library(cugraph SHARED
     src/community/louvain.cu
     src/community/leiden.cu
     src/community/ktruss.cu
-    src/community/ECG.cu
+    src/community/ecg.cu
     src/community/triangles_counting.cu
     src/community/extract_subgraph_by_vertex.cu
+    src/community/egonet.cu
     src/cores/core_number.cu
     src/traversal/two_hop_neighbors.cu
     src/components/connectivity.cu
@@ -367,6 +422,10 @@ add_library(cugraph SHARED
     src/centrality/betweenness_centrality.cu
     src/experimental/graph.cu
     src/experimental/graph_view.cu
+    src/experimental/coarsen_graph.cu
+    src/experimental/renumber_edgelist.cu
+    src/experimental/relabel.cu
+    src/experimental/induced_subgraph.cu
     src/experimental/bfs.cu
     src/experimental/sssp.cu
     src/experimental/pagerank.cu
@@ -374,12 +433,17 @@ add_library(cugraph SHARED
     src/tree/mst.cu
 )
 
+target_link_directories(cugraph
+    PRIVATE
+    # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the
+    # link directories for nvcc.
+    "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+
 #
 # NOTE:  This dependency will force the building of cugraph to
 #        wait until after cugunrock is constructed.
 #
 add_dependencies(cugraph gunrock_ext)
-add_dependencies(cugraph raft)
 
 ###################################################################################################
 # - include paths ---------------------------------------------------------------------------------
@@ -408,7 +472,7 @@ target_include_directories(cugraph
 # - link libraries --------------------------------------------------------------------------------
 
 target_link_libraries(cugraph PRIVATE
-    gunrock cublas cusparse curand cusolver cudart cuda ${NCCL_LIBRARIES})
+  gunrock cublas cusparse curand cusolver cudart cuda FAISS::FAISS ${NCCL_LIBRARIES})
 
 if(OpenMP_CXX_FOUND)
 target_link_libraries(cugraph PRIVATE
@@ -461,16 +525,23 @@ target_link_libraries(cugraph PRIVATE
     ${OpenMP_CXX_LIB_NAMES})
 endif(OpenMP_CXX_FOUND)
 
+# CUDA_ARCHITECTURES=OFF implies cmake will not pass arch flags to the
+# compiler. CUDA_ARCHITECTURES must be set to a non-empty value to prevent cmake
+# warnings about policy CMP0104. With this setting, arch flags must be manually
+# set! ("evaluate_gpu_archs(GPU_ARCHS)" is the current mechanism used in
+# cpp/CMakeLists.txt for setting arch options).  Run "cmake --help-policy
+# CMP0104" for policy details.
+# NOTE: the CUDA_ARCHITECTURES=OFF setting may be removed after migrating to the
+# findcudatoolkit features in cmake 3.17+
+set_target_properties(cugraph PROPERTIES
+    CUDA_ARCHITECTURES OFF)
+
 ###################################################################################################
 # - generate tests --------------------------------------------------------------------------------
 
 if(BUILD_TESTS)
     if(GTEST_FOUND)
-        # target_link_directories is added in cmake 3.13, and cmake advises to use this instead of
-        # link_directoires (we should switch to target_link_directories once 3.13 becomes the
-        # minimum required version).
-        link_directories(${GTEST_LIBRARY_DIR})
-        add_subdirectory(${CMAKE_SOURCE_DIR}/tests)
+        add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests)
     endif(GTEST_FOUND)
 endif(BUILD_TESTS)
 
diff --git a/cpp/cmake/Modules/ConfigureArrow.cmake b/cpp/cmake/Modules/ConfigureArrow.cmake
deleted file mode 100644
index b27e53dd415..00000000000
--- a/cpp/cmake/Modules/ConfigureArrow.cmake
+++ /dev/null
@@ -1,98 +0,0 @@
-set(ARROW_ROOT ${CMAKE_BINARY_DIR}/arrow)
-
-set(ARROW_CMAKE_ARGS " -DARROW_WITH_LZ4=OFF"
-                     " -DARROW_WITH_ZSTD=OFF"
-                     " -DARROW_WITH_BROTLI=OFF"
-                     " -DARROW_WITH_SNAPPY=OFF"
-                     " -DARROW_WITH_ZLIB=OFF"
-                     " -DARROW_BUILD_STATIC=ON"
-                     " -DARROW_BUILD_SHARED=OFF"
-                     " -DARROW_BOOST_USE_SHARED=ON"
-                     " -DARROW_BUILD_TESTS=OFF"
-                     " -DARROW_TEST_LINKAGE=OFF"
-                     " -DARROW_TEST_MEMCHECK=OFF"
-                     " -DARROW_BUILD_BENCHMARKS=OFF"
-                     " -DARROW_IPC=ON"
-                     " -DARROW_COMPUTE=OFF"
-                     " -DARROW_CUDA=OFF"
-                     " -DARROW_JEMALLOC=OFF"
-                     " -DARROW_BOOST_VENDORED=OFF"
-                     " -DARROW_PYTHON=OFF"
-                     " -DARROW_USE_GLOG=OFF"
-                     " -DCMAKE_VERBOSE_MAKEFILE=ON")
-
-configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Arrow.CMakeLists.txt.cmake"
-               "${ARROW_ROOT}/CMakeLists.txt")
-
-file(MAKE_DIRECTORY "${ARROW_ROOT}/build")
-file(MAKE_DIRECTORY "${ARROW_ROOT}/install")
-
-execute_process(
-    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-    RESULT_VARIABLE ARROW_CONFIG
-    WORKING_DIRECTORY ${ARROW_ROOT})
-
-if(ARROW_CONFIG)
-    message(FATAL_ERROR "Configuring Arrow failed: " ${ARROW_CONFIG})
-endif(ARROW_CONFIG)
-
-set(PARALLEL_BUILD -j)
-if($ENV{PARALLEL_LEVEL})
-    set(NUM_JOBS $ENV{PARALLEL_LEVEL})
-    set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}")
-endif($ENV{PARALLEL_LEVEL})
-
-if(${NUM_JOBS})
-    if(${NUM_JOBS} EQUAL 1)
-        message(STATUS "ARROW BUILD: Enabling Sequential CMake build")
-    elseif(${NUM_JOBS} GREATER 1)
-        message(STATUS "ARROW BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs")
-    endif(${NUM_JOBS} EQUAL 1)
-else()
-    message(STATUS "ARROW BUILD: Enabling Parallel CMake build with all threads")
-endif(${NUM_JOBS})
-
-execute_process(
-    COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD}
-    RESULT_VARIABLE ARROW_BUILD
-    WORKING_DIRECTORY ${ARROW_ROOT}/build)
-
-if(ARROW_BUILD)
-    message(FATAL_ERROR "Building Arrow failed: " ${ARROW_BUILD})
-endif(ARROW_BUILD)
-
-set(ARROW_GENERATED_IPC_DIR 
-    "${ARROW_ROOT}/build/src/arrow/ipc")
-
-configure_file(${ARROW_GENERATED_IPC_DIR}/File_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/File_generated.h COPYONLY)
-configure_file(${ARROW_GENERATED_IPC_DIR}/Message_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Message_generated.h COPYONLY)
-configure_file(${ARROW_GENERATED_IPC_DIR}/Schema_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Schema_generated.h COPYONLY)
-configure_file(${ARROW_GENERATED_IPC_DIR}/Tensor_generated.h ${CMAKE_SOURCE_DIR}/include/cudf/ipc_generated/Tensor_generated.h COPYONLY)
-
-message(STATUS "Arrow installed here: " ${ARROW_ROOT}/install)
-set(ARROW_LIBRARY_DIR "${ARROW_ROOT}/install/lib")
-set(ARROW_INCLUDE_DIR "${ARROW_ROOT}/install/include")
-
-find_library(ARROW_LIB arrow
-             NO_DEFAULT_PATH
-             HINTS "${ARROW_LIBRARY_DIR}")
-
-if(ARROW_LIB)
-    message(STATUS "Arrow library: " ${ARROW_LIB})
-    set(ARROW_FOUND TRUE)
-endif(ARROW_LIB)
-
-set(FLATBUFFERS_ROOT "${ARROW_ROOT}/build/flatbuffers_ep-prefix/src/flatbuffers_ep-install")
-
-message(STATUS "FlatBuffers installed here: " ${FLATBUFFERS_ROOT})
-set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_ROOT}/include")
-set(FLATBUFFERS_LIBRARY_DIR "${FLATBUFFERS_ROOT}/lib")
-
-add_definitions(-DARROW_METADATA_V4)
-add_definitions(-DARROW_VERSION=1210)
-
-
-
-
-
-
diff --git a/cpp/cmake/Modules/ConfigureGoogleTest.cmake b/cpp/cmake/Modules/ConfigureGoogleTest.cmake
deleted file mode 100644
index 9fac40f4649..00000000000
--- a/cpp/cmake/Modules/ConfigureGoogleTest.cmake
+++ /dev/null
@@ -1,49 +0,0 @@
-set(GTEST_ROOT "${CMAKE_BINARY_DIR}/googletest")
-
-set(GTEST_CMAKE_ARGS "")
-                     #" -Dgtest_build_samples=ON" 
-                     #" -DCMAKE_VERBOSE_MAKEFILE=ON")
-
-configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake"
-               "${GTEST_ROOT}/CMakeLists.txt")
-
-file(MAKE_DIRECTORY "${GTEST_ROOT}/build")
-file(MAKE_DIRECTORY "${GTEST_ROOT}/install")
-
-execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} .
-                RESULT_VARIABLE GTEST_CONFIG
-                WORKING_DIRECTORY ${GTEST_ROOT})
-
-if(GTEST_CONFIG)
-    message(FATAL_ERROR "Configuring GoogleTest failed: " ${GTEST_CONFIG})
-endif(GTEST_CONFIG)
-
-set(PARALLEL_BUILD -j)
-if($ENV{PARALLEL_LEVEL})
-    set(NUM_JOBS $ENV{PARALLEL_LEVEL})
-    set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}")
-endif($ENV{PARALLEL_LEVEL})
-
-if(${NUM_JOBS})
-    if(${NUM_JOBS} EQUAL 1)
-        message(STATUS "GTEST BUILD: Enabling Sequential CMake build")
-    elseif(${NUM_JOBS} GREATER 1)
-        message(STATUS "GTEST BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs")
-    endif(${NUM_JOBS} EQUAL 1)
-else()
-    message(STATUS "GTEST BUILD: Enabling Parallel CMake build with all threads")
-endif(${NUM_JOBS})
-
-execute_process(COMMAND ${CMAKE_COMMAND} --build .. -- ${PARALLEL_BUILD}
-                RESULT_VARIABLE GTEST_BUILD
-                WORKING_DIRECTORY ${GTEST_ROOT}/build)
-
-if(GTEST_BUILD)
-    message(FATAL_ERROR "Building GoogleTest failed: " ${GTEST_BUILD})
-endif(GTEST_BUILD)
-
-message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install)
-set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include")
-set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib")
-set(GTEST_FOUND TRUE)
-
diff --git a/cpp/cmake/Modules/FindFAISS.cmake b/cpp/cmake/Modules/FindFAISS.cmake
new file mode 100644
index 00000000000..7c456edfeef
--- /dev/null
+++ b/cpp/cmake/Modules/FindFAISS.cmake
@@ -0,0 +1,98 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Based on FindPNG.cmake from cmake 3.14.3
+
+#[=======================================================================[.rst:
+FindFAISS
+--------
+
+Template to generate FindPKG_NAME.cmake CMake modules
+
+Find FAISS
+
+Imported targets
+^^^^^^^^^^^^^^^^
+
+This module defines the following :prop_tgt:`IMPORTED` target:
+
+``FAISS::FAISS``
+  The libFAISS library, if found.
+
+Result variables
+^^^^^^^^^^^^^^^^
+
+This module will set the following variables in your project:
+
+``FAISS_INCLUDE_DIRS``
+  where to find FAISS.hpp , etc.
+``FAISS_LIBRARIES``
+  the libraries to link against to use libFAISS.
+``FAISS_FOUND``
+  If false, do not try to use FAISS.
+``FAISS_VERSION_STRING``
+  the version of the FAISS library found
+
+#]=======================================================================]
+
+find_path(FAISS_LOCATION faiss/IndexFlat.h
+          HINTS ${FAISS_INSTALL_DIR}
+          PATH_SUFFIXES include include/)
+
+list(APPEND FAISS_NAMES faiss libfaiss)
+set(_FAISS_VERSION_SUFFIXES )
+
+foreach(v IN LISTS _FAISS_VERSION_SUFFIXES)
+  list(APPEND FAISS_NAMES faiss${v} libfaiss${v})
+  list(APPEND FAISS_NAMES faiss.${v} libfaiss.${v})
+endforeach()
+unset(_FAISS_VERSION_SUFFIXES)
+
+find_library(FAISS_LIBRARY_RELEASE NAMES ${FAISS_NAMES}
+             HINTS ${FAISS_INSTALL_DIR}
+             PATH_SUFFIXES lib)
+
+include(${CMAKE_ROOT}/Modules/SelectLibraryConfigurations.cmake)
+select_library_configurations(FAISS)
+mark_as_advanced(FAISS_LIBRARY_RELEASE)
+unset(FAISS_NAMES)
+
+# Set by select_library_configurations(), but we want the one from
+# find_package_handle_standard_args() below.
+unset(FAISS_FOUND)
+
+if (FAISS_LIBRARY AND FAISS_LOCATION)
+  set(FAISS_INCLUDE_DIRS ${FAISS_LOCATION} )
+  set(FAISS_LIBRARY ${FAISS_LIBRARY})
+
+  if(NOT TARGET FAISS::FAISS)
+    add_library(FAISS::FAISS UNKNOWN IMPORTED)
+    set_target_properties(FAISS::FAISS PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${FAISS_INCLUDE_DIRS}")
+    if(EXISTS "${FAISS_LIBRARY}")
+      set_target_properties(FAISS::FAISS PROPERTIES
+        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
+        IMPORTED_LOCATION "${FAISS_LIBRARY}")
+    endif()
+  endif()
+endif ()
+
+
+include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake)
+find_package_handle_standard_args(FAISS
+                                  REQUIRED_VARS FAISS_LIBRARY FAISS_LOCATION
+                                  VERSION_VAR FAISS_VERSION_STRING)
+
+mark_as_advanced(FAISS_LOCATION FAISS_LIBRARY)
diff --git a/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake b/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake
deleted file mode 100644
index b1eaf3f0efa..00000000000
--- a/cpp/cmake/Templates/Arrow.CMakeLists.txt.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-
-include(ExternalProject)
-
-ExternalProject_Add(Arrow
-                    GIT_REPOSITORY    https://github.com/apache/arrow.git
-                    GIT_TAG           apache-arrow-0.12.1
-                    SOURCE_DIR        "${ARROW_ROOT}/arrow"
-                    SOURCE_SUBDIR     "cpp"
-                    BINARY_DIR        "${ARROW_ROOT}/build"
-                    INSTALL_DIR       "${ARROW_ROOT}/install"
-                    CMAKE_ARGS        ${ARROW_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${ARROW_ROOT}/install)
-
-
-
-
-
-
-
diff --git a/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake b/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake
deleted file mode 100644
index 66e1dc85a50..00000000000
--- a/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-
-include(ExternalProject)
-
-ExternalProject_Add(GoogleTest
-                    GIT_REPOSITORY    https://github.com/google/googletest.git
-                    GIT_TAG           release-1.8.0
-                    SOURCE_DIR        "${GTEST_ROOT}/googletest"
-                    BINARY_DIR        "${GTEST_ROOT}/build"
-                    INSTALL_DIR		  "${GTEST_ROOT}/install"
-                    CMAKE_ARGS        ${GTEST_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT}/install)
-
-
-
-
-
-
-
-
diff --git a/cpp/cmake/faiss_cuda11.patch b/cpp/cmake/faiss_cuda11.patch
new file mode 100644
index 00000000000..496ca0e7b23
--- /dev/null
+++ b/cpp/cmake/faiss_cuda11.patch
@@ -0,0 +1,40 @@
+diff --git a/configure b/configure
+index ed40dae..f88ed0a 100755
+--- a/configure
++++ b/configure
+@@ -2970,7 +2970,7 @@ ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ex
+ ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+-  ax_cxx_compile_alternatives="11 0x"    ax_cxx_compile_cxx11_required=true
++  ax_cxx_compile_alternatives="14 11 0x"    ax_cxx_compile_cxx11_required=true
+   ac_ext=cpp
+ ac_cpp='$CXXCPP $CPPFLAGS'
+ ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+diff --git a/gpu/utils/DeviceDefs.cuh b/gpu/utils/DeviceDefs.cuh
+index 89d3dda..bc0f9b5 100644
+--- a/gpu/utils/DeviceDefs.cuh
++++ b/gpu/utils/DeviceDefs.cuh
+@@ -13,7 +13,7 @@
+ namespace faiss { namespace gpu {
+
+ #ifdef __CUDA_ARCH__
+-#if __CUDA_ARCH__ <= 750
++#if __CUDA_ARCH__ <= 800
+ constexpr int kWarpSize = 32;
+ #else
+ #error Unknown __CUDA_ARCH__; please define parameters for compute capability
+diff --git a/gpu/utils/MatrixMult-inl.cuh b/gpu/utils/MatrixMult-inl.cuh
+index ede225e..4f7eb44 100644
+--- a/gpu/utils/MatrixMult-inl.cuh
++++ b/gpu/utils/MatrixMult-inl.cuh
+@@ -51,6 +51,9 @@ rawGemm(cublasHandle_t handle,
+   auto cBT = GetCudaType<BT>::Type;
+
+   // Always accumulate in f32
++# if __CUDACC_VER_MAJOR__ >= 11
++  cublasSetMathMode(handle, CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION);
++# endif
+   return cublasSgemmEx(handle, transa, transb, m, n, k,
+                        &fAlpha, A, cAT, lda,
+                        B, cBT, ldb,
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index a57e550521e..c666bce23ad 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #pragma once
-
+#include <experimental/graph.hpp>
 #include <experimental/graph_view.hpp>
 #include <graph.hpp>
 #include <internals.hpp>
@@ -22,78 +22,6 @@
 
 namespace cugraph {
 
-/**
- * @brief     Find the PageRank vertex values for a graph.
- *
- * cuGraph computes an approximation of the Pagerank eigenvector using the power method.
- * The number of iterations depends on the properties of the network itself; it increases
- * when the tolerance descreases and/or alpha increases toward the limiting value of 1.
- * The user is free to use default values or to provide inputs for the initial guess,
- * tolerance and maximum number of iterations.
-
- *
- * @throws                           cugraph::logic_error with a custom message when an error
- occurs.
- *
- * @tparam VT                        Type of vertex identifiers. Supported value : int (signed,
- 32-bit)
- * @tparam ET                        Type of edge identifiers. Supported value : int (signed,
- 32-bit)
- * @tparam WT                        Type of edge weights. Supported value : float or double.
- *
- * @param[in] handle                 Library handle (RAFT). If a communicator is set in the handle,
- the multi GPU version will be selected.
- * @param[in] graph                  cuGraph graph descriptor, should contain the connectivity
- information as a transposed adjacency list (CSC). Edge weights are not used for this algorithm.
- * @param[in] alpha                  The damping factor alpha represents the probability to follow
- an outgoing edge, standard value is 0.85. Thus, 1.0-alpha is the probability to “teleport” to a
- random vertex. Alpha should be greater than 0.0 and strictly lower than 1.0.
- *                                   The initial guess must not be the vector of 0s. Any value other
- than 1 or 0 is treated as an invalid value.
- * @param[in] pagerank               Array of size V. Should contain the initial guess if
- has_guess=true. In this case the initial guess cannot be the vector of 0s. Memory is provided and
- owned by the caller.
- * @param[in] personalization_subset_size (optional) Supported on single-GPU, on the roadmap for
- Multi-GPU. The number of vertices for to personalize. Initialized to 0 by default.
- * @param[in] personalization_subset (optional) Supported on single-GPU, on the roadmap for
- Multi-GPU..= Array of size personalization_subset_size containing vertices for running personalized
- pagerank. Initialized to nullptr by default. Memory is provided and owned by the caller.
- * @param[in] personalization_values (optional) Supported on single-GPU, on the roadmap for
- Multi-GPU. Array of size personalization_subset_size containing values associated with
- personalization_subset vertices. Initialized to nullptr by default. Memory is provided and owned by
- the caller.
- * @param[in] tolerance              Supported on single-GPU. Set the tolerance the approximation,
- this parameter should be a small magnitude value.
- *                                   The lower the tolerance the better the approximation. If this
- value is 0.0f, cuGraph will use the default value which is 1.0E-5.
- *                                   Setting too small a tolerance can lead to non-convergence due
- to numerical roundoff. Usually values between 0.01 and 0.00001 are acceptable.
- * @param[in] max_iter               (optional) The maximum number of iterations before an answer is
- returned. This can be used to limit the execution time and do an early exit before the solver
- reaches the convergence tolerance.
- *                                   If this value is lower or equal to 0 cuGraph will use the
- default value, which is 500.
- * @param[in] has_guess              (optional) Supported on single-GPU. This parameter is used to
- notify cuGraph if it should use a user-provided initial guess. False means the user does not have a
- guess, in this case cuGraph will use a uniform vector set to 1/V.
- *                                   If the value is True, cuGraph will read the pagerank parameter
- and use this as an initial guess.
- * @param[out] *pagerank             The PageRank : pagerank[i] is the PageRank of vertex i. Memory
- remains provided and owned by the caller.
- *
- */
-template <typename VT, typename ET, typename WT>
-void pagerank(raft::handle_t const &handle,
-              GraphCSCView<VT, ET, WT> const &graph,
-              WT *pagerank,
-              VT personalization_subset_size = 0,
-              VT *personalization_subset     = nullptr,
-              WT *personalization_values     = nullptr,
-              double alpha                   = 0.85,
-              double tolerance               = 1e-5,
-              int64_t max_iter               = 500,
-              bool has_guess                 = false);
-
 /**
  * @brief     Compute jaccard similarity coefficient for all vertices
  *
@@ -264,6 +192,44 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
                   bool verbose                                  = false,
                   internals::GraphBasedDimRedCallback *callback = nullptr);
 
+/**
+ * @brief Finds an approximate solution to the traveling salesperson problem (TSP).
+ *        cuGraph computes an approximation of the TSP problem using hill climbing
+ *        optimization.
+ *
+ *        The current implementation does not support a weighted graph.
+ *
+ * @throws                                    cugraph::logic_error when an error occurs.
+ * @param[in] handle                          Library handle (RAFT). If a communicator is set in the
+ * handle, the multi GPU version will be selected.
+ * @param[in] vtx_ptr                         Device array containing the vertex identifiers used
+ * to initialize the route.
+ * @param[in] x_pos                           Device array containing starting x-axis positions.
+ * @param[in] y_pos                           Device array containing starting y-axis positions.
+ * @param[in] nodes                           Number of cities.
+ * @param[in] restarts                        Number of starts to try. The more restarts,
+ * the better the solution will be approximated. The number of restarts depends on the problem
+ * size and should be kept low for instances above 2k cities.
+ * @param[in] beam_search                     Specify if the initial solution should use KNN
+ * for an approximation solution.
+ * @param[in] k                               Beam width to use in the search.
+ * @param[in] nstart                          Start from a specific position.
+ * @param[in] verbose                         Logs configuration and iterative improvement.
+ * @param[out] route                          Device array containing the returned route.
+ *
+ */
+float traveling_salesperson(raft::handle_t &handle,
+                            int const *vtx_ptr,
+                            float const *x_pos,
+                            float const *y_pos,
+                            int nodes,
+                            int restarts,
+                            bool beam_search,
+                            int k,
+                            int nstart,
+                            bool verbose,
+                            int *route);
+
 /**
  * @brief     Compute betweenness centrality for a graph
  *
@@ -815,6 +781,7 @@ template <typename VT, typename ET, typename WT>
 std::unique_ptr<GraphCOO<VT, ET, WT>> extract_subgraph_vertex(GraphCOOView<VT, ET, WT> const &graph,
                                                               VT const *vertices,
                                                               VT num_vertices);
+}  // namespace subgraph
 
 /**
  * @brief     Wrapper function for Nvgraph balanced cut clustering
@@ -837,7 +804,6 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> extract_subgraph_vertex(GraphCOOView<VT, E
  * @param[out] clustering            Pointer to device memory where the resulting clustering will
  * be stored
  */
-}  // namespace subgraph
 
 namespace ext_raft {
 template <typename VT, typename ET, typename WT>
@@ -1191,6 +1157,35 @@ void katz_centrality(raft::handle_t const &handle,
                      bool has_initial_guess  = false,
                      bool normalize          = false,
                      bool do_expensive_check = false);
-
+/**
+ * @brief returns induced EgoNet subgraph(s) of neighbors centered at nodes in source_vertex within
+ * a given radius.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object of, we extract induced egonet subgraphs from @p graph_view.
+ * @param source_vertex Pointer to egonet center vertices (size == @p n_subgraphs).
+ * @param n_subgraphs Number of induced EgoNet subgraphs to extract (ie. number of elements in @p
+ * source_vertex).
+ * @param radius  Include all neighbors of distance <= radius from @p source_vertex.
+ * @return std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>,
+ * rmm::device_uvector<weight_t>, rmm::device_uvector<size_t>> Quadraplet of edge source vertices,
+ * edge destination vertices, edge weights, and edge offsets for each induced EgoNet subgraph.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<weight_t>,
+           rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &handle,
+            graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const &graph_view,
+            vertex_t *source_vertex,
+            vertex_t n_subgraphs,
+            vertex_t radius);
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/experimental/detail/graph_utils.cuh b/cpp/include/experimental/detail/graph_utils.cuh
index bf56b2e6f80..3ac2e2163c6 100644
--- a/cpp/include/experimental/detail/graph_utils.cuh
+++ b/cpp/include/experimental/detail/graph_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 
 #include <experimental/graph_view.hpp>
 #include <partition_manager.hpp>
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/device_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <raft/handle.hpp>
@@ -24,8 +26,10 @@
 
 #include <thrust/sort.h>
 #include <thrust/transform.h>
+#include <cuco/detail/hash_functions.cuh>
 
 #include <algorithm>
+#include <numeric>
 #include <vector>
 
 namespace cugraph {
@@ -137,6 +141,38 @@ struct degree_from_offsets_t {
   __device__ edge_t operator()(vertex_t v) { return offsets[v + 1] - offsets[v]; }
 };
 
+template <typename vertex_t>
+struct compute_gpu_id_from_vertex_t {
+  int comm_size{0};
+
+  __device__ int operator()(vertex_t v) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    return hash_func(v) % comm_size;
+  }
+};
+
+template <typename vertex_t>
+struct compute_gpu_id_from_edge_t {
+  bool hypergraph_partitioned{false};
+  int comm_size{0};
+  int row_comm_size{0};
+  int col_comm_size{0};
+
+  __device__ int operator()(vertex_t major, vertex_t minor) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto major_comm_rank = static_cast<int>(hash_func(major) % comm_size);
+    auto minor_comm_rank = static_cast<int>(hash_func(minor) % comm_size);
+    if (hypergraph_partitioned) {
+      return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
+    } else {
+      return (major_comm_rank - (major_comm_rank % row_comm_size)) +
+             (minor_comm_rank / col_comm_size);
+    }
+  }
+};
+
 }  // namespace detail
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/experimental/graph.hpp b/cpp/include/experimental/graph.hpp
index 592294c8967..cc21f7c5013 100644
--- a/cpp/include/experimental/graph.hpp
+++ b/cpp/include/experimental/graph.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,7 +62,7 @@ class graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enab
   static constexpr bool is_multi_gpu             = multi_gpu;
 
   graph_t(raft::handle_t const &handle,
-          std::vector<edgelist_t<vertex_t, edge_t, weight_t>> const &edge_lists,
+          std::vector<edgelist_t<vertex_t, edge_t, weight_t>> const &edgelists,
           partition_t<vertex_t> const &partition,
           vertex_t number_of_vertices,
           edge_t number_of_edges,
@@ -70,7 +70,7 @@ class graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enab
           bool sorted_by_global_degree_within_vertex_partition,
           bool do_expensive_check = false);
 
-  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> view()
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> view() const
   {
     std::vector<edge_t const *> offsets(adj_matrix_partition_offsets_.size(), nullptr);
     std::vector<vertex_t const *> indices(adj_matrix_partition_indices_.size(), nullptr);
@@ -124,7 +124,7 @@ class graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enab
   static constexpr bool is_multi_gpu             = multi_gpu;
 
   graph_t(raft::handle_t const &handle,
-          edgelist_t<vertex_t, edge_t, weight_t> const &edge_list,
+          edgelist_t<vertex_t, edge_t, weight_t> const &edgelist,
           vertex_t number_of_vertices,
           graph_properties_t properties,
           bool sorted_by_degree,
@@ -132,7 +132,7 @@ class graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enab
 
   vertex_t get_number_of_local_vertices() const { return this->get_number_of_vertices(); }
 
-  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> view()
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> view() const
   {
     return graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>(
       *(this->get_handle_ptr()),
diff --git a/cpp/include/experimental/graph_functions.hpp b/cpp/include/experimental/graph_functions.hpp
new file mode 100644
index 00000000000..7b4bb466b97
--- /dev/null
+++ b/cpp/include/experimental/graph_functions.hpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <experimental/graph.hpp>
+#include <experimental/graph_view.hpp>
+#include <utilities/error.hpp>
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <memory>
+#include <tuple>
+
+namespace cugraph {
+namespace experimental {
+
+/**
+ * @brief renumber edgelist (multi-GPU)
+ *
+ * This function assumes that edges are pre-shuffled to their target processes using the
+ * compute_gpu_id_from_edge_t functor.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
+ * every (major, minor) pair should return the local GPU ID for this function to work (edges should
+ * be pre-shuffled).
+ * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
+ * functor to every (major, minor) pair should return the local GPU ID for this function to work
+ * (edges should be pre-shuffled).
+ * @param num_edgelist_edges Number of edges in the edgelist.
+ * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
+ * (this flag will be removed in the future).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
+ * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
+ * this process in multi-GPU), partition_t object storing graph partitioning information, total
+ * number of vertices, and total number of edges.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
+renumber_edgelist(raft::handle_t const& handle,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool is_hypergraph_partitioned,
+                  bool do_expensive_check = false);
+
+/**
+ * @brief renumber edgelist (single-GPU)
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter).
+ * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter).
+ * @param num_edgelist_edges Number of edges in the edgelist.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return rmm::device_uvector<vertex_t> Labels (vertex IDs before renumbering) for the entire set
+ * of vertices.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
+  raft::handle_t const& handle,
+  vertex_t* edgelist_major_vertices /* [INOUT] */,
+  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool do_expensive_check = false);
+
+/**
+ * @brief renumber edgelist (multi-GPU)
+ *
+ * This version takes the vertex set in addition; this allows renumbering to include isolated
+ * vertices. This function assumes that vertices and edges are pre-shuffled to their target
+ * processes using the compute_gpu_id_from_vertex_t & compute_gpu_id_from_edge_t functors,
+ * respectively.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param local_vertices Part of the entire set of vertices in the graph to be renumbered. Applying
+ * the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this function
+ * to work (vertices should be pre-shuffled).
+ * @param num_local_vertices Number of local vertices.
+ * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t functor to
+ * every (major, minor) pair should return the local GPU ID for this function to work (edges should
+ * be pre-shuffled).
+ * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter). Applying the compute_gpu_id_from_edge_t
+ * functor to every (major, minor) pair should return the local GPU ID for this function to work
+ * (edges should be pre-shuffled).
+ * @param num_edgelist_edges Number of edges in the edgelist.
+ * @param is_hypergraph_partitioned Flag indicating whether we are assuming hypergraph partitioning
+ * (this flag will be removed in the future).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>
+ * Quadruplet of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to
+ * this process in multi-GPU), partition_t object storing graph partitioning information, total
+ * number of vertices, and total number of edges.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
+renumber_edgelist(raft::handle_t const& handle,
+                  vertex_t const* local_vertices,
+                  vertex_t num_local_vertices,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool is_hypergraph_partitioned,
+                  bool do_expensive_check = false);
+
+/**
+ * @brief renumber edgelist (single-GPU)
+ *
+ * This version takes the vertex set in addition; this allows renumbering to include isolated
+ * vertices.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices The entire set of vertices in the graph to be renumbered.
+ * @param num_vertices Number of vertices.
+ * @param edgelist_major_vertices Edge source vertex IDs (if the graph adjacency matrix is stored as
+ * is) or edge destination vertex IDs (if the transposed graph adjacency matrix is stored). Vertex
+ * IDs are updated in-place ([INOUT] parameter).
+ * @param edgelist_minor_vertices Edge destination vertex IDs (if the graph adjacency matrix is
+ * stored as is) or edge source vertex IDs (if the transposed graph adjacency matrix is stored).
+ * Vertex IDs are updated in-place ([INOUT] parameter).
+ * @param num_edgelist_edges Number of edges in the edgelist.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return rmm::device_uvector<vertex_t> Labels (vertex IDs before renumbering) for the entire set
+ * of vertices.
+ */
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
+  raft::handle_t const& handle,
+  vertex_t const* vertices,
+  vertex_t num_vertices,
+  vertex_t* edgelist_major_vertices /* [INOUT] */,
+  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Compute the coarsened graph.
+ *
+ * Aggregates the vertices with the same label to a new vertex in the output coarsened graph.
+ * Multi-edges in the coarsened graph are collapsed to a single edge with its weight equal to the
+ * sum of multi-edge weights.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as
+ * transposed.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object of the input graph to be coarsened.
+ * @param labels Vertex labels (assigned to this process in multi-GPU) to be used in coarsening.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return std::tuple<std::unique_ptr<graph_t<vertex_t, edge_t, weight_t, store_transposed,
+ * multi_gpu>>, rmm::device_uvector<vertex_t>> Tuple of the coarsened graph and labels mapped to the
+ * vertices (assigned to this process in multi-GPU) in the coarsened graph.
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<std::unique_ptr<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>,
+           rmm::device_uvector<vertex_t>>
+coarsen_graph(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const& graph_view,
+  vertex_t const* labels,
+  bool do_expensive_check = false);
+
+/**
+ * @brief Relabel old labels to new labels.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param old_new_label_pairs Pairs of an old label and the corresponding new label (each process
+ * holds only part of the entire old labels and the corresponding new labels; partitioning can be
+ * arbitrary).
+ * @param num_label_pairs Number of (old, new) label pairs.
+ * @param labels Labels to be relabeled. This initially holds old labels. Old labels are updated to
+ * new labels in-place ([INOUT] parameter).
+ * @param num_labels Number of labels to be relabeled.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return rmm::device_uvector<vertex_t> New labels corresponding to the @p old_labels.
+ */
+template <typename vertex_t, bool multi_gpu>
+void relabel(raft::handle_t const& handle,
+             std::tuple<vertex_t const*, vertex_t const*> old_new_label_pairs,
+             vertex_t num_label_pairs,
+             vertex_t* labels /* [INOUT] */,
+             vertex_t num_labels,
+             bool do_expensive_check = false);
+
+/**
+ * @brief extract induced subgraph(s).
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights.
+ * @tparam store_transposed Flag indicating whether to store the graph adjacency matrix as is or as
+ * transposed.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Graph view object, we extract induced subgraphs from @p graph_view.
+ * @param subgraph_offsets Pointer to subgraph vertex offsets (size == @p num_subgraphs + 1).
+ * @param subgraph_vertices Pointer to subgraph vertices (size == @p subgraph_offsets[@p
+ * num_subgraphs]). The elements of @p subgraph_vertices for each subgraph should be sorted in
+ * ascending order and unique.
+ * @param num_subgraphs Number of induced subgraphs to extract.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>,
+ * rmm::device_uvector<weight_t>, rmm::device_uvector<size_t>> Quadraplet of edge major (destination
+ * if @p store_transposed is true, source otherwise) vertices, edge minor (source if @p
+ * store_transposed  is true, destination otherwise) vertices, edge weights, and edge offsets for
+ * each induced subgraphs (size == num_subgraphs + 1). The sizes of the edge major & minor vertices
+ * are edge_offsets[num_subgraphs]. The size of the edge weights is either
+ * edge_offsets[num_subgraphs] (if @p graph_view is weighted) or 0 (if @p graph_view is unweighted).
+ */
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<weight_t>,
+           rmm::device_uvector<size_t>>
+extract_induced_subgraphs(
+  raft::handle_t const& handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const& graph_view,
+  size_t const* subgraph_offsets /* size == num_subgraphs + 1 */,
+  vertex_t const* subgraph_vertices /* size == subgraph_offsets[num_subgraphs] */,
+  size_t num_subgraphs,
+  bool do_expensive_check = false);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/experimental/graph_view.hpp b/cpp/include/experimental/graph_view.hpp
index ba327047b1d..d2ae1150970 100644
--- a/cpp/include/experimental/graph_view.hpp
+++ b/cpp/include/experimental/graph_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -332,6 +332,7 @@ class graph_view_t<vertex_t,
 
   bool is_weighted() const { return adj_matrix_partition_weights_.size() > 0; }
 
+  // FIXME: this should be removed once MNMG Louvain is updated to use graph primitives
   partition_t<vertex_t> get_partition() const { return partition_; }
 
   vertex_t get_number_of_local_vertices() const
@@ -399,6 +400,11 @@ class graph_view_t<vertex_t,
     }
   }
 
+  edge_t get_number_of_local_adj_matrix_partition_edges(size_t adj_matrix_partition_idx) const
+  {
+    return adj_matrix_partition_number_of_edges_[adj_matrix_partition_idx];
+  }
+
   vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const
   {
     return store_transposed ? partition_.get_matrix_partition_minor_first()
@@ -441,25 +447,25 @@ class graph_view_t<vertex_t,
 
   bool is_hypergraph_partitioned() const { return partition_.is_hypergraph_partitioned(); }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
   edge_t const* offsets() const { return offsets(0); }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
   vertex_t const* indices() const { return indices(0); }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
   weight_t const* weights() const { return weights(0); }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
@@ -468,7 +474,7 @@ class graph_view_t<vertex_t,
     return adj_matrix_partition_offsets_[adj_matrix_partition_idx];
   }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
@@ -477,7 +483,7 @@ class graph_view_t<vertex_t,
     return adj_matrix_partition_indices_[adj_matrix_partition_idx];
   }
 
-  // FIXME: this function is not part of the public stable API.This function is mainly for pattern
+  // FIXME: this function is not part of the public stable API. This function is mainly for pattern
   // accelerator implementation. This function is currently public to support the legacy
   // implementations directly accessing CSR/CSC data, but this function will eventually become
   // private or even disappear if we switch to CSR + DCSR (or CSC + DCSC).
@@ -492,6 +498,7 @@ class graph_view_t<vertex_t,
   std::vector<edge_t const*> adj_matrix_partition_offsets_{};
   std::vector<vertex_t const*> adj_matrix_partition_indices_{};
   std::vector<weight_t const*> adj_matrix_partition_weights_{};
+  std::vector<edge_t> adj_matrix_partition_number_of_edges_{};
 
   partition_t<vertex_t> partition_{};
 
@@ -567,6 +574,12 @@ class graph_view_t<vertex_t,
     return this->get_number_of_vertices();
   }
 
+  edge_t get_number_of_local_adj_matrix_partition_edges(size_t adj_matrix_partition_idx) const
+  {
+    assert(adj_matrix_partition_idx == 0);
+    return this->get_number_of_edges();
+  }
+
   vertex_t get_local_adj_matrix_partition_row_first(size_t adj_matrix_partition_idx) const
   {
     assert(adj_matrix_partition_idx == 0);
@@ -629,6 +642,7 @@ class graph_view_t<vertex_t,
   edge_t const* offsets_{nullptr};
   vertex_t const* indices_{nullptr};
   weight_t const* weights_{nullptr};
+
   std::vector<vertex_t> segment_offsets_{};  // segment offsets based on vertex degree, relevant
                                              // only if sorted_by_global_degree is true
 };
diff --git a/cpp/include/matrix_partition_device.cuh b/cpp/include/matrix_partition_device.cuh
index 53796530f60..b41119e7be6 100644
--- a/cpp/include/matrix_partition_device.cuh
+++ b/cpp/include/matrix_partition_device.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,11 +30,14 @@ class matrix_partition_device_base_t {
  public:
   matrix_partition_device_base_t(edge_t const* offsets,
                                  vertex_t const* indices,
-                                 weight_t const* weights)
-    : offsets_(offsets), indices_(indices), weights_(weights)
+                                 weight_t const* weights,
+                                 edge_t number_of_edges)
+    : offsets_(offsets), indices_(indices), weights_(weights), number_of_edges_(number_of_edges)
   {
   }
 
+  __host__ __device__ edge_t get_number_of_edges() const { return number_of_edges_; }
+
   __device__ thrust::tuple<vertex_t const*, weight_t const*, edge_t> get_local_edges(
     vertex_t major_offset) const noexcept
   {
@@ -50,11 +53,17 @@ class matrix_partition_device_base_t {
     return *(offsets_ + (major_offset + 1)) - *(offsets_ + major_offset);
   }
 
+  __device__ edge_t get_local_offset(vertex_t major_offset) const noexcept
+  {
+    return *(offsets_ + major_offset);
+  }
+
  private:
   // should be trivially copyable to device
   edge_t const* offsets_{nullptr};
   vertex_t const* indices_{nullptr};
   weight_t const* weights_{nullptr};
+  edge_t number_of_edges_{0};
 };
 
 template <typename GraphViewType, typename Enable = void>
@@ -73,7 +82,8 @@ class matrix_partition_device_t<GraphViewType, std::enable_if_t<GraphViewType::i
                                      typename GraphViewType::weight_type>(
         graph_view.offsets(partition_idx),
         graph_view.indices(partition_idx),
-        graph_view.weights(partition_idx)),
+        graph_view.weights(partition_idx),
+        graph_view.get_number_of_local_adj_matrix_partition_edges(partition_idx)),
       major_first_(GraphViewType::is_adj_matrix_transposed
                      ? graph_view.get_local_adj_matrix_partition_col_first(partition_idx)
                      : graph_view.get_local_adj_matrix_partition_row_first(partition_idx)),
@@ -93,7 +103,7 @@ class matrix_partition_device_t<GraphViewType, std::enable_if_t<GraphViewType::i
   {
   }
 
-  typename GraphViewType::vertex_type get_major_value_start_offset() const
+  __host__ __device__ typename GraphViewType::vertex_type get_major_value_start_offset() const
   {
     return major_value_start_offset_;
   }
@@ -173,7 +183,10 @@ class matrix_partition_device_t<GraphViewType, std::enable_if_t<!GraphViewType::
     : matrix_partition_device_base_t<typename GraphViewType::vertex_type,
                                      typename GraphViewType::edge_type,
                                      typename GraphViewType::weight_type>(
-        graph_view.offsets(), graph_view.indices(), graph_view.weights()),
+        graph_view.offsets(),
+        graph_view.indices(),
+        graph_view.weights(),
+        graph_view.get_number_of_edges()),
       number_of_vertices_(graph_view.get_number_of_vertices())
   {
     assert(partition_idx == 0);
diff --git a/cpp/include/patterns/any_of_adj_matrix_row.cuh b/cpp/include/patterns/any_of_adj_matrix_row.cuh
index 199e7c230ef..a367ec2a50c 100644
--- a/cpp/include/patterns/any_of_adj_matrix_row.cuh
+++ b/cpp/include/patterns/any_of_adj_matrix_row.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <experimental/graph_view.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <raft/handle.hpp>
diff --git a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
index 760775c03d4..d4559de06af 100644
--- a/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
+++ b/cpp/include/patterns/copy_to_adj_matrix_row_col.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,10 @@
 #include <experimental/graph_view.hpp>
 #include <matrix_partition_device.cuh>
 #include <partition_manager.hpp>
-#include <utilities/comm_utils.cuh>
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/device_comm.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 #include <utilities/thrust_tuple_utils.cuh>
 #include <vertex_partition_device.cuh>
 
@@ -120,10 +122,10 @@ void copy_to_matrix_major(raft::handle_t const& handle,
       for (int i = 0; i < row_comm_size; ++i) {
         rmm::device_uvector<vertex_t> rx_vertices(row_comm_rank == i ? size_t{0} : rx_counts[i],
                                                   handle.get_stream());
-        auto rx_tmp_buffer =
-          allocate_comm_buffer<typename std::iterator_traits<VertexValueInputIterator>::value_type>(
-            rx_counts[i], handle.get_stream());
-        auto rx_value_first = get_comm_buffer_begin<
+        auto rx_tmp_buffer = allocate_dataframe_buffer<
+          typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_counts[i],
+                                                                               handle.get_stream());
+        auto rx_value_first = get_dataframe_buffer_begin<
           typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_tmp_buffer);
 
         if (row_comm_rank == i) {
@@ -173,12 +175,6 @@ void copy_to_matrix_major(raft::handle_t const& handle,
                           map_first,
                           matrix_major_value_output_first);
         }
-
-        CUDA_TRY(cudaStreamSynchronize(
-          handle.get_stream()));  // this is as necessary rx_tmp_buffer will become out-of-scope
-                                  // once control flow exits this block (FIXME: we can reduce stream
-                                  // synchronization if we compute the maximum rx_counts and
-                                  // allocate rx_tmp_buffer outside the loop)
       }
     }
   } else {
@@ -219,7 +215,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
       // partitioning
       auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank;
       auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size;
-      // FIXME: this branch may no longer necessary with NCCL backend
+      // FIXME: this branch may be no longer necessary with NCCL backend
       if (comm_src_rank == comm_rank) {
         assert(comm_dst_rank == comm_rank);
         thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
@@ -314,12 +310,11 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
 
       vertex_partition_device_t<GraphViewType> vertex_partition(graph_view);
       rmm::device_uvector<vertex_t> dst_vertices(rx_count, handle.get_stream());
-      auto dst_tmp_buffer =
-        allocate_comm_buffer<typename std::iterator_traits<VertexValueInputIterator>::value_type>(
-          rx_count, handle.get_stream());
-      auto dst_value_first =
-        get_comm_buffer_begin<typename std::iterator_traits<VertexValueInputIterator>::value_type>(
-          dst_tmp_buffer);
+      auto dst_tmp_buffer = allocate_dataframe_buffer<
+        typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_count,
+                                                                             handle.get_stream());
+      auto dst_value_first = get_dataframe_buffer_begin<
+        typename std::iterator_traits<VertexValueInputIterator>::value_type>(dst_tmp_buffer);
       if (comm_src_rank == comm_rank) {
         thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                      vertex_first,
@@ -335,10 +330,10 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
                        vertex_value_input_first,
                        dst_value_first);
       } else {
-        auto src_tmp_buffer =
-          allocate_comm_buffer<typename std::iterator_traits<VertexValueInputIterator>::value_type>(
-            tx_count, handle.get_stream());
-        auto src_value_first = get_comm_buffer_begin<
+        auto src_tmp_buffer = allocate_dataframe_buffer<
+          typename std::iterator_traits<VertexValueInputIterator>::value_type>(tx_count,
+                                                                               handle.get_stream());
+        auto src_value_first = get_dataframe_buffer_begin<
           typename std::iterator_traits<VertexValueInputIterator>::value_type>(src_tmp_buffer);
 
         auto map_first =
@@ -369,10 +364,6 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
                                                                               rx_count,
                                                                               comm_src_rank,
                                                                               handle.get_stream());
-
-        CUDA_TRY(cudaStreamSynchronize(
-          handle.get_stream()));  // this is as necessary src_tmp_buffer will become out-of-scope
-                                  // once control flow exits this block
       }
 
       // FIXME: now we can clear tx_tmp_buffer
@@ -383,10 +374,10 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
       for (int i = 0; i < col_comm_size; ++i) {
         rmm::device_uvector<vertex_t> rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i],
                                                   handle.get_stream());
-        auto rx_tmp_buffer =
-          allocate_comm_buffer<typename std::iterator_traits<VertexValueInputIterator>::value_type>(
-            rx_counts[i], handle.get_stream());
-        auto rx_value_first = get_comm_buffer_begin<
+        auto rx_tmp_buffer = allocate_dataframe_buffer<
+          typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_counts[i],
+                                                                               handle.get_stream());
+        auto rx_value_first = get_dataframe_buffer_begin<
           typename std::iterator_traits<VertexValueInputIterator>::value_type>(rx_tmp_buffer);
 
         // FIXME: these broadcast operations can be placed between ncclGroupStart() and
@@ -423,17 +414,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle,
                           map_first,
                           matrix_minor_value_output_first);
         }
-
-        CUDA_TRY(cudaStreamSynchronize(
-          handle.get_stream()));  // this is as necessary rx_tmp_buffer will become out-of-scope
-                                  // once control flow exits this block (FIXME: we can reduce stream
-                                  // synchronization if we compute the maximum rx_counts and
-                                  // allocate rx_tmp_buffer outside the loop)
       }
-
-      CUDA_TRY(cudaStreamSynchronize(
-        handle.get_stream()));  // this is as necessary dst_tmp_buffer will become out-of-scope once
-                                // control flow exits this block
     }
   } else {
     assert(graph_view.get_number_of_local_vertices() ==
diff --git a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
index f3c36897dd6..3059cf95852 100644
--- a/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/patterns/copy_v_transform_reduce_in_out_nbr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,8 @@
 #include <matrix_partition_device.cuh>
 #include <patterns/edge_op_utils.cuh>
 #include <patterns/reduce_op.cuh>
-#include <utilities/comm_utils.cuh>
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/device_comm.cuh>
 #include <utilities/error.hpp>
 
 #include <raft/cudart_utils.h>
@@ -377,8 +378,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
           ? graph_view.get_number_of_local_adj_matrix_partition_rows()
           : graph_view.get_number_of_local_adj_matrix_partition_cols()
       : vertex_t{0};
-  auto minor_tmp_buffer   = allocate_comm_buffer<T>(minor_tmp_buffer_size, handle.get_stream());
-  auto minor_buffer_first = get_comm_buffer_begin<T>(minor_tmp_buffer);
+  auto minor_tmp_buffer = allocate_dataframe_buffer<T>(minor_tmp_buffer_size, handle.get_stream());
+  auto minor_buffer_first = get_dataframe_buffer_begin<T>(minor_tmp_buffer);
 
   if (in != GraphViewType::is_adj_matrix_transposed) {
     auto minor_init = init;
@@ -424,8 +425,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
               : graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)
           : vertex_t{0};
     }
-    auto major_tmp_buffer   = allocate_comm_buffer<T>(major_tmp_buffer_size, handle.get_stream());
-    auto major_buffer_first = get_comm_buffer_begin<T>(major_tmp_buffer);
+    auto major_tmp_buffer =
+      allocate_dataframe_buffer<T>(major_tmp_buffer_size, handle.get_stream());
+    auto major_buffer_first = get_dataframe_buffer_begin<T>(major_tmp_buffer);
 
     auto major_init = T{};
     if (in == GraphViewType::is_adj_matrix_transposed) {
@@ -523,12 +525,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
                       handle.get_stream());
       }
     }
-
-    CUDA_TRY(cudaStreamSynchronize(
-      handle.get_stream()));  // this is as necessary major_tmp_buffer will become out-of-scope once
-                              // control flow exits this block (FIXME: we can reduce stream
-                              // synchronization if we compute the maximum major_tmp_buffer_size and
-                              // allocate major_tmp_buffer outside the loop)
   }
 
   if (GraphViewType::is_multi_gpu && (in != GraphViewType::is_adj_matrix_transposed)) {
@@ -590,10 +586,6 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
       }
     }
   }
-
-  CUDA_TRY(cudaStreamSynchronize(
-    handle.get_stream()));  // this is as necessary minor_tmp_buffer will become out-of-scope once
-                            // control flow exits this block
 }
 
 }  // namespace detail
@@ -627,7 +619,7 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
  * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
  * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
  * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced.
- * @param init Initial value to be added to the reduced @e_op return values for each vertex.
+ * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
  * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first
  * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last`
  * (exclusive) is deduced as @p vertex_value_output_first + @p
@@ -689,7 +681,7 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle,
  * adj_matrix_col_value_input_first + j) (where i is in [0,
  * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
  * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced.
- * @param init Initial value to be added to the reduced @e_op return values for each vertex.
+ * @param init Initial value to be added to the reduced @p e_op return values for each vertex.
  * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
  * first (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last`
  * (exclusive) is deduced as @p vertex_value_output_first + @p
diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
new file mode 100644
index 00000000000..785f8197aff
--- /dev/null
+++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph.hpp>
+#include <experimental/graph_view.hpp>
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/error.hpp>
+#include <utilities/shuffle_comm.cuh>
+#include <vertex_partition_device.cuh>
+
+#include <raft/handle.hpp>
+
+#include <cuco/static_map.cuh>
+
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+// FIXME: block size requires tuning
+int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 128;
+
+template <typename GraphViewType, typename VertexIterator>
+__global__ void for_all_major_for_all_nbr_low_degree(
+  matrix_partition_device_t<GraphViewType> matrix_partition,
+  typename GraphViewType::vertex_type major_first,
+  typename GraphViewType::vertex_type major_last,
+  VertexIterator adj_matrix_minor_key_first,
+  typename GraphViewType::vertex_type* major_vertices,
+  typename GraphViewType::vertex_type* minor_keys,
+  typename GraphViewType::weight_type* key_aggregated_edge_weights,
+  typename GraphViewType::vertex_type invalid_vertex)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+
+  auto const tid          = threadIdx.x + blockIdx.x * blockDim.x;
+  auto major_start_offset = static_cast<size_t>(major_first - matrix_partition.get_major_first());
+  auto idx                = static_cast<size_t>(tid);
+
+  while (idx < static_cast<size_t>(major_last - major_first)) {
+    vertex_t const* indices{nullptr};
+    weight_t const* weights{nullptr};
+    edge_t local_degree{};
+    auto major_offset = major_start_offset + idx;
+    thrust::tie(indices, weights, local_degree) =
+      matrix_partition.get_local_edges(static_cast<vertex_t>(major_offset));
+    if (local_degree > 0) {
+      auto local_offset    = matrix_partition.get_local_offset(major_offset);
+      auto minor_key_first = thrust::make_transform_iterator(
+        indices, [matrix_partition, adj_matrix_minor_key_first] __device__(auto minor) {
+          return *(adj_matrix_minor_key_first +
+                   matrix_partition.get_minor_offset_from_minor_nocheck(minor));
+        });
+      thrust::copy(
+        thrust::seq, minor_key_first, minor_key_first + local_degree, minor_keys + local_offset);
+      if (weights == nullptr) {
+        thrust::sort(
+          thrust::seq, minor_keys + local_offset, minor_keys + local_offset + local_degree);
+      } else {
+        thrust::copy(
+          thrust::seq, weights, weights + local_degree, key_aggregated_edge_weights + local_offset);
+        thrust::sort_by_key(thrust::seq,
+                            minor_keys + local_offset,
+                            minor_keys + local_offset + local_degree,
+                            key_aggregated_edge_weights + local_offset);
+      }
+      // in-place reduce_by_key
+      vertex_t key_idx{0};
+      key_aggregated_edge_weights[local_offset + key_idx] =
+        weights != nullptr ? weights[0] : weight_t{1.0};
+      for (edge_t i = 1; i < local_degree; ++i) {
+        if (minor_keys[local_offset + i] == minor_keys[local_offset + key_idx]) {
+          key_aggregated_edge_weights[local_offset + key_idx] +=
+            weights != nullptr ? weights[i] : weight_t{1.0};
+        } else {
+          ++key_idx;
+          minor_keys[local_offset + key_idx] = minor_keys[local_offset + i];
+          key_aggregated_edge_weights[local_offset + key_idx] =
+            weights != nullptr ? weights[i] : weight_t{1.0};
+        }
+      }
+      thrust::fill(thrust::seq,
+                   major_vertices + local_offset,
+                   major_vertices + local_offset + key_idx,
+                   matrix_partition.get_major_from_major_offset_nocheck(major_offset));
+      thrust::fill(thrust::seq,
+                   major_vertices + local_offset + key_idx,
+                   major_vertices + local_offset + local_degree,
+                   invalid_vertex);
+    }
+
+    idx += gridDim.x * blockDim.x;
+  }
+}
+
+}  // namespace detail
+
+/**
+ * @brief Iterate over every vertex's key-aggregated outgoing edges to update vertex properties.
+ *
+ * This function is inspired by thrust::transfrom_reduce() (iteration over the outgoing edges
+ * part) and thrust::copy() (update vertex properties part, take transform_reduce output as copy
+ * input).
+ * Unlike copy_v_transform_reduce_out_nbr, this function first aggregates outgoing edges by key to
+ * support two level reduction for every vertex.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for graph adjacency matrix column key values for
+ * aggregation (key type should coincide with vertex type).
+ * @tparam ValueIterator Type of the iterator for values in (key, value) pairs.
+ * @tparam KeyAggregatedEdgeOp Type of the quinary key-aggregated edge operator.
+ * @tparam ReduceOp Type of the binary reduction operator.
+ * @tparam T Type of the initial value for reduction over the key-aggregated outgoing edges.
+ * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key (for
+ * aggregation) for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p
+ * graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param map_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
+ * (assigned to this process in multi-GPU,
+ * `cugraph::experimental::detail::compute_gpu_id_from_vertex_t` is used to map keys to processes).
+ * (Key, value) pairs may be provided by transform_reduce_by_adj_matrix_row_key_e() or
+ * transform_reduce_by_adj_matrix_col_key_e().
+ * @param map_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs (assigned
+ * to this process in multi-GPU).
+ * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs
+ * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p
+ * map_value_first + thrust::distance(@p map_key_first, @p map_key_last).
+ * @param key_aggregated_e_op Quinary operator takes edge source, key, aggregated edge weight, *(@p
+ * adj_matrix_row_value_input_first + i), and value for the key stored in the input (key, value)
+ * pairs provided by @p map_key_first, @p map_key_last, and @p map_value_first (aggregated over the
+ * entire set of processes in multi-GPU).
+ * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one.
+ * @param init Initial value to be added to the reduced @p key_aggregated_e_op return values for
+ * each vertex.
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the
+ * first (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last`
+ * (exclusive) is deduced as @p vertex_value_output_first + @p
+ * graph_view.get_number_of_local_vertices().
+ */
+template <typename GraphViewType,
+          typename AdjMatrixRowValueInputIterator,
+          typename VertexIterator,
+          typename ValueIterator,
+          typename KeyAggregatedEdgeOp,
+          typename ReduceOp,
+          typename T,
+          typename VertexValueOutputIterator>
+void copy_v_transform_reduce_key_aggregated_out_nbr(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  VertexIterator adj_matrix_col_key_first,
+  VertexIterator map_key_first,
+  VertexIterator map_key_last,
+  ValueIterator map_value_first,
+  KeyAggregatedEdgeOp key_aggregated_e_op,
+  ReduceOp reduce_op,
+  T init,
+  VertexValueOutputIterator vertex_value_output_first)
+{
+  static_assert(!GraphViewType::is_adj_matrix_transposed,
+                "GraphViewType should support the push model.");
+  static_assert(std::is_same<typename std::iterator_traits<VertexIterator>::value_type,
+                             typename GraphViewType::vertex_type>::value);
+
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+  using value_t  = typename std::iterator_traits<ValueIterator>::value_type;
+
+  double constexpr load_factor = 0.7;
+
+  // 1. build a cuco::static_map object for the k, v pairs.
+
+  auto kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+    static_cast<size_t>(static_cast<double>(thrust::distance(map_key_first, map_key_last)) /
+                        load_factor),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<vertex_t>::value);
+  auto pair_first = thrust::make_transform_iterator(
+    thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)),
+    [] __device__(auto val) {
+      return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+    });
+  kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last));
+
+  if (GraphViewType::is_multi_gpu) {
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+
+    rmm::device_uvector<vertex_t> unique_keys(
+      graph_view.get_number_of_local_adj_matrix_partition_cols(), handle.get_stream());
+    thrust::copy(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      adj_matrix_col_key_first,
+      adj_matrix_col_key_first + graph_view.get_number_of_local_adj_matrix_partition_cols(),
+      unique_keys.begin());
+    thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 unique_keys.begin(),
+                 unique_keys.end());
+    auto last = thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                               unique_keys.begin(),
+                               unique_keys.end());
+    unique_keys.resize(thrust::distance(unique_keys.begin(), last), handle.get_stream());
+
+    rmm::device_uvector<vertex_t> rx_unique_keys(0, handle.get_stream());
+    std::vector<size_t> rx_value_counts{};
+    std::tie(rx_unique_keys, rx_value_counts) = groupby_gpuid_and_shuffle_values(
+      comm,
+      unique_keys.begin(),
+      unique_keys.end(),
+      [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(auto val) {
+        return key_func(val);
+      },
+      handle.get_stream());
+
+    rmm::device_uvector<value_t> values_for_unique_keys(rx_unique_keys.size(), handle.get_stream());
+
+    CUDA_TRY(cudaStreamSynchronize(
+      handle.get_stream()));  // cuco::static_map currently does not take stream
+
+    kv_map_ptr->find(rx_unique_keys.begin(), rx_unique_keys.end(), values_for_unique_keys.begin());
+
+    rmm::device_uvector<value_t> rx_values_for_unique_keys(0, handle.get_stream());
+
+    std::tie(rx_values_for_unique_keys, std::ignore) =
+      shuffle_values(comm, values_for_unique_keys.begin(), rx_value_counts, handle.get_stream());
+
+    CUDA_TRY(cudaStreamSynchronize(
+      handle.get_stream()));  // cuco::static_map currently does not take stream
+
+    kv_map_ptr.reset();
+
+    kv_map_ptr = std::make_unique<cuco::static_map<vertex_t, value_t>>(
+      static_cast<size_t>(static_cast<double>(unique_keys.size()) / load_factor),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(
+        thrust::make_tuple(unique_keys.begin(), rx_values_for_unique_keys.begin())),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+
+    kv_map_ptr->insert(pair_first, pair_first + unique_keys.size());
+  }
+
+  // 2. aggregate each vertex out-going edges based on keys and transform-reduce.
+
+  auto loop_count = size_t{1};
+  if (GraphViewType::is_multi_gpu) {
+    auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+    auto const row_comm_size = row_comm.get_size();
+    loop_count               = graph_view.is_hypergraph_partitioned()
+                   ? graph_view.get_number_of_local_adj_matrix_partitions()
+                   : static_cast<size_t>(row_comm_size);
+  }
+
+  rmm::device_uvector<vertex_t> major_vertices(0, handle.get_stream());
+  auto e_op_result_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
+  for (size_t i = 0; i < loop_count; ++i) {
+    matrix_partition_device_t<GraphViewType> matrix_partition(
+      graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i);
+
+    int comm_root_rank = 0;
+    if (GraphViewType::is_multi_gpu) {
+      auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+      auto const row_comm_rank = row_comm.get_rank();
+      auto const row_comm_size = row_comm.get_size();
+      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+      auto const col_comm_rank = col_comm.get_rank();
+      comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank
+                                                              : col_comm_rank * row_comm_size + i;
+    }
+
+    auto num_edges = thrust::transform_reduce(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)),
+      thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)),
+      [matrix_partition] __device__(auto row) {
+        auto row_offset = matrix_partition.get_major_offset_from_major_nocheck(row);
+        return matrix_partition.get_local_degree(row_offset);
+      },
+      edge_t{0},
+      thrust::plus<edge_t>());
+
+    rmm::device_uvector<vertex_t> tmp_major_vertices(num_edges, handle.get_stream());
+    rmm::device_uvector<vertex_t> tmp_minor_keys(tmp_major_vertices.size(), handle.get_stream());
+    rmm::device_uvector<weight_t> tmp_key_aggregated_edge_weights(tmp_major_vertices.size(),
+                                                                  handle.get_stream());
+
+    if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) {
+      raft::grid_1d_thread_t update_grid(
+        graph_view.get_vertex_partition_size(comm_root_rank),
+        detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
+        handle.get_device_properties().maxGridSize[0]);
+
+      auto constexpr invalid_vertex = invalid_vertex_id<vertex_t>::value;
+
+      // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber
+      // vertices to insure that rows within a partition are sorted by their out-degree in
+      // decreasing order, we will apply this kernel only to low out-degree vertices.
+      detail::for_all_major_for_all_nbr_low_degree<<<update_grid.num_blocks,
+                                                     update_grid.block_size,
+                                                     0,
+                                                     handle.get_stream()>>>(
+        matrix_partition,
+        graph_view.get_vertex_partition_first(comm_root_rank),
+        graph_view.get_vertex_partition_last(comm_root_rank),
+        adj_matrix_col_key_first,
+        tmp_major_vertices.data(),
+        tmp_minor_keys.data(),
+        tmp_key_aggregated_edge_weights.data(),
+        invalid_vertex);
+    }
+
+    auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
+      tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin()));
+    auto last =
+      thrust::remove_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                        triplet_first,
+                        triplet_first + tmp_major_vertices.size(),
+                        [] __device__(auto val) {
+                          return thrust::get<0>(val) == invalid_vertex_id<vertex_t>::value;
+                        });
+    tmp_major_vertices.resize(thrust::distance(triplet_first, last), handle.get_stream());
+    tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream());
+    tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream());
+
+    if (GraphViewType::is_multi_gpu) {
+      auto& sub_comm           = handle.get_subcomm(graph_view.is_hypergraph_partitioned()
+                                            ? cugraph::partition_2d::key_naming_t().col_name()
+                                            : cugraph::partition_2d::key_naming_t().row_name());
+      auto const sub_comm_size = sub_comm.get_size();
+
+      triplet_first =
+        thrust::make_zip_iterator(thrust::make_tuple(tmp_major_vertices.begin(),
+                                                     tmp_minor_keys.begin(),
+                                                     tmp_key_aggregated_edge_weights.begin()));
+      rmm::device_uvector<vertex_t> rx_major_vertices(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> rx_minor_keys(0, handle.get_stream());
+      rmm::device_uvector<weight_t> rx_key_aggregated_edge_weights(0, handle.get_stream());
+      std::forward_as_tuple(
+        std::tie(rx_major_vertices, rx_minor_keys, rx_key_aggregated_edge_weights), std::ignore) =
+        groupby_gpuid_and_shuffle_values(
+          sub_comm,
+          triplet_first,
+          triplet_first + tmp_major_vertices.size(),
+          [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{sub_comm_size}] __device__(
+            auto val) { return key_func(thrust::get<1>(val)); },
+          handle.get_stream());
+
+      tmp_major_vertices              = std::move(rx_major_vertices);
+      tmp_minor_keys                  = std::move(rx_minor_keys);
+      tmp_key_aggregated_edge_weights = std::move(rx_key_aggregated_edge_weights);
+    }
+
+    auto tmp_e_op_result_buffer =
+      allocate_dataframe_buffer<T>(tmp_major_vertices.size(), handle.get_stream());
+    auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin<T>(tmp_e_op_result_buffer);
+
+    triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
+      tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin()));
+    thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                      triplet_first,
+                      triplet_first + major_vertices.size(),
+                      tmp_e_op_result_buffer_first,
+                      [adj_matrix_row_value_input_first,
+                       key_aggregated_e_op,
+                       matrix_partition,
+                       kv_map = kv_map_ptr->get_device_view()] __device__(auto val) {
+                        auto major = thrust::get<0>(val);
+                        auto key   = thrust::get<1>(val);
+                        auto w     = thrust::get<2>(val);
+                        return key_aggregated_e_op(
+                          major,
+                          key,
+                          w,
+                          *(adj_matrix_row_value_input_first +
+                            matrix_partition.get_major_offset_from_major_nocheck(major)),
+                          kv_map.find(key)->second);
+                      });
+    tmp_minor_keys.resize(0, handle.get_stream());
+    tmp_key_aggregated_edge_weights.resize(0, handle.get_stream());
+    tmp_minor_keys.shrink_to_fit(handle.get_stream());
+    tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream());
+
+    if (GraphViewType::is_multi_gpu) {
+      auto& sub_comm           = handle.get_subcomm(graph_view.is_hypergraph_partitioned()
+                                            ? cugraph::partition_2d::key_naming_t().col_name()
+                                            : cugraph::partition_2d::key_naming_t().row_name());
+      auto const sub_comm_rank = sub_comm.get_rank();
+      auto const sub_comm_size = sub_comm.get_size();
+
+      // FIXME: additional optimization is possible if reduce_op is a pure function (and reduce_op
+      // can be mapped to ncclRedOp_t).
+
+      auto rx_sizes =
+        host_scalar_gather(sub_comm, tmp_major_vertices.size(), i, handle.get_stream());
+      std::vector<size_t> rx_displs(
+        static_cast<size_t>(sub_comm_rank) == i ? sub_comm_size : int{0}, size_t{0});
+      if (static_cast<size_t>(sub_comm_rank) == i) {
+        std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1);
+      }
+      rmm::device_uvector<vertex_t> rx_major_vertices(
+        static_cast<size_t>(sub_comm_rank) == i
+          ? std::accumulate(rx_sizes.begin(), rx_sizes.end(), size_t{0})
+          : size_t{0},
+        handle.get_stream());
+      auto rx_tmp_e_op_result_buffer =
+        allocate_dataframe_buffer<T>(rx_major_vertices.size(), handle.get_stream());
+
+      device_gatherv(sub_comm,
+                     tmp_major_vertices.data(),
+                     rx_major_vertices.data(),
+                     tmp_major_vertices.size(),
+                     rx_sizes,
+                     rx_displs,
+                     i,
+                     handle.get_stream());
+      device_gatherv(sub_comm,
+                     tmp_e_op_result_buffer_first,
+                     get_dataframe_buffer_begin<T>(rx_tmp_e_op_result_buffer),
+                     tmp_major_vertices.size(),
+                     rx_sizes,
+                     rx_displs,
+                     i,
+                     handle.get_stream());
+
+      if (static_cast<size_t>(sub_comm_rank) == i) {
+        major_vertices     = std::move(rx_major_vertices);
+        e_op_result_buffer = std::move(rx_tmp_e_op_result_buffer);
+      }
+    } else {
+      major_vertices     = std::move(tmp_major_vertices);
+      e_op_result_buffer = std::move(tmp_e_op_result_buffer);
+    }
+  }
+
+  thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               vertex_value_output_first,
+               vertex_value_output_first + graph_view.get_number_of_local_vertices(),
+               T{});
+  thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                      major_vertices.begin(),
+                      major_vertices.end(),
+                      get_dataframe_buffer_begin<T>(e_op_result_buffer));
+
+  auto num_uniques = thrust::count_if(
+    rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    thrust::make_counting_iterator(size_t{0}),
+    thrust::make_counting_iterator(major_vertices.size()),
+    [major_vertices = major_vertices.data()] __device__(auto i) {
+      return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false;
+    });
+  rmm::device_uvector<vertex_t> unique_major_vertices(num_uniques, handle.get_stream());
+
+  auto major_vertex_first = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(size_t{0}),
+    [major_vertices = major_vertices.data()] __device__(auto i) {
+      return ((i == 0) || (major_vertices[i] == major_vertices[i - 1]))
+               ? major_vertices[i]
+               : invalid_vertex_id<vertex_t>::value;
+    });
+  thrust::copy_if(
+    major_vertex_first,
+    major_vertex_first + major_vertices.size(),
+    unique_major_vertices.begin(),
+    [] __device__(auto major) { return major != invalid_vertex_id<vertex_t>::value; });
+  thrust::reduce_by_key(
+    rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    major_vertices.begin(),
+    major_vertices.end(),
+    get_dataframe_buffer_begin<T>(e_op_result_buffer),
+    thrust::make_discard_iterator(),
+    thrust::make_permutation_iterator(
+      vertex_value_output_first,
+      thrust::make_transform_iterator(
+        major_vertices.begin(),
+        [vertex_partition = vertex_partition_device_t<GraphViewType>(graph_view)] __device__(
+          auto v) { return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); })),
+    reduce_op);
+
+  thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    vertex_value_output_first,
+                    vertex_value_output_first + graph_view.get_number_of_local_vertices(),
+                    vertex_value_output_first,
+                    [reduce_op, init] __device__(auto val) { return reduce_op(val, init); });
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/patterns/count_if_e.cuh b/cpp/include/patterns/count_if_e.cuh
index 4f0f0a7a43e..63b31f9c44e 100644
--- a/cpp/include/patterns/count_if_e.cuh
+++ b/cpp/include/patterns/count_if_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,8 @@
 #include <experimental/graph_view.hpp>
 #include <matrix_partition_device.cuh>
 #include <patterns/edge_op_utils.cuh>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/cudart_utils.h>
 #include <rmm/thrust_rmm_allocator.h>
diff --git a/cpp/include/patterns/count_if_v.cuh b/cpp/include/patterns/count_if_v.cuh
index c90b259cdde..6b28cd7ae12 100644
--- a/cpp/include/patterns/count_if_v.cuh
+++ b/cpp/include/patterns/count_if_v.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <experimental/graph_view.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <raft/handle.hpp>
diff --git a/cpp/include/patterns/reduce_op.cuh b/cpp/include/patterns/reduce_op.cuh
index e9011914292..d92d3352d08 100644
--- a/cpp/include/patterns/reduce_op.cuh
+++ b/cpp/include/patterns/reduce_op.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@ struct any {
   __host__ __device__ T operator()(T const& lhs, T const& rhs) const { return lhs; }
 };
 
+// FIXME: thrust::minimum can replace this.
 // reducing N elements (operator < should be defined between any two elements), the minimum element
 // should be selected.
 template <typename T>
diff --git a/cpp/include/patterns/reduce_v.cuh b/cpp/include/patterns/reduce_v.cuh
index 12224dc55f4..b232d37b78d 100644
--- a/cpp/include/patterns/reduce_v.cuh
+++ b/cpp/include/patterns/reduce_v.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <experimental/graph_view.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh
new file mode 100644
index 00000000000..70b6dc92752
--- /dev/null
+++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph_view.hpp>
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/error.hpp>
+#include <utilities/shuffle_comm.cuh>
+
+#include <raft/handle.hpp>
+
+#include <cuco/static_map.cuh>
+
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+// FIXME: block size requires tuning
+int32_t constexpr transform_reduce_by_key_e_for_all_block_size = 128;
+
+template <bool adj_matrix_row_key,
+          typename GraphViewType,
+          typename AdjMatrixRowValueInputIterator,
+          typename AdjMatrixColValueInputIterator,
+          typename VertexIterator,
+          typename EdgeOp,
+          typename T>
+__global__ void for_all_major_for_all_nbr_low_degree(
+  matrix_partition_device_t<GraphViewType> matrix_partition,
+  typename GraphViewType::vertex_type major_first,
+  typename GraphViewType::vertex_type major_last,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  VertexIterator adj_matrix_row_col_key_first,
+  EdgeOp e_op,
+  typename GraphViewType::vertex_type* keys,
+  T* values)
+{
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+
+  auto const tid          = threadIdx.x + blockIdx.x * blockDim.x;
+  auto major_start_offset = static_cast<size_t>(major_first - matrix_partition.get_major_first());
+  auto idx                = static_cast<size_t>(tid);
+
+  while (idx < static_cast<size_t>(major_last - major_first)) {
+    vertex_t const* indices{nullptr};
+    weight_t const* weights{nullptr};
+    edge_t local_degree{};
+    auto major_offset = major_start_offset + idx;
+    thrust::tie(indices, weights, local_degree) =
+      matrix_partition.get_local_edges(static_cast<vertex_t>(major_offset));
+    if (local_degree > 0) {
+      auto transform_op = [&matrix_partition,
+                           &adj_matrix_row_value_input_first,
+                           &adj_matrix_col_value_input_first,
+                           &adj_matrix_row_col_key_first,
+                           &e_op,
+                           major_offset,
+                           indices,
+                           weights] __device__(auto i) {
+        auto minor        = indices[i];
+        auto weight       = weights != nullptr ? weights[i] : weight_t{1.0};
+        auto minor_offset = matrix_partition.get_minor_offset_from_minor_nocheck(minor);
+        auto row          = GraphViewType::is_adj_matrix_transposed
+                     ? minor
+                     : matrix_partition.get_major_from_major_offset_nocheck(major_offset);
+        auto col = GraphViewType::is_adj_matrix_transposed
+                     ? matrix_partition.get_major_from_major_offset_nocheck(major_offset)
+                     : minor;
+        auto row_offset = GraphViewType::is_adj_matrix_transposed
+                            ? minor_offset
+                            : static_cast<vertex_t>(major_offset);
+        auto col_offset = GraphViewType::is_adj_matrix_transposed
+                            ? static_cast<vertex_t>(major_offset)
+                            : minor_offset;
+
+        auto key =
+          *(adj_matrix_row_col_key_first +
+            ((GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? major_offset
+                                                                             : minor_offset));
+        auto e_op_result = evaluate_edge_op<GraphViewType,
+                                            AdjMatrixRowValueInputIterator,
+                                            AdjMatrixColValueInputIterator,
+                                            EdgeOp>()
+                             .compute(row,
+                                      col,
+                                      weight,
+                                      *(adj_matrix_row_value_input_first + row_offset),
+                                      *(adj_matrix_col_value_input_first + col_offset),
+                                      e_op);
+
+        return thrust::make_tuple(key, e_op_result);
+      };
+
+      auto local_offset = matrix_partition.get_local_offset(major_offset);
+      thrust::transform(
+        thrust::seq,
+        thrust::make_counting_iterator(edge_t{0}),
+        thrust::make_counting_iterator(local_degree),
+        thrust::make_zip_iterator(thrust::make_tuple(keys + local_offset, values + local_offset)),
+        transform_op);
+    }
+
+    idx += gridDim.x * blockDim.x;
+  }
+}
+
+template <bool adj_matrix_row_key,
+          typename GraphViewType,
+          typename AdjMatrixRowValueInputIterator,
+          typename AdjMatrixColValueInputIterator,
+          typename VertexIterator,
+          typename EdgeOp,
+          typename T>
+std::tuple<rmm::device_uvector<typename GraphViewType::vertex_type>,
+           decltype(allocate_dataframe_buffer<T>(0, cudaStream_t{nullptr}))>
+transform_reduce_by_adj_matrix_row_col_key_e(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  VertexIterator adj_matrix_row_col_key_first,
+  EdgeOp e_op,
+  T init)
+{
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+  static_assert(std::is_same<typename std::iterator_traits<VertexIterator>::value_type,
+                             typename GraphViewType::vertex_type>::value);
+
+  using vertex_t = typename GraphViewType::vertex_type;
+  using edge_t   = typename GraphViewType::edge_type;
+  using weight_t = typename GraphViewType::weight_type;
+
+  auto loop_count = size_t{1};
+  if (GraphViewType::is_multi_gpu) {
+    auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+    auto const row_comm_size = row_comm.get_size();
+    loop_count               = graph_view.is_hypergraph_partitioned()
+                   ? graph_view.get_number_of_local_adj_matrix_partitions()
+                   : static_cast<size_t>(row_comm_size);
+  }
+
+  rmm::device_uvector<vertex_t> keys(0, handle.get_stream());
+  auto value_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
+  for (size_t i = 0; i < loop_count; ++i) {
+    matrix_partition_device_t<GraphViewType> matrix_partition(
+      graph_view, (GraphViewType::is_multi_gpu && !graph_view.is_hypergraph_partitioned()) ? 0 : i);
+
+    int comm_root_rank = 0;
+    if (GraphViewType::is_multi_gpu) {
+      auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+      auto const row_comm_rank = row_comm.get_rank();
+      auto const row_comm_size = row_comm.get_size();
+      auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+      auto const col_comm_rank = col_comm.get_rank();
+      comm_root_rank = graph_view.is_hypergraph_partitioned() ? i * row_comm_size + row_comm_rank
+                                                              : col_comm_rank * row_comm_size + i;
+    }
+
+    auto num_edges = thrust::transform_reduce(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(graph_view.get_vertex_partition_first(comm_root_rank)),
+      thrust::make_counting_iterator(graph_view.get_vertex_partition_last(comm_root_rank)),
+      [matrix_partition] __device__(auto row) {
+        auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(row);
+        return matrix_partition.get_local_degree(major_offset);
+      },
+      edge_t{0},
+      thrust::plus<edge_t>());
+
+    rmm::device_uvector<vertex_t> tmp_keys(num_edges, handle.get_stream());
+    auto tmp_value_buffer = allocate_dataframe_buffer<T>(tmp_keys.size(), handle.get_stream());
+
+    if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) {
+      raft::grid_1d_thread_t update_grid(graph_view.get_vertex_partition_size(comm_root_rank),
+                                         detail::transform_reduce_by_key_e_for_all_block_size,
+                                         handle.get_device_properties().maxGridSize[0]);
+
+      // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber
+      // vertices to insure that rows within a partition are sorted by their out-degree in
+      // decreasing order, we will apply this kernel only to low out-degree vertices.
+      detail::for_all_major_for_all_nbr_low_degree<adj_matrix_row_key>
+        <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+          matrix_partition,
+          graph_view.get_vertex_partition_first(comm_root_rank),
+          graph_view.get_vertex_partition_last(comm_root_rank),
+          adj_matrix_row_value_input_first,
+          adj_matrix_col_value_input_first,
+          adj_matrix_row_col_key_first,
+          e_op,
+          tmp_keys.data(),
+          get_dataframe_buffer_begin<T>(tmp_value_buffer));
+    }
+
+    if (GraphViewType::is_multi_gpu) {
+      auto& comm           = handle.get_comms();
+      auto const comm_size = comm.get_size();
+
+      thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                          tmp_keys.begin(),
+                          tmp_keys.end(),
+                          get_dataframe_buffer_begin<T>(tmp_value_buffer));
+
+      auto num_uniques =
+        thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                         thrust::make_counting_iterator(size_t{0}),
+                         thrust::make_counting_iterator(tmp_keys.size()),
+                         [tmp_keys = tmp_keys.data()] __device__(auto i) {
+                           return ((i == 0) || (tmp_keys[i] != tmp_keys[i - 1])) ? true : false;
+                         });
+      rmm::device_uvector<vertex_t> unique_keys(num_uniques, handle.get_stream());
+      auto value_for_unique_key_buffer =
+        allocate_dataframe_buffer<T>(unique_keys.size(), handle.get_stream());
+
+      thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                            tmp_keys.begin(),
+                            tmp_keys.end(),
+                            get_dataframe_buffer_begin<T>(tmp_value_buffer),
+                            unique_keys.begin(),
+                            get_dataframe_buffer_begin<T>(value_for_unique_key_buffer));
+
+      rmm::device_uvector<vertex_t> rx_unique_keys(0, handle.get_stream());
+      auto rx_value_for_unique_key_buffer = allocate_dataframe_buffer<T>(0, handle.get_stream());
+      std::tie(rx_unique_keys, rx_value_for_unique_key_buffer, std::ignore) =
+        groupby_gpuid_and_shuffle_kv_pairs(
+          comm,
+          unique_keys.begin(),
+          unique_keys.end(),
+          get_dataframe_buffer_begin<T>(value_for_unique_key_buffer),
+          [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(
+            auto val) { return key_func(val); },
+          handle.get_stream());
+
+      // FIXME: we can reduce after shuffle
+
+      tmp_keys         = std::move(rx_unique_keys);
+      tmp_value_buffer = std::move(rx_value_for_unique_key_buffer);
+    }
+
+    auto cur_size = keys.size();
+    // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can
+    // reserve address space to avoid expensive reallocation.
+    // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management
+    keys.resize(cur_size + tmp_keys.size(), handle.get_stream());
+    resize_dataframe_buffer<T>(value_buffer, keys.size(), handle.get_stream());
+
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 tmp_keys.begin(),
+                 tmp_keys.end(),
+                 keys.begin() + cur_size);
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 get_dataframe_buffer_begin<T>(tmp_value_buffer),
+                 get_dataframe_buffer_begin<T>(tmp_value_buffer) + tmp_keys.size(),
+                 get_dataframe_buffer_begin<T>(value_buffer) + cur_size);
+  }
+
+  if (GraphViewType::is_multi_gpu) {
+    thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                        keys.begin(),
+                        keys.end(),
+                        get_dataframe_buffer_begin<T>(value_buffer));
+
+    auto num_uniques =
+      thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                       thrust::make_counting_iterator(size_t{0}),
+                       thrust::make_counting_iterator(keys.size()),
+                       [keys = keys.data()] __device__(auto i) {
+                         return ((i == 0) || (keys[i] != keys[i - 1])) ? true : false;
+                       });
+    rmm::device_uvector<vertex_t> unique_keys(num_uniques, handle.get_stream());
+    auto value_for_unique_key_buffer =
+      allocate_dataframe_buffer<T>(unique_keys.size(), handle.get_stream());
+
+    thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                          keys.begin(),
+                          keys.end(),
+                          get_dataframe_buffer_begin<T>(value_buffer),
+                          unique_keys.begin(),
+                          get_dataframe_buffer_begin<T>(value_for_unique_key_buffer));
+
+    keys         = std::move(unique_keys);
+    value_buffer = std::move(value_for_unique_key_buffer);
+  }
+
+  // FIXME: add init
+
+  return std::make_tuple(std::move(keys), std::move(value_buffer));
+}
+
+}  // namespace detail
+
+// FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates push inidicator or
+// bucket idx with the value while EdgeOp here does not. This is inconsistent. Better be fixed.
+/**
+ * @brief Iterate over the entire set of edges and reduce @p edge_op outputs to (key, value) pairs.
+ *
+ * This function is inspired by thrust::transform_reduce() and thrust::reduce_by_key(). Keys for
+ * edges are determined by the graph adjacency matrix rows.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
+ * coincide with vertex type).
+ * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam T Type of the values in (key, value) pairs.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
+ * @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param adj_matrix_row_key_first Iterator pointing to the adjacency matrix row key for the first
+ * (inclusive) column (assigned to this process in multi-GPU). `adj_matrix_row_key_last` (exclusive)
+ * is deduced as @p adj_matrix_row_key_first + @p graph_view.get_number_of_local_adj_matrix_rows().
+ * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge
+ * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
+ * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced.
+ * @param init Initial value to be added to the value in each transform-reduced (key, value) pair.
+ * @return std::tuple Tuple of rmm::device_uvector<typename GraphView::vertex_type> and
+ * rmm::device_uvector<T> (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if
+ * T is a thrust::tuple type of arithmetic scalar types, one rmm::device_uvector object per scalar
+ * type).
+ */
+template <typename GraphViewType,
+          typename AdjMatrixRowValueInputIterator,
+          typename AdjMatrixColValueInputIterator,
+          typename VertexIterator,
+          typename EdgeOp,
+          typename T>
+auto transform_reduce_by_adj_matrix_row_key_e(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  VertexIterator adj_matrix_row_key_first,
+  EdgeOp e_op,
+  T init)
+{
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+  static_assert(std::is_same<typename std::iterator_traits<VertexIterator>::value_type,
+                             typename GraphViewType::vertex_type>::value);
+
+  return detail::transform_reduce_by_adj_matrix_row_col_key_e<true>(
+    handle,
+    graph_view,
+    adj_matrix_row_value_input_first,
+    adj_matrix_col_value_input_first,
+    adj_matrix_row_key_first,
+    e_op,
+    init);
+}
+
+// FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates push inidicator or
+// bucket idx with the value while EdgeOp here does not. This is inconsistent. Better be fixed.
+/**
+ * @brief Iterate over the entire set of edges and reduce @p edge_op outputs to (key, value) pairs.
+ *
+ * This function is inspired by thrust::transform_reduce() and thrust::reduce_by_key(). Keys for
+ * edges are determined by the graph adjacency matrix columns.
+ *
+ * @tparam GraphViewType Type of the passed non-owning graph object.
+ * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row
+ * input properties.
+ * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column
+ * input properties.
+ * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should
+ * coincide with vertex type).
+ * @tparam EdgeOp Type of the quaternary (or quinary) edge operator.
+ * @tparam T Type of the values in (key, value) pairs.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph_view Non-owning graph object.
+ * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input
+ * properties for the first (inclusive) row (assigned to this process in multi-GPU).
+ * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first +
+ * @p graph_view.get_number_of_local_adj_matrix_partition_rows().
+ * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input
+ * properties for the first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first
+ * + @p graph_view.get_number_of_local_adj_matrix_partition_cols().
+ * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key for the
+ * first (inclusive) column (assigned to this process in multi-GPU).
+ * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p
+ * graph_view.get_number_of_local_adj_matrix_cols().
+ * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge
+ * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first +
+ * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0,
+ * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced.
+ * @param init Initial value to be added to the value in each transform-reduced (key, value) pair.
+ * @return std::tuple Tuple of rmm::device_uvector<typename GraphView::vertex_type> and
+ * rmm::device_uvector<T> (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if
+ * T is a thrust::tuple type of arithmetic scalar types, one rmm::device_uvector object per scalar
+ * type).
+ */
+template <typename GraphViewType,
+          typename AdjMatrixRowValueInputIterator,
+          typename AdjMatrixColValueInputIterator,
+          typename VertexIterator,
+          typename EdgeOp,
+          typename T>
+auto transform_reduce_by_adj_matrix_col_key_e(
+  raft::handle_t const& handle,
+  GraphViewType const& graph_view,
+  AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first,
+  AdjMatrixColValueInputIterator adj_matrix_col_value_input_first,
+  VertexIterator adj_matrix_col_key_first,
+  EdgeOp e_op,
+  T init)
+{
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+  static_assert(std::is_same<typename std::iterator_traits<VertexIterator>::value_type,
+                             typename GraphViewType::vertex_type>::value);
+
+  return detail::transform_reduce_by_adj_matrix_row_col_key_e<false>(
+    handle,
+    graph_view,
+    adj_matrix_row_value_input_first,
+    adj_matrix_col_value_input_first,
+    adj_matrix_col_key_first,
+    e_op,
+    init);
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/patterns/transform_reduce_e.cuh b/cpp/include/patterns/transform_reduce_e.cuh
index 797facd4657..946c15a16a0 100644
--- a/cpp/include/patterns/transform_reduce_e.cuh
+++ b/cpp/include/patterns/transform_reduce_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <matrix_partition_device.cuh>
 #include <patterns/edge_op_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/cudart_utils.h>
 #include <rmm/thrust_rmm_allocator.h>
diff --git a/cpp/include/patterns/transform_reduce_v.cuh b/cpp/include/patterns/transform_reduce_v.cuh
index 02538c36f47..17ffb89206a 100644
--- a/cpp/include/patterns/transform_reduce_v.cuh
+++ b/cpp/include/patterns/transform_reduce_v.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <experimental/graph_view.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh b/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh
index f5af03d647c..39aca7cacae 100644
--- a/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh
+++ b/cpp/include/patterns/transform_reduce_v_with_adj_matrix_row.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <experimental/graph_view.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh
index a2250482c68..4c76322fa79 100644
--- a/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh
+++ b/cpp/include/patterns/update_frontier_v_push_if_out_nbr.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,8 +21,9 @@
 #include <partition_manager.hpp>
 #include <patterns/edge_op_utils.cuh>
 #include <patterns/reduce_op.cuh>
-#include <utilities/comm_utils.cuh>
+#include <utilities/device_comm.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 #include <utilities/thrust_tuple_utils.cuh>
 #include <vertex_partition_device.cuh>
 
@@ -155,7 +156,7 @@ size_t reduce_buffer_elements(raft::handle_t const& handle,
     // FIXME: actually, we can find how many unique keys are here by now.
     // FIXME: if GraphViewType::is_multi_gpu is true, this should be executed on the GPU holding the
     // vertex unless reduce_op is a pure function.
-    rmm::device_vector<key_t> keys(num_buffer_elements);
+    rmm::device_uvector<key_t> keys(num_buffer_elements, handle.get_stream());
     rmm::device_vector<payload_t> values(num_buffer_elements);
     auto it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
                                     buffer_key_output_first,
@@ -175,9 +176,10 @@ size_t reduce_buffer_elements(raft::handle_t const& handle,
                  values.begin(),
                  values.begin() + num_reduced_buffer_elements,
                  buffer_payload_output_first);
-    CUDA_TRY(cudaStreamSynchronize(
-      handle.get_stream()));  // this is necessary as kyes & values will become out-of-scope once
-                              // this function returns
+    // FIXME: this is unecessary if we use a tuple of rmm::device_uvector objects for values
+    CUDA_TRY(
+      cudaStreamSynchronize(handle.get_stream()));  // this is necessary as values will become
+                                                    // out-of-scope once this function returns
     return num_reduced_buffer_elements;
   }
 }
@@ -400,7 +402,7 @@ void update_frontier_v_push_if_out_nbr(
       frontier_size = thrust::distance(vertex_first, vertex_last);
     }
 
-    edge_t max_pushes =
+    auto max_pushes =
       frontier_size > 0
         ? frontier_rows.size() > 0
             ? thrust::transform_reduce(
diff --git a/cpp/include/patterns/vertex_frontier.cuh b/cpp/include/patterns/vertex_frontier.cuh
index ccb9e1a5a0d..2126a27ee5a 100644
--- a/cpp/include/patterns/vertex_frontier.cuh
+++ b/cpp/include/patterns/vertex_frontier.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,8 @@
  */
 #pragma once
 
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 #include <utilities/thrust_tuple_utils.cuh>
 
 #include <raft/cudart_utils.h>
diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp
index cd621a516ea..e94190897b8 100644
--- a/cpp/include/utilities/cython.hpp
+++ b/cpp/include/utilities/cython.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <experimental/graph.hpp>
 #include <graph.hpp>
 #include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
 
 namespace cugraph {
 namespace cython {
@@ -109,6 +110,197 @@ struct graph_container_t {
   experimental::graph_properties_t graph_props;
 };
 
+/**
+ * @brief     Owning struct. Allows returning multiple edge lists and edge offsets.
+ *            cython only
+ *
+ * @param  number_of_vertices    The total number of vertices
+ * @param  number_of_edges       The total number of edges (number of elements in src_indices,
+ dst_indices and edge_data)
+ * @param  number_of_subgraph    The number of subgraphs, number of elements in subgraph_offsets - 1
+ * @param  source_indices        This array of size E (number of edges) contains
+ * the index of the
+ * source for each edge. Indices must be in the range [0, V-1].
+ * @param  destination_indices   This array of size E (number of edges) contains
+ * the index of the
+ * destination for each edge. Indices must be in the range [0, V-1].
+ * @param  edge_data             This array size E (number of edges) contains
+ * the weight for each
+ * edge.  This array can be null in which case the graph is considered
+ * unweighted.
+ * @param  subgraph_offsets            This array size number_of_subgraph + 1 contains edge offsets
+ for each subgraph
+
+
+ */
+struct cy_multi_edgelists_t {
+  size_t number_of_vertices;
+  size_t number_of_edges;
+  size_t number_of_subgraph;
+  std::unique_ptr<rmm::device_buffer> src_indices;
+  std::unique_ptr<rmm::device_buffer> dst_indices;
+  std::unique_ptr<rmm::device_buffer> edge_data;
+  std::unique_ptr<rmm::device_buffer> subgraph_offsets;
+};
+
+// replacement for std::tuple<,,>, since std::tuple is not
+// supported in cython
+//
+template <typename vertex_t, typename weight_t>
+struct major_minor_weights_t {
+  explicit major_minor_weights_t(raft::handle_t const& handle)
+    : shuffled_major_vertices_(0, handle.get_stream()),
+      shuffled_minor_vertices_(0, handle.get_stream()),
+      shuffled_weights_(0, handle.get_stream())
+  {
+  }
+  rmm::device_uvector<vertex_t>& get_major(void) { return shuffled_major_vertices_; }
+
+  rmm::device_uvector<vertex_t>& get_minor(void) { return shuffled_minor_vertices_; }
+
+  rmm::device_uvector<weight_t>& get_weights(void) { return shuffled_weights_; }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_major_wrap(
+    void)  // const: triggers errors in Cython autogen-ed C++
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_major_vertices_.release()),
+                          sizeof(vertex_t));
+  }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_minor_wrap(void)  // const
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_minor_vertices_.release()),
+                          sizeof(vertex_t));
+  }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_weights_wrap(void)  // const
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(shuffled_weights_.release()),
+                          sizeof(weight_t));
+  }
+
+ private:
+  rmm::device_uvector<vertex_t> shuffled_major_vertices_;
+  rmm::device_uvector<vertex_t> shuffled_minor_vertices_;
+  rmm::device_uvector<weight_t> shuffled_weights_;
+};
+
+// wrapper for renumber_edgelist() return
+// (unrenumbering maps, etc.)
+//
+template <typename vertex_t, typename edge_t>
+struct renum_quad_t {
+  explicit renum_quad_t(raft::handle_t const& handle)
+    : dv_(0, handle.get_stream()), part_(std::vector<vertex_t>(), false, 0, 0, 0, 0)
+  {
+  }
+
+  rmm::device_uvector<vertex_t>& get_dv(void) { return dv_; }
+
+  std::pair<std::unique_ptr<rmm::device_buffer>, size_t> get_dv_wrap(
+    void)  // const: see above explanation
+  {
+    return std::make_pair(std::make_unique<rmm::device_buffer>(dv_.release()), sizeof(vertex_t));
+  }
+
+  cugraph::experimental::partition_t<vertex_t>& get_partition(void) { return part_; }
+  vertex_t& get_num_vertices(void) { return nv_; }
+  edge_t& get_num_edges(void) { return ne_; }
+
+  // `partition_t` pass-through getters
+  //
+  int get_part_row_size() const { return part_.get_row_size(); }
+
+  int get_part_col_size() const { return part_.get_col_size(); }
+
+  int get_part_comm_rank() const { return part_.get_comm_rank(); }
+
+  // FIXME: part_.get_vertex_partition_offsets() returns a std::vector
+  //
+  std::unique_ptr<std::vector<vertex_t>> get_partition_offsets(void)  // const
+  {
+    return std::make_unique<std::vector<vertex_t>>(part_.get_vertex_partition_offsets());
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_local_vertex_range() const
+  {
+    auto tpl_v = part_.get_local_vertex_range();
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_local_vertex_first() const { return part_.get_local_vertex_first(); }
+
+  vertex_t get_part_local_vertex_last() const { return part_.get_local_vertex_last(); }
+
+  std::pair<vertex_t, vertex_t> get_part_vertex_partition_range(size_t vertex_partition_idx) const
+  {
+    auto tpl_v = part_.get_vertex_partition_range(vertex_partition_idx);
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_first(vertex_partition_idx);
+  }
+
+  vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_last(vertex_partition_idx);
+  }
+
+  vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx) const
+  {
+    return part_.get_vertex_partition_size(vertex_partition_idx);
+  }
+
+  size_t get_part_number_of_matrix_partitions() const
+  {
+    return part_.get_number_of_matrix_partitions();
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_matrix_partition_major_range(size_t partition_idx) const
+  {
+    auto tpl_v = part_.get_matrix_partition_major_range(partition_idx);
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_matrix_partition_major_first(size_t partition_idx) const
+  {
+    return part_.get_matrix_partition_major_first(partition_idx);
+  }
+
+  vertex_t get_part_matrix_partition_major_last(size_t partition_idx) const
+  {
+    return part_.get_matrix_partition_major_last(partition_idx);
+  }
+
+  vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx) const
+  {
+    return part_.get_part_matrix_partition_major_value_start_offset(partition_idx);
+  }
+
+  std::pair<vertex_t, vertex_t> get_part_matrix_partition_minor_range() const
+  {
+    auto tpl_v = part_.get_matrix_partition_minor_range();
+    return std::make_pair(std::get<0>(tpl_v), std::get<1>(tpl_v));
+  }
+
+  vertex_t get_part_matrix_partition_minor_first() const
+  {
+    return part_.get_matrix_partition_minor_first();
+  }
+
+  vertex_t get_part_matrix_partition_minor_last() const
+  {
+    return part_.get_matrix_partition_minor_last();
+  }
+
+ private:
+  rmm::device_uvector<vertex_t> dv_;
+  cugraph::experimental::partition_t<vertex_t> part_;
+  vertex_t nv_;
+  edge_t ne_;
+};
 // FIXME: finish description for vertex_partition_offsets
 //
 // Factory function for populating an empty graph container with a new graph
@@ -246,6 +438,38 @@ void call_sssp(raft::handle_t const& handle,
                vertex_t* predecessors,
                const vertex_t source_vertex);
 
+// Wrapper for calling egonet through a graph container
+template <typename vertex_t, typename weight_t>
+std::unique_ptr<cy_multi_edgelists_t> call_egonet(raft::handle_t const& handle,
+                                                  graph_container_t const& graph_container,
+                                                  vertex_t* source_vertex,
+                                                  vertex_t n_subgraphs,
+                                                  vertex_t radius);
+
+// wrapper for shuffling:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::unique_ptr<major_minor_weights_t<vertex_t, weight_t>> call_shuffle(
+  raft::handle_t const& handle,
+  vertex_t*
+    edgelist_major_vertices,  // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
+  vertex_t* edgelist_minor_vertices,  // [IN / OUT]
+  weight_t* edgelist_weights,         // [IN / OUT]
+  edge_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);  // = false
+
+// Wrapper for calling renumber_edeglist() inplace:
+//
+template <typename vertex_t, typename edge_t>
+std::unique_ptr<renum_quad_t<vertex_t, edge_t>> call_renumber(
+  raft::handle_t const& handle,
+  vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool is_hypergraph_partitioned,
+  bool do_expensive_check,
+  bool multi_gpu);
+
 // Helper for setting up subcommunicators, typically called as part of the
 // user-initiated comms initialization in Python.
 //
diff --git a/cpp/include/utilities/dataframe_buffer.cuh b/cpp/include/utilities/dataframe_buffer.cuh
new file mode 100644
index 00000000000..06352b8e217
--- /dev/null
+++ b/cpp/include/utilities/dataframe_buffer.cuh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <utilities/thrust_tuple_utils.cuh>
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+template <typename TupleType, size_t I>
+auto allocate_dataframe_buffer_tuple_element_impl(size_t buffer_size, cudaStream_t stream)
+{
+  using element_t = typename thrust::tuple_element<I, TupleType>::type;
+  return rmm::device_uvector<element_t>(buffer_size, stream);
+}
+
+template <typename TupleType, size_t... Is>
+auto allocate_dataframe_buffer_tuple_impl(std::index_sequence<Is...>,
+                                          size_t buffer_size,
+                                          cudaStream_t stream)
+{
+  return std::make_tuple(
+    allocate_dataframe_buffer_tuple_element_impl<TupleType, Is>(buffer_size, stream)...);
+}
+
+template <typename TupleType, typename BufferType, size_t I, size_t N>
+void resize_dataframe_buffer_tuple_element_impl(BufferType& buffer,
+                                                size_t new_buffer_size,
+                                                cudaStream_t stream)
+{
+  std::get<I>(buffer).resize(new_buffer_size, stream);
+  resize_dataframe_buffer_tuple_element_impl<TupleType, BufferType, I + 1, N>(
+    buffer, new_buffer_size, stream);
+}
+
+template <typename TupleType, typename BufferType, size_t I>
+void resize_dataframe_buffer_tuple_impl(BufferType& buffer,
+                                        size_t new_buffer_size,
+                                        cudaStream_t stream)
+{
+}
+
+template <typename TupleType, size_t I, typename BufferType>
+auto get_dataframe_buffer_begin_tuple_element_impl(BufferType& buffer)
+{
+  using element_t = typename thrust::tuple_element<I, TupleType>::type;
+  return std::get<I>(buffer).begin();
+}
+
+template <typename TupleType, size_t... Is, typename BufferType>
+auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence<Is...>, BufferType& buffer)
+{
+  // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator.
+  return thrust::make_tuple(
+    get_dataframe_buffer_begin_tuple_element_impl<TupleType, Is>(buffer)...);
+}
+
+}  // namespace detail
+
+template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+auto allocate_dataframe_buffer(size_t buffer_size, cudaStream_t stream)
+{
+  return rmm::device_uvector<T>(buffer_size, stream);
+}
+
+template <typename T, typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
+auto allocate_dataframe_buffer(size_t buffer_size, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  return detail::allocate_dataframe_buffer_tuple_impl<T>(
+    std::make_index_sequence<tuple_size>(), buffer_size, stream);
+}
+
+template <typename T,
+          typename BufferType,
+          typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+void resize_dataframe_buffer(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream)
+{
+  buffer.resize(new_buffer_size, stream);
+}
+
+template <typename T,
+          typename BufferType,
+          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
+void resize_dataframe_buffer(BufferType& buffer, size_t new_buffer_size, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  detail::resize_dataframe_buffer_tuple_impl<T, BufferType, size_t{0}, tuple_size>(
+    buffer, new_buffer_size, stream);
+}
+
+template <typename T,
+          typename BufferType,
+          typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+auto get_dataframe_buffer_begin(BufferType& buffer)
+{
+  return buffer.begin();
+}
+
+template <typename T,
+          typename BufferType,
+          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
+auto get_dataframe_buffer_begin(BufferType& buffer)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  return thrust::make_zip_iterator(detail::get_dataframe_buffer_begin_tuple_impl<T>(
+    std::make_index_sequence<tuple_size>(), buffer));
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/utilities/comm_utils.cuh b/cpp/include/utilities/device_comm.cuh
similarity index 75%
rename from cpp/include/utilities/comm_utils.cuh
rename to cpp/include/utilities/device_comm.cuh
index fb69fff49c9..8c3b0f86a47 100644
--- a/cpp/include/utilities/comm_utils.cuh
+++ b/cpp/include/utilities/device_comm.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,6 @@
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/detail/normal_iterator.h>
 
-#include <numeric>
 #include <type_traits>
 
 namespace cugraph {
@@ -32,66 +31,6 @@ namespace experimental {
 
 namespace detail {
 
-template <typename TupleType, size_t I, size_t N>
-struct update_vector_of_tuple_scalar_elements_from_tuple_impl {
-  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const
-  {
-    using element_t = typename thrust::tuple_element<I, TupleType>::type;
-    static_assert(sizeof(element_t) <= sizeof(int64_t));
-    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
-    *ptr     = thrust::get<I>(tuple);
-    update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I + 1, N>().update(
-      tuple_scalar_elements, tuple);
-  }
-};
-
-template <typename TupleType, size_t I>
-struct update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I, I> {
-  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const { return; }
-};
-
-template <typename TupleType, size_t I, size_t N>
-struct update_tuple_from_vector_of_tuple_scalar_elements_impl {
-  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const
-  {
-    using element_t = typename thrust::tuple_element<I, TupleType>::type;
-    static_assert(sizeof(element_t) <= sizeof(int64_t));
-    auto ptr              = reinterpret_cast<element_t const*>(tuple_scalar_elements.data() + I);
-    thrust::get<I>(tuple) = *ptr;
-    update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I + 1, N>().update(
-      tuple, tuple_scalar_elements);
-  }
-};
-
-template <typename TupleType, size_t I>
-struct update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I, I> {
-  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const { return; }
-};
-
-template <typename TupleType, size_t I, size_t N>
-struct host_allreduce_tuple_scalar_element_impl {
-  void run(raft::comms::comms_t const& comm,
-           rmm::device_uvector<int64_t>& tuple_scalar_elements,
-           cudaStream_t stream) const
-  {
-    using element_t = typename thrust::tuple_element<I, TupleType>::type;
-    static_assert(sizeof(element_t) <= sizeof(int64_t));
-    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
-    comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream);
-    host_allreduce_tuple_scalar_element_impl<TupleType, I + 1, N>().run(
-      comm, tuple_scalar_elements, stream);
-  }
-};
-
-template <typename TupleType, size_t I>
-struct host_allreduce_tuple_scalar_element_impl<TupleType, I, I> {
-  void run(raft::comms::comms_t const& comm,
-           rmm::device_uvector<int64_t>& tuple_scalar_elements,
-           cudaStream_t stream) const
-  {
-  }
-};
-
 template <typename T>
 T* iter_to_raw_ptr(T* ptr)
 {
@@ -621,183 +560,88 @@ struct device_allgatherv_tuple_iterator_element_impl<InputIterator, OutputIterat
   }
 };
 
-template <typename TupleType, size_t I>
-auto allocate_comm_buffer_tuple_element_impl(size_t buffer_size, cudaStream_t stream)
-{
-  using element_t = typename thrust::tuple_element<I, TupleType>::type;
-  return rmm::device_uvector<element_t>(buffer_size, stream);
-}
-
-template <typename TupleType, size_t... Is>
-auto allocate_comm_buffer_tuple_impl(std::index_sequence<Is...>,
-                                     size_t buffer_size,
-                                     cudaStream_t stream)
-{
-  return thrust::make_tuple(
-    allocate_comm_buffer_tuple_element_impl<TupleType, Is>(buffer_size, stream)...);
-}
-
-template <typename TupleType, size_t I, typename BufferType>
-auto get_comm_buffer_begin_tuple_element_impl(BufferType& buffer)
-{
-  using element_t = typename thrust::tuple_element<I, TupleType>::type;
-  return thrust::get<I>(buffer).begin();
-}
-
-template <typename TupleType, size_t... Is, typename BufferType>
-auto get_comm_buffer_begin_tuple_impl(std::index_sequence<Is...>, BufferType& buffer)
-{
-  return thrust::make_tuple(get_comm_buffer_begin_tuple_element_impl<TupleType, Is>(buffer)...);
-}
-
-}  // namespace detail
-
-template <typename T>
-std::enable_if_t<std::is_arithmetic<T>::value, T> host_scalar_allreduce(
-  raft::comms::comms_t const& comm, T input, cudaStream_t stream)
-{
-  rmm::device_uvector<T> d_input(1, stream);
-  raft::update_device(d_input.data(), &input, 1, stream);
-  comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream);
-  T h_input{};
-  raft::update_host(&h_input, d_input.data(), 1, stream);
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-  return h_input;
-}
-
-template <typename T>
-std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, T>
-host_scalar_allreduce(raft::comms::comms_t const& comm, T input, cudaStream_t stream)
+template <typename InputIterator, typename OutputIterator>
+std::enable_if_t<thrust::detail::is_discard_iterator<OutputIterator>::value, void>
+device_gatherv_impl(raft::comms::comms_t const& comm,
+                    InputIterator input_first,
+                    OutputIterator output_first,
+                    size_t sendcount,
+                    std::vector<size_t> const& recvcounts,
+                    std::vector<size_t> const& displacements,
+                    int root,
+                    cudaStream_t stream)
 {
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
-  rmm::device_uvector<int64_t> d_tuple_scalar_elements(tuple_size, stream);
-  T ret{};
-
-  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
-    h_tuple_scalar_elements, input);
-  raft::update_device(
-    d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream);
-  detail::host_allreduce_tuple_scalar_element_impl<T, size_t{0}, tuple_size>().run(
-    comm, d_tuple_scalar_elements, stream);
-  raft::update_host(
-    h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream);
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-  detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>().update(
-    ret, h_tuple_scalar_elements);
-
-  return ret;
+  // no-op
 }
 
-template <typename T>
-std::enable_if_t<std::is_arithmetic<T>::value, T> host_scalar_bcast(
-  raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+template <typename InputIterator, typename OutputIterator>
+std::enable_if_t<
+  std::is_arithmetic<typename std::iterator_traits<OutputIterator>::value_type>::value,
+  void>
+device_gatherv_impl(raft::comms::comms_t const& comm,
+                    InputIterator input_first,
+                    OutputIterator output_first,
+                    size_t sendcount,
+                    std::vector<size_t> const& recvcounts,
+                    std::vector<size_t> const& displacements,
+                    int root,
+                    cudaStream_t stream)
 {
-  rmm::device_uvector<T> d_input(1, stream);
-  if (comm.get_rank() == root) { raft::update_device(d_input.data(), &input, 1, stream); }
-  comm.bcast(d_input.data(), 1, root, stream);
-  auto h_input = input;
-  if (comm.get_rank() != root) { raft::update_host(&h_input, d_input.data(), 1, stream); }
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-  return h_input;
+  static_assert(std::is_same<typename std::iterator_traits<InputIterator>::value_type,
+                             typename std::iterator_traits<OutputIterator>::value_type>::value);
+  // FIXME: should be enabled once the RAFT gather & gatherv PR is merged
+#if 1
+  CUGRAPH_FAIL("Unimplemented.");
+#else
+  comm.gatherv(iter_to_raw_ptr(input_first),
+               iter_to_raw_ptr(output_first),
+               sendcount,
+               recvcounts.data(),
+               displacements.data(),
+               root,
+               stream);
+#endif
 }
 
-template <typename T>
-std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, T>
-host_scalar_bcast(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
-{
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
-  rmm::device_uvector<int64_t> d_tuple_scalar_elements(tuple_size, stream);
-  auto ret = input;
-
-  if (comm.get_rank() == root) {
-    detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>()
-      .update(h_tuple_scalar_elements, input);
-    raft::update_device(
-      d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream);
-  }
-  comm.bcast(d_tuple_scalar_elements.data(), d_tuple_scalar_elements.size(), root, stream);
-  if (comm.get_rank() != root) {
-    raft::update_host(
-      h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream);
-  }
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-  if (comm.get_rank() != root) {
-    detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
-      .update(ret, h_tuple_scalar_elements);
+template <typename InputIterator, typename OutputIterator, size_t I, size_t N>
+struct device_gatherv_tuple_iterator_element_impl {
+  void run(raft::comms::comms_t const& comm,
+           InputIterator input_first,
+           OutputIterator output_first,
+           size_t sendcount,
+           std::vector<size_t> const& recvcounts,
+           std::vector<size_t> const& displacements,
+           int root,
+           cudaStream_t stream) const
+  {
+    device_gatherv_impl(comm,
+                        thrust::get<I>(input_first.get_iterator_tuple()),
+                        thrust::get<I>(output_first.get_iterator_tuple()),
+                        sendcount,
+                        recvcounts,
+                        displacements,
+                        root,
+                        stream);
+    device_gatherv_tuple_iterator_element_impl<InputIterator, OutputIterator, I + 1, N>().run(
+      comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream);
   }
+};
 
-  return ret;
-}
-
-template <typename T>
-std::enable_if_t<std::is_arithmetic<T>::value, std::vector<T>> host_scalar_allgather(
-  raft::comms::comms_t const& comm, T input, cudaStream_t stream)
-{
-  std::vector<size_t> rx_counts(comm.get_size(), size_t{1});
-  std::vector<size_t> displacements(rx_counts.size(), size_t{0});
-  std::iota(displacements.begin(), displacements.end(), size_t{0});
-  rmm::device_uvector<T> d_outputs(rx_counts.size(), stream);
-  raft::update_device(d_outputs.data() + comm.get_rank(), &input, 1, stream);
-  comm.allgatherv(d_outputs.data() + comm.get_rank(),
-                  d_outputs.data(),
-                  rx_counts.data(),
-                  displacements.data(),
-                  stream);
-  std::vector<T> h_outputs(rx_counts.size(), size_t{0});
-  raft::update_host(h_outputs.data(), d_outputs.data(), rx_counts.size(), stream);
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-  return h_outputs;
-}
-
-template <typename T>
-std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, std::vector<T>>
-host_scalar_allgather(raft::comms::comms_t const& comm, T input, cudaStream_t stream)
-{
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  std::vector<size_t> rx_counts(comm.get_size(), tuple_size);
-  std::vector<size_t> displacements(rx_counts.size(), size_t{0});
-  for (size_t i = 0; i < displacements.size(); ++i) { displacements[i] = i * tuple_size; }
-  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
-  rmm::device_uvector<int64_t> d_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size,
-                                                                   stream);
-
-  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
-    h_tuple_scalar_elements, input);
-  raft::update_device(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size,
-                      h_tuple_scalar_elements.data(),
-                      tuple_size,
-                      stream);
-  comm.allgatherv(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size,
-                  d_allgathered_tuple_scalar_elements.data(),
-                  rx_counts.data(),
-                  displacements.data(),
-                  stream);
-  std::vector<int64_t> h_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size);
-  raft::update_host(h_allgathered_tuple_scalar_elements.data(),
-                    d_allgathered_tuple_scalar_elements.data(),
-                    comm.get_size() * tuple_size,
-                    stream);
-  auto status = comm.sync_stream(stream);
-  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
-
-  std::vector<T> ret(comm.get_size());
-  for (size_t i = 0; i < ret.size(); ++i) {
-    std::vector<int64_t> h_tuple_scalar_elements(
-      h_allgathered_tuple_scalar_elements.data() + i * tuple_size,
-      h_allgathered_tuple_scalar_elements.data() + (i + 1) * tuple_size);
-    detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
-      .update(ret[i], h_tuple_scalar_elements);
+template <typename InputIterator, typename OutputIterator, size_t I>
+struct device_gatherv_tuple_iterator_element_impl<InputIterator, OutputIterator, I, I> {
+  void run(raft::comms::comms_t const& comm,
+           InputIterator input_first,
+           OutputIterator output_first,
+           size_t sendcount,
+           std::vector<size_t> const& recvcounts,
+           std::vector<size_t> const& displacements,
+           int root,
+           cudaStream_t stream) const
+  {
   }
+};
 
-  return ret;
-}
+}  // namespace detail
 
 template <typename InputIterator, typename OutputIterator>
 std::enable_if_t<
@@ -1114,36 +958,49 @@ device_allgatherv(raft::comms::comms_t const& comm,
     .run(comm, input_first, output_first, recvcounts, displacements, stream);
 }
 
-template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream)
+template <typename InputIterator, typename OutputIterator>
+std::enable_if_t<
+  std::is_arithmetic<typename std::iterator_traits<InputIterator>::value_type>::value,
+  void>
+device_gatherv(raft::comms::comms_t const& comm,
+               InputIterator input_first,
+               OutputIterator output_first,
+               size_t sendcount,
+               std::vector<size_t> const& recvcounts,
+               std::vector<size_t> const& displacements,
+               int root,
+               cudaStream_t stream)
 {
-  return rmm::device_uvector<T>(buffer_size, stream);
+  detail::device_gatherv_impl(
+    comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream);
 }
 
-template <typename T, typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
-auto allocate_comm_buffer(size_t buffer_size, cudaStream_t stream)
+template <typename InputIterator, typename OutputIterator>
+std::enable_if_t<
+  is_thrust_tuple_of_arithmetic<typename std::iterator_traits<InputIterator>::value_type>::value &&
+    is_thrust_tuple<typename std::iterator_traits<OutputIterator>::value_type>::value,
+  void>
+device_gatherv(raft::comms::comms_t const& comm,
+               InputIterator input_first,
+               OutputIterator output_first,
+               size_t sendcount,
+               std::vector<size_t> const& recvcounts,
+               std::vector<size_t> const& displacements,
+               int root,
+               cudaStream_t stream)
 {
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  return detail::allocate_comm_buffer_tuple_impl<T>(
-    std::make_index_sequence<tuple_size>(), buffer_size, stream);
-}
+  static_assert(
+    thrust::tuple_size<typename thrust::iterator_traits<InputIterator>::value_type>::value ==
+    thrust::tuple_size<typename thrust::iterator_traits<OutputIterator>::value_type>::value);
 
-template <typename T,
-          typename BufferType,
-          typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-auto get_comm_buffer_begin(BufferType& buffer)
-{
-  return buffer.begin();
-}
+  size_t constexpr tuple_size =
+    thrust::tuple_size<typename thrust::iterator_traits<InputIterator>::value_type>::value;
 
-template <typename T,
-          typename BufferType,
-          typename std::enable_if_t<is_thrust_tuple_of_arithmetic<T>::value>* = nullptr>
-auto get_comm_buffer_begin(BufferType& buffer)
-{
-  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
-  return thrust::make_zip_iterator(
-    detail::get_comm_buffer_begin_tuple_impl<T>(std::make_index_sequence<tuple_size>(), buffer));
+  detail::device_allgatherv_tuple_iterator_element_impl<InputIterator,
+                                                        OutputIterator,
+                                                        size_t{0},
+                                                        tuple_size>()
+    .run(comm, input_first, output_first, sendcount, recvcounts, displacements, root, stream);
 }
 
 }  // namespace experimental
diff --git a/cpp/include/utilities/host_scalar_comm.cuh b/cpp/include/utilities/host_scalar_comm.cuh
new file mode 100644
index 00000000000..dda0ce1f091
--- /dev/null
+++ b/cpp/include/utilities/host_scalar_comm.cuh
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <utilities/thrust_tuple_utils.cuh>
+
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <numeric>
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+template <typename TupleType, size_t I, size_t N>
+struct update_vector_of_tuple_scalar_elements_from_tuple_impl {
+  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
+    *ptr     = thrust::get<I>(tuple);
+    update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I + 1, N>().update(
+      tuple_scalar_elements, tuple);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct update_vector_of_tuple_scalar_elements_from_tuple_impl<TupleType, I, I> {
+  void update(std::vector<int64_t>& tuple_scalar_elements, TupleType const& tuple) const { return; }
+};
+
+template <typename TupleType, size_t I, size_t N>
+struct update_tuple_from_vector_of_tuple_scalar_elements_impl {
+  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr              = reinterpret_cast<element_t const*>(tuple_scalar_elements.data() + I);
+    thrust::get<I>(tuple) = *ptr;
+    update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I + 1, N>().update(
+      tuple, tuple_scalar_elements);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct update_tuple_from_vector_of_tuple_scalar_elements_impl<TupleType, I, I> {
+  void update(TupleType& tuple, std::vector<int64_t> const& tuple_scalar_elements) const { return; }
+};
+
+template <typename TupleType, size_t I, size_t N>
+struct host_allreduce_tuple_scalar_element_impl {
+  void run(raft::comms::comms_t const& comm,
+           rmm::device_uvector<int64_t>& tuple_scalar_elements,
+           cudaStream_t stream) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
+    comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream);
+    host_allreduce_tuple_scalar_element_impl<TupleType, I + 1, N>().run(
+      comm, tuple_scalar_elements, stream);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct host_allreduce_tuple_scalar_element_impl<TupleType, I, I> {
+  void run(raft::comms::comms_t const& comm,
+           rmm::device_uvector<int64_t>& tuple_scalar_elements,
+           cudaStream_t stream) const
+  {
+  }
+};
+
+template <typename TupleType, size_t I, size_t N>
+struct host_reduce_tuple_scalar_element_impl {
+  void run(raft::comms::comms_t const& comm,
+           rmm::device_uvector<int64_t>& tuple_scalar_elements,
+           int root,
+           cudaStream_t stream) const
+  {
+    using element_t = typename thrust::tuple_element<I, TupleType>::type;
+    static_assert(sizeof(element_t) <= sizeof(int64_t));
+    auto ptr = reinterpret_cast<element_t*>(tuple_scalar_elements.data() + I);
+    comm.reduce(ptr, ptr, 1, raft::comms::op_t::SUM, root, stream);
+    host_reduce_tuple_scalar_element_impl<TupleType, I + 1, N>().run(
+      comm, tuple_scalar_elements, root, stream);
+  }
+};
+
+template <typename TupleType, size_t I>
+struct host_reduce_tuple_scalar_element_impl<TupleType, I, I> {
+  void run(raft::comms::comms_t const& comm,
+           rmm::device_uvector<int64_t>& tuple_scalar_elements,
+           int root,
+           cudaStream_t stream) const
+  {
+  }
+};
+
+}  // namespace detail
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value, T> host_scalar_allreduce(
+  raft::comms::comms_t const& comm, T input, cudaStream_t stream)
+{
+  rmm::device_uvector<T> d_input(1, stream);
+  raft::update_device(d_input.data(), &input, 1, stream);
+  comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream);
+  T h_input{};
+  raft::update_host(&h_input, d_input.data(), 1, stream);
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  return h_input;
+}
+
+template <typename T>
+std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, T>
+host_scalar_allreduce(raft::comms::comms_t const& comm, T input, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
+  rmm::device_uvector<int64_t> d_tuple_scalar_elements(tuple_size, stream);
+  T ret{};
+
+  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
+    h_tuple_scalar_elements, input);
+  raft::update_device(
+    d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream);
+  detail::host_allreduce_tuple_scalar_element_impl<T, size_t{0}, tuple_size>().run(
+    comm, d_tuple_scalar_elements, stream);
+  raft::update_host(
+    h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream);
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>().update(
+    ret, h_tuple_scalar_elements);
+
+  return ret;
+}
+
+// Return value is valid only in root (return value may better be std::optional in C++17 or later)
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value, T> host_scalar_reduce(
+  raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  rmm::device_uvector<T> d_input(1, stream);
+  raft::update_device(d_input.data(), &input, 1, stream);
+  comm.reduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream);
+  T h_input{};
+  if (comm.get_rank() == root) { raft::update_host(&h_input, d_input.data(), 1, stream); }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  return h_input;
+}
+
+// Return value is valid only in root (return value may better be std::optional in C++17 or later)
+template <typename T>
+std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, T>
+host_scalar_reduce(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
+  rmm::device_uvector<int64_t> d_tuple_scalar_elements(tuple_size, stream);
+  T ret{};
+
+  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
+    h_tuple_scalar_elements, input);
+  raft::update_device(
+    d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream);
+  detail::host_reduce_tuple_scalar_element_impl<T, size_t{0}, tuple_size>().run(
+    comm, d_tuple_scalar_elements, root, stream);
+  if (comm.get_rank() == root) {
+    raft::update_host(
+      h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream);
+  }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  if (comm.get_rank() == root) {
+    detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
+      .update(ret, h_tuple_scalar_elements);
+  }
+
+  return ret;
+}
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value, T> host_scalar_bcast(
+  raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  rmm::device_uvector<T> d_input(1, stream);
+  if (comm.get_rank() == root) { raft::update_device(d_input.data(), &input, 1, stream); }
+  comm.bcast(d_input.data(), 1, root, stream);
+  auto h_input = input;
+  if (comm.get_rank() != root) { raft::update_host(&h_input, d_input.data(), 1, stream); }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  return h_input;
+}
+
+template <typename T>
+std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, T>
+host_scalar_bcast(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
+  rmm::device_uvector<int64_t> d_tuple_scalar_elements(tuple_size, stream);
+  auto ret = input;
+
+  if (comm.get_rank() == root) {
+    detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>()
+      .update(h_tuple_scalar_elements, input);
+    raft::update_device(
+      d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream);
+  }
+  comm.bcast(d_tuple_scalar_elements.data(), d_tuple_scalar_elements.size(), root, stream);
+  if (comm.get_rank() != root) {
+    raft::update_host(
+      h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream);
+  }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  if (comm.get_rank() != root) {
+    detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
+      .update(ret, h_tuple_scalar_elements);
+  }
+
+  return ret;
+}
+
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value, std::vector<T>> host_scalar_allgather(
+  raft::comms::comms_t const& comm, T input, cudaStream_t stream)
+{
+  std::vector<size_t> rx_counts(comm.get_size(), size_t{1});
+  std::vector<size_t> displacements(rx_counts.size(), size_t{0});
+  std::iota(displacements.begin(), displacements.end(), size_t{0});
+  rmm::device_uvector<T> d_outputs(rx_counts.size(), stream);
+  raft::update_device(d_outputs.data() + comm.get_rank(), &input, 1, stream);
+  // FIXME: better use allgather
+  comm.allgatherv(d_outputs.data() + comm.get_rank(),
+                  d_outputs.data(),
+                  rx_counts.data(),
+                  displacements.data(),
+                  stream);
+  std::vector<T> h_outputs(rx_counts.size());
+  raft::update_host(h_outputs.data(), d_outputs.data(), rx_counts.size(), stream);
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  return h_outputs;
+}
+
+template <typename T>
+std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, std::vector<T>>
+host_scalar_allgather(raft::comms::comms_t const& comm, T input, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  std::vector<size_t> rx_counts(comm.get_size(), tuple_size);
+  std::vector<size_t> displacements(rx_counts.size(), size_t{0});
+  for (size_t i = 0; i < displacements.size(); ++i) { displacements[i] = i * tuple_size; }
+  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
+  rmm::device_uvector<int64_t> d_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size,
+                                                                   stream);
+
+  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
+    h_tuple_scalar_elements, input);
+  raft::update_device(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size,
+                      h_tuple_scalar_elements.data(),
+                      tuple_size,
+                      stream);
+  // FIXME: better use allgather
+  comm.allgatherv(d_allgathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size,
+                  d_allgathered_tuple_scalar_elements.data(),
+                  rx_counts.data(),
+                  displacements.data(),
+                  stream);
+  std::vector<int64_t> h_allgathered_tuple_scalar_elements(comm.get_size() * tuple_size);
+  raft::update_host(h_allgathered_tuple_scalar_elements.data(),
+                    d_allgathered_tuple_scalar_elements.data(),
+                    comm.get_size() * tuple_size,
+                    stream);
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+
+  std::vector<T> ret(comm.get_size());
+  for (size_t i = 0; i < ret.size(); ++i) {
+    std::vector<int64_t> h_tuple_scalar_elements(
+      h_allgathered_tuple_scalar_elements.data() + i * tuple_size,
+      h_allgathered_tuple_scalar_elements.data() + (i + 1) * tuple_size);
+    detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
+      .update(ret[i], h_tuple_scalar_elements);
+  }
+
+  return ret;
+}
+
+// Return value is valid only in root (return value may better be std::optional in C++17 or later)
+template <typename T>
+std::enable_if_t<std::is_arithmetic<T>::value, std::vector<T>> host_scalar_gather(
+  raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  rmm::device_uvector<T> d_outputs(comm.get_rank() == root ? comm.get_size() : int{1}, stream);
+  raft::update_device(
+    comm.get_rank() == root ? d_outputs.data() + comm.get_rank() : d_outputs.data(),
+    &input,
+    1,
+    stream);
+  // FIXME: should be enabled once the RAFT gather & gatherv PR is merged
+#if 1
+  CUGRAPH_FAIL("Unimplemented.");
+#else
+  comm.gather(comm.get_rank() == root ? d_outputs.data() + comm.get_rank() : d_outputs.data(),
+              d_outputs.data(),
+              size_t{1},
+              root,
+              stream);
+#endif
+  std::vector<T> h_outputs(comm.get_rank() == root ? comm.get_size() : 0);
+  if (comm.get_rank() == root) {
+    raft::update_host(h_outputs.data(), d_outputs.data(), comm.get_size(), stream);
+  }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+  return h_outputs;
+}
+
+// Return value is valid only in root (return value may better be std::optional in C++17 or later)
+template <typename T>
+std::enable_if_t<cugraph::experimental::is_thrust_tuple_of_arithmetic<T>::value, std::vector<T>>
+host_scalar_gather(raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream)
+{
+  size_t constexpr tuple_size = thrust::tuple_size<T>::value;
+  std::vector<int64_t> h_tuple_scalar_elements(tuple_size);
+  rmm::device_uvector<int64_t> d_gathered_tuple_scalar_elements(
+    comm.get_rank() == root ? comm.get_size() * tuple_size : tuple_size, stream);
+
+  detail::update_vector_of_tuple_scalar_elements_from_tuple_impl<T, size_t{0}, tuple_size>().update(
+    h_tuple_scalar_elements, input);
+  raft::update_device(comm.get_rank() == root
+                        ? d_gathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size
+                        : d_gathered_tuple_scalar_elements.data(),
+                      h_tuple_scalar_elements.data(),
+                      tuple_size,
+                      stream);
+  // FIXME: should be enabled once the RAFT gather & gatherv PR is merged
+#if 1
+  CUGRAPH_FAIL("Unimplemented.");
+#else
+  comm.gather(comm.get_rank() == root
+                ? d_gathered_tuple_scalar_elements.data() + comm.get_rank() * tuple_size
+                : d_gathered_tuple_scalar_elements.data(),
+              d_gathered_tuple_scalar_elements.data(),
+              tuple_size,
+              root,
+              stream);
+#endif
+  std::vector<int64_t> h_gathered_tuple_scalar_elements(
+    comm.get_rank() == root ? comm.get_size() * tuple_size : size_t{0});
+  if (comm.get_rank() == root) {
+    raft::update_host(h_gathered_tuple_scalar_elements.data(),
+                      d_gathered_tuple_scalar_elements.data(),
+                      comm.get_size() * tuple_size,
+                      stream);
+  }
+  auto status = comm.sync_stream(stream);
+  CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");
+
+  std::vector<T> ret(comm.get_size());
+  if (comm.get_rank() == root) {
+    for (size_t i = 0; i < ret.size(); ++i) {
+      std::vector<int64_t> h_tuple_scalar_elements(
+        h_gathered_tuple_scalar_elements.data() + i * tuple_size,
+        h_gathered_tuple_scalar_elements.data() + (i + 1) * tuple_size);
+      detail::update_tuple_from_vector_of_tuple_scalar_elements_impl<T, size_t{0}, tuple_size>()
+        .update(ret[i], h_tuple_scalar_elements);
+    }
+  }
+
+  return ret;
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/include/utilities/shuffle_comm.cuh b/cpp/include/utilities/shuffle_comm.cuh
new file mode 100644
index 00000000000..7e04c7e1972
--- /dev/null
+++ b/cpp/include/utilities/shuffle_comm.cuh
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <utilities/dataframe_buffer.cuh>
+#include <utilities/device_comm.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+template <typename ValueIterator, typename ValueToGPUIdOp>
+rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,
+                                           ValueIterator tx_value_first /* [INOUT */,
+                                           ValueIterator tx_value_last /* [INOUT */,
+                                           ValueToGPUIdOp value_to_gpu_id_op,
+                                           cudaStream_t stream)
+{
+  auto const comm_size = comm.get_size();
+
+  thrust::sort(rmm::exec_policy(stream)->on(stream),
+               tx_value_first,
+               tx_value_last,
+               [value_to_gpu_id_op] __device__(auto lhs, auto rhs) {
+                 return value_to_gpu_id_op(lhs) < value_to_gpu_id_op(rhs);
+               });
+
+  auto gpu_id_first = thrust::make_transform_iterator(
+    tx_value_first,
+    [value_to_gpu_id_op] __device__(auto value) { return value_to_gpu_id_op(value); });
+  rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
+  rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
+  auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
+                                    gpu_id_first,
+                                    gpu_id_first + thrust::distance(tx_value_first, tx_value_last),
+                                    thrust::make_constant_iterator(size_t{1}),
+                                    d_tx_dst_ranks.begin(),
+                                    d_tx_value_counts.begin());
+  if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
+    rmm::device_uvector<size_t> d_counts(comm_size, stream);
+    thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
+    thrust::scatter(rmm::exec_policy(stream)->on(stream),
+                    d_tx_value_counts.begin(),
+                    thrust::get<1>(last),
+                    d_tx_dst_ranks.begin(),
+                    d_counts.begin());
+    d_tx_value_counts = std::move(d_counts);
+  }
+
+  return std::move(d_tx_value_counts);
+}
+
+template <typename VertexIterator, typename ValueIterator, typename KeyToGPUIdOp>
+rmm::device_uvector<size_t> sort_and_count(raft::comms::comms_t const &comm,
+                                           VertexIterator tx_key_first /* [INOUT */,
+                                           VertexIterator tx_key_last /* [INOUT */,
+                                           ValueIterator tx_value_first /* [INOUT */,
+                                           KeyToGPUIdOp key_to_gpu_id_op,
+                                           cudaStream_t stream)
+{
+  auto const comm_size = comm.get_size();
+
+  thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
+                      tx_key_first,
+                      tx_key_last,
+                      tx_value_first,
+                      [key_to_gpu_id_op] __device__(auto lhs, auto rhs) {
+                        return key_to_gpu_id_op(lhs) < key_to_gpu_id_op(rhs);
+                      });
+
+  auto gpu_id_first = thrust::make_transform_iterator(
+    tx_key_first, [key_to_gpu_id_op] __device__(auto key) { return key_to_gpu_id_op(key); });
+  rmm::device_uvector<int> d_tx_dst_ranks(comm_size, stream);
+  rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
+  auto last = thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream),
+                                    gpu_id_first,
+                                    gpu_id_first + thrust::distance(tx_key_first, tx_key_last),
+                                    thrust::make_constant_iterator(size_t{1}),
+                                    d_tx_dst_ranks.begin(),
+                                    d_tx_value_counts.begin());
+  if (thrust::distance(d_tx_value_counts.begin(), thrust::get<1>(last)) < comm_size) {
+    rmm::device_uvector<size_t> d_counts(comm_size, stream);
+    thrust::fill(rmm::exec_policy(stream)->on(stream), d_counts.begin(), d_counts.end(), size_t{0});
+    thrust::scatter(rmm::exec_policy(stream)->on(stream),
+                    d_tx_value_counts.begin(),
+                    thrust::get<1>(last),
+                    d_tx_dst_ranks.begin(),
+                    d_counts.begin());
+    d_tx_value_counts = std::move(d_counts);
+  }
+
+  return std::move(d_tx_value_counts);
+}
+
+// inline to suppress a complaint about ODR violation
+inline std::tuple<std::vector<size_t>,
+                  std::vector<size_t>,
+                  std::vector<int>,
+                  std::vector<size_t>,
+                  std::vector<size_t>,
+                  std::vector<int>>
+compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const &comm,
+                                   rmm::device_uvector<size_t> const &d_tx_value_counts,
+                                   cudaStream_t stream)
+{
+  auto const comm_size = comm.get_size();
+
+  rmm::device_uvector<size_t> d_rx_value_counts(comm_size, stream);
+
+  // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released.
+  std::vector<size_t> tx_counts(comm_size, size_t{1});
+  std::vector<size_t> tx_offsets(comm_size);
+  std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0});
+  std::vector<int> tx_dst_ranks(comm_size);
+  std::iota(tx_dst_ranks.begin(), tx_dst_ranks.end(), int{0});
+  std::vector<size_t> rx_counts(comm_size, size_t{1});
+  std::vector<size_t> rx_offsets(comm_size);
+  std::iota(rx_offsets.begin(), rx_offsets.end(), size_t{0});
+  std::vector<int> rx_src_ranks(comm_size);
+  std::iota(rx_src_ranks.begin(), rx_src_ranks.end(), int{0});
+  device_multicast_sendrecv(comm,
+                            d_tx_value_counts.data(),
+                            tx_counts,
+                            tx_offsets,
+                            tx_dst_ranks,
+                            d_rx_value_counts.data(),
+                            rx_counts,
+                            rx_offsets,
+                            rx_src_ranks,
+                            stream);
+
+  raft::update_host(tx_counts.data(), d_tx_value_counts.data(), comm_size, stream);
+  raft::update_host(rx_counts.data(), d_rx_value_counts.data(), comm_size, stream);
+
+  CUDA_TRY(cudaStreamSynchronize(stream));  // rx_counts should be up-to-date
+
+  std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1);
+  std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1);
+
+  int num_tx_dst_ranks{0};
+  int num_rx_src_ranks{0};
+  for (int i = 0; i < comm_size; ++i) {
+    if (tx_counts[i] != 0) {
+      tx_counts[num_tx_dst_ranks]    = tx_counts[i];
+      tx_offsets[num_tx_dst_ranks]   = tx_offsets[i];
+      tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i];
+      ++num_tx_dst_ranks;
+    }
+    if (rx_counts[i] != 0) {
+      rx_counts[num_rx_src_ranks]    = rx_counts[i];
+      rx_offsets[num_rx_src_ranks]   = rx_offsets[i];
+      rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i];
+      ++num_rx_src_ranks;
+    }
+  }
+  tx_counts.resize(num_tx_dst_ranks);
+  tx_offsets.resize(num_tx_dst_ranks);
+  tx_dst_ranks.resize(num_tx_dst_ranks);
+  rx_counts.resize(num_rx_src_ranks);
+  rx_offsets.resize(num_rx_src_ranks);
+  rx_src_ranks.resize(num_rx_src_ranks);
+
+  return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks);
+}
+
+}  // namespace detail
+
+template <typename TxValueIterator>
+auto shuffle_values(raft::comms::comms_t const &comm,
+                    TxValueIterator tx_value_first,
+                    std::vector<size_t> const &tx_value_counts,
+                    cudaStream_t stream)
+{
+  auto const comm_size = comm.get_size();
+
+  rmm::device_uvector<size_t> d_tx_value_counts(comm_size, stream);
+  raft::update_device(d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream);
+
+  CUDA_TRY(cudaStreamSynchronize(stream));  // tx_value_counts should be up-to-date
+
+  std::vector<size_t> tx_counts{};
+  std::vector<size_t> tx_offsets{};
+  std::vector<int> tx_dst_ranks{};
+  std::vector<size_t> rx_counts{};
+  std::vector<size_t> rx_offsets{};
+  std::vector<int> rx_src_ranks{};
+  std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) =
+    detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream);
+
+  auto rx_value_buffer =
+    allocate_dataframe_buffer<typename std::iterator_traits<TxValueIterator>::value_type>(
+      rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);
+
+  // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
+  // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
+  device_multicast_sendrecv(
+    comm,
+    tx_value_first,
+    tx_counts,
+    tx_offsets,
+    tx_dst_ranks,
+    get_dataframe_buffer_begin<typename std::iterator_traits<TxValueIterator>::value_type>(
+      rx_value_buffer),
+    rx_counts,
+    rx_offsets,
+    rx_src_ranks,
+    stream);
+
+  return std::make_tuple(std::move(rx_value_buffer), rx_counts);
+}
+
+template <typename ValueIterator, typename ValueToGPUIdOp>
+auto groupby_gpuid_and_shuffle_values(raft::comms::comms_t const &comm,
+                                      ValueIterator tx_value_first /* [INOUT */,
+                                      ValueIterator tx_value_last /* [INOUT */,
+                                      ValueToGPUIdOp value_to_gpu_id_op,
+                                      cudaStream_t stream)
+{
+  auto const comm_size = comm.get_size();
+
+  auto d_tx_value_counts =
+    detail::sort_and_count(comm, tx_value_first, tx_value_last, value_to_gpu_id_op, stream);
+
+  std::vector<size_t> tx_counts{};
+  std::vector<size_t> tx_offsets{};
+  std::vector<int> tx_dst_ranks{};
+  std::vector<size_t> rx_counts{};
+  std::vector<size_t> rx_offsets{};
+  std::vector<int> rx_src_ranks{};
+  std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) =
+    detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream);
+
+  auto rx_value_buffer =
+    allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
+      rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);
+
+  // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
+  // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
+  device_multicast_sendrecv(
+    comm,
+    tx_value_first,
+    tx_counts,
+    tx_offsets,
+    tx_dst_ranks,
+    get_dataframe_buffer_begin<typename std::iterator_traits<ValueIterator>::value_type>(
+      rx_value_buffer),
+    rx_counts,
+    rx_offsets,
+    rx_src_ranks,
+    stream);
+
+  return std::make_tuple(std::move(rx_value_buffer), rx_counts);
+}
+
+template <typename VertexIterator, typename ValueIterator, typename KeyToGPUIdOp>
+auto groupby_gpuid_and_shuffle_kv_pairs(raft::comms::comms_t const &comm,
+                                        VertexIterator tx_key_first /* [INOUT */,
+                                        VertexIterator tx_key_last /* [INOUT */,
+                                        ValueIterator tx_value_first /* [INOUT */,
+                                        KeyToGPUIdOp key_to_gpu_id_op,
+                                        cudaStream_t stream)
+{
+  auto d_tx_value_counts = detail::sort_and_count(
+    comm, tx_key_first, tx_key_last, tx_value_first, key_to_gpu_id_op, stream);
+
+  std::vector<size_t> tx_counts{};
+  std::vector<size_t> tx_offsets{};
+  std::vector<int> tx_dst_ranks{};
+  std::vector<size_t> rx_counts{};
+  std::vector<size_t> rx_offsets{};
+  std::vector<int> rx_src_ranks{};
+  std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) =
+    detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream);
+
+  rmm::device_uvector<typename std::iterator_traits<VertexIterator>::value_type> rx_keys(
+    rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream);
+  auto rx_value_buffer =
+    allocate_dataframe_buffer<typename std::iterator_traits<ValueIterator>::value_type>(
+      rx_keys.size(), stream);
+
+  // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
+  // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
+  device_multicast_sendrecv(comm,
+                            tx_key_first,
+                            tx_counts,
+                            tx_offsets,
+                            tx_dst_ranks,
+                            rx_keys.begin(),
+                            rx_counts,
+                            rx_offsets,
+                            rx_src_ranks,
+                            stream);
+
+  // FIXME: this needs to be replaced with AlltoAll once NCCL 2.8 is released
+  // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size).
+  device_multicast_sendrecv(
+    comm,
+    tx_value_first,
+    tx_counts,
+    tx_offsets,
+    tx_dst_ranks,
+    get_dataframe_buffer_begin<typename std::iterator_traits<ValueIterator>::value_type>(
+      rx_value_buffer),
+    rx_counts,
+    rx_offsets,
+    rx_src_ranks,
+    stream);
+
+  return std::make_tuple(std::move(rx_keys), std::move(rx_value_buffer), rx_counts);
+}
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/centrality/betweenness_centrality.cu b/cpp/src/centrality/betweenness_centrality.cu
index 8ff62f7ddb6..c0a34de5f70 100644
--- a/cpp/src/centrality/betweenness_centrality.cu
+++ b/cpp/src/centrality/betweenness_centrality.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,7 +116,7 @@ void verify_betweenness_centrality_input(result_t *result,
   static_assert(std::is_same<result_t, float>::value || std::is_same<result_t, double>::value,
                 "result_t should be float or double");
 
-  CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: betwenness pointer is NULL");
+  CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: betwenness pointer is NULL");
   CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be positive or equal to 0.");
   if (number_of_sources != 0) {
     CUGRAPH_EXPECTS(sources != nullptr,
diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh
new file mode 100644
index 00000000000..414f5f3854d
--- /dev/null
+++ b/cpp/src/community/dendrogram.cuh
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/device_buffer.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace cugraph {
+
+template <typename vertex_t>
+class Dendrogram {
+ public:
+  void add_level(vertex_t num_verts,
+                 cudaStream_t stream                 = 0,
+                 rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
+  {
+    level_ptr_.push_back(
+      std::make_unique<rmm::device_buffer>(num_verts * sizeof(vertex_t), stream, mr));
+    level_size_.push_back(num_verts);
+  }
+
+  size_t current_level() const { return level_size_.size() - 1; }
+
+  size_t num_levels() const { return level_size_.size(); }
+
+  vertex_t const *get_level_ptr_nocheck(size_t level) const
+  {
+    return static_cast<vertex_t const *>(level_ptr_[level]->data());
+  }
+
+  vertex_t *get_level_ptr_nocheck(size_t level)
+  {
+    return static_cast<vertex_t *>(level_ptr_[level]->data());
+  }
+
+  vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; }
+
+  vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); }
+
+  vertex_t const *current_level_end() const { return current_level_begin() + current_level_size(); }
+
+  vertex_t *current_level_begin() { return get_level_ptr_nocheck(current_level()); }
+
+  vertex_t *current_level_end() { return current_level_begin() + current_level_size(); }
+
+  vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); }
+
+ private:
+  std::vector<vertex_t> level_size_;
+  std::vector<std::unique_ptr<rmm::device_buffer>> level_ptr_;
+};
+
+}  // namespace cugraph
diff --git a/cpp/src/community/ECG.cu b/cpp/src/community/ecg.cu
similarity index 70%
rename from cpp/src/community/ECG.cu
rename to cpp/src/community/ecg.cu
index ce7e9dd1ad2..994204ecd32 100644
--- a/cpp/src/community/ECG.cu
+++ b/cpp/src/community/ecg.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
  */
 
 #include <algorithms.hpp>
+#include <community/louvain.cuh>
+#include <converters/permute_graph.cuh>
+#include <utilities/error.hpp>
+#include <utilities/graph_utils.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <thrust/random.h>
-#include <converters/permute_graph.cuh>
+
 #include <ctime>
-#include <utilities/error.hpp>
-#include "utilities/graph_utils.cuh"
 
 namespace {
 template <typename IndexType>
@@ -41,26 +43,23 @@ binsearch_maxle(const IndexType *vec, const IndexType val, IndexType low, IndexT
   }
 }
 
+// FIXME: This shouldn't need to be a custom kernel, this
+//        seems like it should just be a thrust::transform
 template <typename IdxT, typename ValT>
-__global__ void match_check_kernel(IdxT size,
-                                   IdxT num_verts,
-                                   IdxT *offsets,
-                                   IdxT *indices,
-                                   IdxT *permutation,
-                                   IdxT *parts,
-                                   ValT *weights)
+__global__ void match_check_kernel(
+  IdxT size, IdxT num_verts, IdxT *offsets, IdxT *indices, IdxT *parts, ValT *weights)
 {
   IdxT tid = blockIdx.x * blockDim.x + threadIdx.x;
   while (tid < size) {
     IdxT source = binsearch_maxle(offsets, tid, (IdxT)0, num_verts);
     IdxT dest   = indices[tid];
-    if (parts[permutation[source]] == parts[permutation[dest]]) weights[tid] += 1;
+    if (parts[source] == parts[dest]) weights[tid] += 1;
     tid += gridDim.x * blockDim.x;
   }
 }
 
 struct prg {
-  __host__ __device__ float operator()(int n)
+  __device__ float operator()(int n)
   {
     thrust::default_random_engine rng;
     thrust::uniform_real_distribution<float> dist(0.0, 1.0);
@@ -93,7 +92,7 @@ struct update_functor {
 template <typename T>
 void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream)
 {
-  rmm::device_vector<float> randoms_v(size);
+  rmm::device_uvector<float> randoms_v(size, stream);
 
   thrust::counting_iterator<uint32_t> index(seed);
   thrust::transform(
@@ -103,6 +102,31 @@ void get_permutation_vector(T size, T seed, T *permutation, cudaStream_t stream)
     rmm::exec_policy(stream)->on(stream), randoms_v.begin(), randoms_v.end(), permutation);
 }
 
+template <typename graph_type>
+class EcgLouvain : public cugraph::Louvain<graph_type> {
+ public:
+  using graph_t  = graph_type;
+  using vertex_t = typename graph_type::vertex_type;
+  using edge_t   = typename graph_type::edge_type;
+  using weight_t = typename graph_type::weight_type;
+
+  EcgLouvain(raft::handle_t const &handle, graph_type const &graph, vertex_t seed)
+    : cugraph::Louvain<graph_type>(handle, graph), seed_(seed)
+  {
+  }
+
+  void initialize_dendrogram_level(vertex_t num_vertices) override
+  {
+    this->dendrogram_->add_level(num_vertices);
+
+    get_permutation_vector(
+      num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_);
+  }
+
+ private:
+  vertex_t seed_;
+};
+
 }  // anonymous namespace
 
 namespace cugraph {
@@ -114,36 +138,34 @@ void ecg(raft::handle_t const &handle,
          vertex_t ensemble_size,
          vertex_t *clustering)
 {
-  CUGRAPH_EXPECTS(graph.edge_data != nullptr, "API error, louvain expects a weighted graph");
-  CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is NULL");
+  using graph_type = GraphCSRView<vertex_t, edge_t, weight_t>;
+
+  CUGRAPH_EXPECTS(graph.edge_data != nullptr,
+                  "Invalid input argument: ecg expects a weighted graph");
+  CUGRAPH_EXPECTS(clustering != nullptr,
+                  "Invalid input argument: clustering is NULL, should be a device pointer to "
+                  "memory for storing the result");
 
   cudaStream_t stream{0};
 
-  rmm::device_vector<weight_t> ecg_weights_v(graph.edge_data,
-                                             graph.edge_data + graph.number_of_edges);
+  rmm::device_uvector<weight_t> ecg_weights_v(graph.number_of_edges, handle.get_stream());
+
+  thrust::copy(rmm::exec_policy(stream)->on(stream),
+               graph.edge_data,
+               graph.edge_data + graph.number_of_edges,
+               ecg_weights_v.data());
 
   vertex_t size{graph.number_of_vertices};
-  vertex_t seed{1};
 
-  auto permuted_graph = std::make_unique<GraphCSR<vertex_t, edge_t, weight_t>>(
-    size, graph.number_of_edges, graph.has_data());
+  // FIXME:  This seed should be a parameter
+  vertex_t seed{1};
 
   // Iterate over each member of the ensemble
   for (vertex_t i = 0; i < ensemble_size; i++) {
-    // Take random permutation of the graph
-    rmm::device_vector<vertex_t> permutation_v(size);
-    vertex_t *d_permutation = permutation_v.data().get();
-
-    get_permutation_vector(size, seed, d_permutation, stream);
+    EcgLouvain<graph_type> runner(handle, graph, seed);
     seed += size;
 
-    detail::permute_graph<vertex_t, edge_t, weight_t>(graph, d_permutation, permuted_graph->view());
-
-    // Run one level of Louvain clustering on the random permutation
-    rmm::device_vector<vertex_t> parts_v(size);
-    vertex_t *d_parts = parts_v.data().get();
-
-    cugraph::louvain(handle, permuted_graph->view(), d_parts, size_t{1});
+    weight_t wt = runner(size_t{1}, weight_t{1});
 
     // For each edge in the graph determine whether the endpoints are in the same partition
     // Keep a sum for each edge of the total number of times its endpoints are in the same partition
@@ -154,17 +176,16 @@ void ecg(raft::handle_t const &handle,
                                                    graph.number_of_vertices,
                                                    graph.offsets,
                                                    graph.indices,
-                                                   permutation_v.data().get(),
-                                                   d_parts,
-                                                   ecg_weights_v.data().get());
+                                                   runner.get_dendrogram().get_level_ptr_nocheck(0),
+                                                   ecg_weights_v.data());
   }
 
   // Set weights = min_weight + (1 - min-weight)*sum/ensemble_size
   update_functor<weight_t> uf(min_weight, ensemble_size);
   thrust::transform(rmm::exec_policy(stream)->on(stream),
-                    ecg_weights_v.data().get(),
-                    ecg_weights_v.data().get() + graph.number_of_edges,
-                    ecg_weights_v.data().get(),
+                    ecg_weights_v.begin(),
+                    ecg_weights_v.end(),
+                    ecg_weights_v.begin(),
                     uf);
 
   // Run Louvain on the original graph using the computed weights
@@ -172,7 +193,7 @@ void ecg(raft::handle_t const &handle,
   GraphCSRView<vertex_t, edge_t, weight_t> louvain_graph;
   louvain_graph.indices            = graph.indices;
   louvain_graph.offsets            = graph.offsets;
-  louvain_graph.edge_data          = ecg_weights_v.data().get();
+  louvain_graph.edge_data          = ecg_weights_v.data();
   louvain_graph.number_of_vertices = graph.number_of_vertices;
   louvain_graph.number_of_edges    = graph.number_of_edges;
 
diff --git a/cpp/src/community/egonet.cu b/cpp/src/community/egonet.cu
new file mode 100644
index 00000000000..fa788aa307b
--- /dev/null
+++ b/cpp/src/community/egonet.cu
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Alex Fender afender@nvida.com
+#include <algorithms.hpp>
+#include <cstddef>
+#include <memory>
+#include <tuple>
+#include <utility>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <thrust/transform.h>
+#include <ctime>
+
+#include <graph.hpp>
+
+#include <utilities/error.hpp>
+#include "experimental/graph.hpp"
+#include "utilities/graph_utils.cuh"
+
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+
+namespace {
+
+/*
+Description
+Let the egonet graph of a node x be the subgraph that includes node x, the neighborhood of x, and
+all edges between them. Naive algorithm
+- Add center node x to the graph.
+- Go through all the neighbors y of this center node x, add edge (x, y) to the graph.
+- For each neighbor y of center node x, go through all the neighbors z of center node x, if there is
+an edge between y and z in original graph, add edge (y, z) to our new graph.
+
+Rather than doing custom one/two hops features, we propose a generic k-hops solution leveraging BFS
+cutoff and subgraph extraction
+*/
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<weight_t>,
+           rmm::device_uvector<size_t>>
+extract(
+  raft::handle_t const &handle,
+  cugraph::experimental::graph_view_t<vertex_t, edge_t, weight_t, false, false> const &csr_view,
+  vertex_t *source_vertex,
+  vertex_t n_subgraphs,
+  vertex_t radius)
+{
+  auto v           = csr_view.get_number_of_vertices();
+  auto e           = csr_view.get_number_of_edges();
+  auto stream      = handle.get_stream();
+  float avg_degree = e / v;
+  rmm::device_vector<size_t> neighbors_offsets(n_subgraphs + 1);
+  rmm::device_vector<vertex_t> neighbors;
+
+  // It is the right thing to accept device memory for source_vertex
+  // FIXME consider adding a device API to BFS (ie. accept source on the device)
+  std::vector<vertex_t> h_source_vertex(n_subgraphs);
+  raft::update_host(&h_source_vertex[0], source_vertex, n_subgraphs, stream);
+
+  // reserve some reasonable memory, but could grow larger than that
+  neighbors.reserve(v + avg_degree * n_subgraphs * radius);
+  neighbors_offsets[0] = 0;
+  // each source should be done concurently in the future
+  for (vertex_t i = 0; i < n_subgraphs; i++) {
+    // BFS with cutoff
+    rmm::device_vector<vertex_t> reached(v);
+    rmm::device_vector<vertex_t> predecessors(v);  // not used
+    bool direction_optimizing = false;
+    cugraph::experimental::bfs<vertex_t, edge_t, weight_t, false>(handle,
+                                                                  csr_view,
+                                                                  reached.data().get(),
+                                                                  predecessors.data().get(),
+                                                                  h_source_vertex[i],
+                                                                  direction_optimizing,
+                                                                  radius);
+
+    // identify reached vertex ids from distance array
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::make_counting_iterator(vertex_t{0}),
+                      thrust::make_counting_iterator(v),
+                      reached.begin(),
+                      reached.begin(),
+                      [sentinel = std::numeric_limits<vertex_t>::max()] __device__(
+                        auto id, auto val) { return val < sentinel ? id : sentinel; });
+
+    // removes unreached data
+    auto reached_end = thrust::remove(rmm::exec_policy(stream)->on(stream),
+                                      reached.begin(),
+                                      reached.end(),
+                                      std::numeric_limits<vertex_t>::max());
+
+    // update extraction input
+    size_t n_reached         = thrust::distance(reached.begin(), reached_end);
+    neighbors_offsets[i + 1] = neighbors_offsets[i] + n_reached;
+    if (neighbors_offsets[i + 1] > neighbors.capacity())
+      neighbors.reserve(neighbors_offsets[i + 1] * 2);
+    neighbors.insert(neighbors.end(), reached.begin(), reached_end);
+  }
+
+  // extract
+  return cugraph::experimental::extract_induced_subgraphs(
+    handle, csr_view, neighbors_offsets.data().get(), neighbors.data().get(), n_subgraphs);
+}
+}  // namespace
+namespace cugraph {
+namespace experimental {
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<weight_t>,
+           rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &handle,
+            graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu> const &graph_view,
+            vertex_t *source_vertex,
+            vertex_t n_subgraphs,
+            vertex_t radius)
+{
+  if (multi_gpu) {
+    CUGRAPH_FAIL("Unimplemented.");
+    return std::make_tuple(rmm::device_uvector<vertex_t>(0, handle.get_stream()),
+                           rmm::device_uvector<vertex_t>(0, handle.get_stream()),
+                           rmm::device_uvector<weight_t>(0, handle.get_stream()),
+                           rmm::device_uvector<size_t>(0, handle.get_stream()));
+  }
+  CUGRAPH_EXPECTS(n_subgraphs > 0, "Need at least one source to extract the egonet from");
+  CUGRAPH_EXPECTS(n_subgraphs < graph_view.get_number_of_vertices(),
+                  "Can't have more sources to extract from than vertices in the graph");
+  CUGRAPH_EXPECTS(radius > 0, "Radius should be at least 1");
+  CUGRAPH_EXPECTS(radius < graph_view.get_number_of_vertices(), "radius is too large");
+  // source_vertex range is checked in bfs.
+
+  return extract<vertex_t, edge_t, weight_t>(
+    handle, graph_view, source_vertex, n_subgraphs, radius);
+}
+
+// SG FP32
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int32_t, int32_t, float, false, false> const &,
+            int32_t *,
+            int32_t,
+            int32_t);
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int32_t, int64_t, float, false, false> const &,
+            int32_t *,
+            int32_t,
+            int32_t);
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int64_t, int64_t, float, false, false> const &,
+            int64_t *,
+            int64_t,
+            int64_t);
+
+// SG FP64
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int32_t, int32_t, double, false, false> const &,
+            int32_t *,
+            int32_t,
+            int32_t);
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int32_t, int64_t, double, false, false> const &,
+            int32_t *,
+            int32_t,
+            int32_t);
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_ego(raft::handle_t const &,
+            graph_view_t<int64_t, int64_t, double, false, false> const &,
+            int64_t *,
+            int64_t,
+            int64_t);
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/community/extract_subgraph_by_vertex.cu b/cpp/src/community/extract_subgraph_by_vertex.cu
index c39b7f8ad0a..eb7b1d494a0 100644
--- a/cpp/src/community/extract_subgraph_by_vertex.cu
+++ b/cpp/src/community/extract_subgraph_by_vertex.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -119,7 +119,7 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> extract_subgraph_vertex(GraphCOOView<VT, E
                                                               VT const *vertices,
                                                               VT num_vertices)
 {
-  CUGRAPH_EXPECTS(vertices != nullptr, "API error, vertices must be non null");
+  CUGRAPH_EXPECTS(vertices != nullptr, "Invalid input argument: vertices must be non null");
 
   cudaStream_t stream{0};
 
diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh
new file mode 100644
index 00000000000..892fe2d1c51
--- /dev/null
+++ b/cpp/src/community/flatten_dendrogram.cuh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <community/dendrogram.cuh>
+#include <experimental/graph_functions.hpp>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+
+namespace cugraph {
+
+template <typename vertex_t, bool multi_gpu>
+void partition_at_level(raft::handle_t const &handle,
+                        Dendrogram<vertex_t> const &dendrogram,
+                        vertex_t const *d_vertex_ids,
+                        vertex_t *d_partition,
+                        size_t level)
+{
+  vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0);
+
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               d_vertex_ids,
+               d_vertex_ids + local_num_verts,
+               d_partition);
+
+  std::for_each(thrust::make_counting_iterator<size_t>(0),
+                thrust::make_counting_iterator<size_t>(level),
+                [&handle, &dendrogram, d_vertex_ids, &d_partition, local_num_verts](size_t l) {
+                  cugraph::experimental::relabel<vertex_t, multi_gpu>(
+                    handle,
+                    std::tuple<vertex_t const *, vertex_t const *>(
+                      d_vertex_ids, dendrogram.get_level_ptr_nocheck(l)),
+                    dendrogram.get_level_size_nocheck(l),
+                    d_partition,
+                    local_num_verts);
+                });
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/community/leiden.cu b/cpp/src/community/leiden.cu
index 9e5a847cdf0..427e62d3286 100644
--- a/cpp/src/community/leiden.cu
+++ b/cpp/src/community/leiden.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,11 @@
  * limitations under the License.
  */
 
+#include <community/flatten_dendrogram.cuh>
 #include <community/leiden.cuh>
 
+#include <rmm/device_uvector.hpp>
+
 namespace cugraph {
 
 template <typename vertex_t, typename edge_t, typename weight_t>
@@ -27,11 +30,29 @@ std::pair<size_t, weight_t> leiden(raft::handle_t const &handle,
 {
   CUGRAPH_EXPECTS(graph.edge_data != nullptr,
                   "Invalid input argument: leiden expects a weighted graph");
-  CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null");
+  CUGRAPH_EXPECTS(clustering != nullptr,
+                  "Invalid input argument: clustering is null, should be a device pointer to "
+                  "memory for storing the result");
 
   Leiden<GraphCSRView<vertex_t, edge_t, weight_t>> runner(handle, graph);
+  weight_t wt = runner(max_level, resolution);
+
+  rmm::device_uvector<vertex_t> vertex_ids_v(graph.number_of_vertices, handle.get_stream());
+
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               thrust::make_counting_iterator<vertex_t>(0),  // MNMG - base vertex id
+               thrust::make_counting_iterator<vertex_t>(
+                 graph.number_of_vertices),  // MNMG - base vertex id + number_of_vertices
+               vertex_ids_v.begin());
+
+  partition_at_level<vertex_t, false>(handle,
+                                      runner.get_dendrogram(),
+                                      vertex_ids_v.data(),
+                                      clustering,
+                                      runner.get_dendrogram().num_levels());
 
-  return runner(clustering, max_level, resolution);
+  // FIXME: Consider returning the Dendrogram at some point
+  return std::make_pair(runner.get_dendrogram().num_levels(), wt);
 }
 
 // Explicit template instantations
diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh
index f2f84433284..141f8beac40 100644
--- a/cpp/src/community/leiden.cuh
+++ b/cpp/src/community/leiden.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 
 #include <community/louvain.cuh>
 
+#include <rmm/device_uvector.hpp>
+
 namespace cugraph {
 
 template <typename graph_type>
@@ -28,7 +30,8 @@ class Leiden : public Louvain<graph_type> {
   using weight_t = typename graph_type::weight_type;
 
   Leiden(raft::handle_t const &handle, graph_type const &graph)
-    : Louvain<graph_type>(handle, graph), constraint_v_(graph.number_of_vertices)
+    : Louvain<graph_type>(handle, graph),
+      constraint_v_(graph.number_of_vertices, handle.get_stream())
   {
   }
 
@@ -38,22 +41,28 @@ class Leiden : public Louvain<graph_type> {
   {
     this->timer_start("update_clustering_constrained");
 
-    rmm::device_vector<vertex_t> next_cluster_v(this->cluster_v_);
-    rmm::device_vector<weight_t> delta_Q_v(graph.number_of_edges);
-    rmm::device_vector<vertex_t> cluster_hash_v(graph.number_of_edges);
-    rmm::device_vector<weight_t> old_cluster_sum_v(graph.number_of_vertices);
+    rmm::device_uvector<vertex_t> next_cluster_v(this->dendrogram_->current_level_size(),
+                                                 this->stream_);
+    rmm::device_uvector<weight_t> delta_Q_v(graph.number_of_edges, this->stream_);
+    rmm::device_uvector<vertex_t> cluster_hash_v(graph.number_of_edges, this->stream_);
+    rmm::device_uvector<weight_t> old_cluster_sum_v(graph.number_of_vertices, this->stream_);
 
-    vertex_t const *d_src_indices    = this->src_indices_v_.data().get();
+    vertex_t const *d_src_indices    = this->src_indices_v_.data();
     vertex_t const *d_dst_indices    = graph.indices;
-    vertex_t *d_cluster_hash         = cluster_hash_v.data().get();
-    vertex_t *d_cluster              = this->cluster_v_.data().get();
-    weight_t const *d_vertex_weights = this->vertex_weights_v_.data().get();
-    weight_t *d_cluster_weights      = this->cluster_weights_v_.data().get();
-    weight_t *d_delta_Q              = delta_Q_v.data().get();
-    vertex_t *d_constraint           = constraint_v_.data().get();
+    vertex_t *d_cluster_hash         = cluster_hash_v.data();
+    vertex_t *d_cluster              = this->dendrogram_->current_level_begin();
+    weight_t const *d_vertex_weights = this->vertex_weights_v_.data();
+    weight_t *d_cluster_weights      = this->cluster_weights_v_.data();
+    weight_t *d_delta_Q              = delta_Q_v.data();
+    vertex_t *d_constraint           = constraint_v_.data();
+
+    thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_),
+                 this->dendrogram_->current_level_begin(),
+                 this->dendrogram_->current_level_end(),
+                 next_cluster_v.data());
 
-    weight_t new_Q =
-      this->modularity(total_edge_weight, resolution, graph, this->cluster_v_.data().get());
+    weight_t new_Q = this->modularity(
+      total_edge_weight, resolution, graph, this->dendrogram_->current_level_begin());
 
     weight_t cur_Q = new_Q - 1;
 
@@ -83,13 +92,13 @@ class Leiden : public Louvain<graph_type> {
 
       up_down = !up_down;
 
-      new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get());
+      new_Q = this->modularity(total_edge_weight, resolution, graph, next_cluster_v.data());
 
       if (new_Q > cur_Q) {
         thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_),
                      next_cluster_v.begin(),
                      next_cluster_v.end(),
-                     this->cluster_v_.begin());
+                     this->dendrogram_->current_level_begin());
       }
     }
 
@@ -97,9 +106,7 @@ class Leiden : public Louvain<graph_type> {
     return cur_Q;
   }
 
-  std::pair<size_t, weight_t> operator()(vertex_t *d_cluster_vec,
-                                         size_t max_level,
-                                         weight_t resolution)
+  weight_t operator()(size_t max_level, weight_t resolution) override
   {
     size_t num_level{0};
 
@@ -109,57 +116,50 @@ class Leiden : public Louvain<graph_type> {
 
     weight_t best_modularity = weight_t{-1};
 
-    //
-    //  Initialize every cluster to reference each vertex to itself
-    //
-    thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_),
-                     this->cluster_v_.begin(),
-                     this->cluster_v_.end());
-    thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_),
-                 this->cluster_v_.begin(),
-                 this->cluster_v_.end(),
-                 d_cluster_vec);
-
     //
     //  Our copy of the graph.  Each iteration of the outer loop will
     //  shrink this copy of the graph.
     //
-    GraphCSRView<vertex_t, edge_t, weight_t> current_graph(this->offsets_v_.data().get(),
-                                                           this->indices_v_.data().get(),
-                                                           this->weights_v_.data().get(),
+    GraphCSRView<vertex_t, edge_t, weight_t> current_graph(this->offsets_v_.data(),
+                                                           this->indices_v_.data(),
+                                                           this->weights_v_.data(),
                                                            this->number_of_vertices_,
                                                            this->number_of_edges_);
 
-    current_graph.get_source_indices(this->src_indices_v_.data().get());
+    current_graph.get_source_indices(this->src_indices_v_.data());
 
     while (num_level < max_level) {
+      //
+      //  Initialize every cluster to reference each vertex to itself
+      //
+      this->dendrogram_->add_level(current_graph.number_of_vertices);
+
+      thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_),
+                       this->dendrogram_->current_level_begin(),
+                       this->dendrogram_->current_level_end());
+
       this->compute_vertex_and_cluster_weights(current_graph);
 
       weight_t new_Q = this->update_clustering(total_edge_weight, resolution, current_graph);
 
-      thrust::copy(rmm::exec_policy(this->stream_)->on(this->stream_),
-                   this->cluster_v_.begin(),
-                   this->cluster_v_.end(),
-                   constraint_v_.begin());
-
       new_Q = update_clustering_constrained(total_edge_weight, resolution, current_graph);
 
       if (new_Q <= best_modularity) { break; }
 
       best_modularity = new_Q;
 
-      this->shrink_graph(current_graph, d_cluster_vec);
+      this->shrink_graph(current_graph);
 
       num_level++;
     }
 
     this->timer_display(std::cout);
 
-    return std::make_pair(num_level, best_modularity);
+    return best_modularity;
   }
 
  private:
-  rmm::device_vector<vertex_t> constraint_v_;
+  rmm::device_uvector<vertex_t> constraint_v_;
 };
 
 }  // namespace cugraph
diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu
index 81a68a31663..a851777ad93 100644
--- a/cpp/src/community/louvain.cu
+++ b/cpp/src/community/louvain.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  * limitations under the License.
  */
 
+#include <community/flatten_dendrogram.cuh>
 #include <community/louvain.cuh>
 #include <experimental/graph.hpp>
 #include <experimental/louvain.cuh>
 
+#include <rmm/device_uvector.hpp>
+
 namespace cugraph {
 
 namespace detail {
@@ -31,10 +34,28 @@ std::pair<size_t, weight_t> louvain(raft::handle_t const &handle,
 {
   CUGRAPH_EXPECTS(graph_view.edge_data != nullptr,
                   "Invalid input argument: louvain expects a weighted graph");
-  CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null");
+  CUGRAPH_EXPECTS(clustering != nullptr,
+                  "Invalid input argument: clustering is null, should be a device pointer to "
+                  "memory for storing the result");
 
   Louvain<GraphCSRView<vertex_t, edge_t, weight_t>> runner(handle, graph_view);
-  return runner(clustering, max_level, resolution);
+  weight_t wt = runner(max_level, resolution);
+
+  rmm::device_uvector<vertex_t> vertex_ids_v(graph_view.number_of_vertices, handle.get_stream());
+
+  thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   vertex_ids_v.begin(),
+                   vertex_ids_v.end(),
+                   vertex_t{0});
+
+  partition_at_level<vertex_t, false>(handle,
+                                      runner.get_dendrogram(),
+                                      vertex_ids_v.data(),
+                                      clustering,
+                                      runner.get_dendrogram().num_levels());
+
+  // FIXME: Consider returning the Dendrogram at some point
+  return std::make_pair(runner.get_dendrogram().num_levels(), wt);
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
@@ -45,7 +66,9 @@ std::pair<size_t, weight_t> louvain(
   size_t max_level,
   weight_t resolution)
 {
-  CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null");
+  CUGRAPH_EXPECTS(clustering != nullptr,
+                  "Invalid input argument: clustering is null, should be a device pointer to "
+                  "memory for storing the result");
 
   // "FIXME": remove this check and the guards below
   //
@@ -61,7 +84,25 @@ std::pair<size_t, weight_t> louvain(
   } else {
     experimental::Louvain<experimental::graph_view_t<vertex_t, edge_t, weight_t, false, multi_gpu>>
       runner(handle, graph_view);
-    return runner(clustering, max_level, resolution);
+
+    weight_t wt = runner(max_level, resolution);
+
+    rmm::device_uvector<vertex_t> vertex_ids_v(graph_view.get_number_of_vertices(),
+                                               handle.get_stream());
+
+    thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     vertex_ids_v.begin(),
+                     vertex_ids_v.end(),
+                     graph_view.get_local_vertex_first());
+
+    partition_at_level<vertex_t, multi_gpu>(handle,
+                                            runner.get_dendrogram(),
+                                            vertex_ids_v.data(),
+                                            clustering,
+                                            runner.get_dendrogram().num_levels());
+
+    // FIXME: Consider returning the Dendrogram at some point
+    return std::make_pair(runner.get_dendrogram().num_levels(), wt);
   }
 }
 
diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh
index 7ca3638f42b..e28f0f1746d 100644
--- a/cpp/src/community/louvain.cuh
+++ b/cpp/src/community/louvain.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,11 +17,13 @@
 
 #include <graph.hpp>
 
-#include <rmm/thrust_rmm_allocator.h>
-
 #include <converters/COOtoCSR.cuh>
 #include <utilities/graph_utils.cuh>
 
+#include <community/dendrogram.cuh>
+
+#include <rmm/device_uvector.hpp>
+
 //#define TIMING
 
 #ifdef TIMING
@@ -44,26 +46,42 @@ class Louvain {
       hr_timer_(),
 #endif
       handle_(handle),
+      dendrogram_(std::make_unique<Dendrogram<vertex_t>>()),
 
       // FIXME:  Don't really need to copy here but would need
       //         to change the logic to populate this properly
       //         in generate_superverticies_graph.
       //
-      offsets_v_(graph.offsets, graph.offsets + graph.number_of_vertices + 1),
-      indices_v_(graph.indices, graph.indices + graph.number_of_edges),
-      weights_v_(graph.edge_data, graph.edge_data + graph.number_of_edges),
-      src_indices_v_(graph.number_of_edges),
-      vertex_weights_v_(graph.number_of_vertices),
-      cluster_weights_v_(graph.number_of_vertices),
-      cluster_v_(graph.number_of_vertices),
-      tmp_arr_v_(graph.number_of_vertices),
-      cluster_inverse_v_(graph.number_of_vertices),
+      offsets_v_(graph.number_of_vertices + 1, handle.get_stream()),
+      indices_v_(graph.number_of_edges, handle.get_stream()),
+      weights_v_(graph.number_of_edges, handle.get_stream()),
+      src_indices_v_(graph.number_of_edges, handle.get_stream()),
+      vertex_weights_v_(graph.number_of_vertices, handle.get_stream()),
+      cluster_weights_v_(graph.number_of_vertices, handle.get_stream()),
+      tmp_arr_v_(graph.number_of_vertices, handle.get_stream()),
+      cluster_inverse_v_(graph.number_of_vertices, handle.get_stream()),
       number_of_vertices_(graph.number_of_vertices),
       number_of_edges_(graph.number_of_edges),
       stream_(handle.get_stream())
   {
+    thrust::copy(rmm::exec_policy(stream_)->on(stream_),
+                 graph.offsets,
+                 graph.offsets + graph.number_of_vertices + 1,
+                 offsets_v_.begin());
+
+    thrust::copy(rmm::exec_policy(stream_)->on(stream_),
+                 graph.indices,
+                 graph.indices + graph.number_of_edges,
+                 indices_v_.begin());
+
+    thrust::copy(rmm::exec_policy(stream_)->on(stream_),
+                 graph.edge_data,
+                 graph.edge_data + graph.number_of_edges,
+                 weights_v_.begin());
   }
 
+  virtual ~Louvain() {}
+
   weight_t modularity(weight_t total_edge_weight,
                       weight_t resolution,
                       graph_t const &graph,
@@ -71,43 +89,45 @@ class Louvain {
   {
     vertex_t n_verts = graph.number_of_vertices;
 
-    rmm::device_vector<weight_t> inc(n_verts, weight_t{0.0});
-    rmm::device_vector<weight_t> deg(n_verts, weight_t{0.0});
+    rmm::device_uvector<weight_t> inc(n_verts, stream_);
+    rmm::device_uvector<weight_t> deg(n_verts, stream_);
 
-    edge_t const *d_offsets   = graph.offsets;
-    vertex_t const *d_indices = graph.indices;
-    weight_t const *d_weights = graph.edge_data;
-    weight_t *d_inc           = inc.data().get();
-    weight_t *d_deg           = deg.data().get();
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_), inc.begin(), inc.end(), weight_t{0.0});
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_), deg.begin(), deg.end(), weight_t{0.0});
 
     // FIXME:  Already have weighted degree computed in main loop,
     //         could pass that in rather than computing d_deg... which
     //         would save an atomicAdd (synchronization)
     //
-    thrust::for_each(
-      rmm::exec_policy(stream_)->on(stream_),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(graph.number_of_vertices),
-      [d_inc, d_deg, d_offsets, d_indices, d_weights, d_cluster] __device__(vertex_t v) {
-        vertex_t community = d_cluster[v];
-        weight_t increase{0.0};
-        weight_t degree{0.0};
-
-        for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) {
-          vertex_t neighbor = d_indices[loc];
-          degree += d_weights[loc];
-          if (d_cluster[neighbor] == community) { increase += d_weights[loc]; }
-        }
+    thrust::for_each(rmm::exec_policy(stream_)->on(stream_),
+                     thrust::make_counting_iterator(0),
+                     thrust::make_counting_iterator(graph.number_of_vertices),
+                     [d_inc     = inc.data(),
+                      d_deg     = deg.data(),
+                      d_offsets = graph.offsets,
+                      d_indices = graph.indices,
+                      d_weights = graph.edge_data,
+                      d_cluster] __device__(vertex_t v) {
+                       vertex_t community = d_cluster[v];
+                       weight_t increase{0.0};
+                       weight_t degree{0.0};
+
+                       for (edge_t loc = d_offsets[v]; loc < d_offsets[v + 1]; ++loc) {
+                         vertex_t neighbor = d_indices[loc];
+                         degree += d_weights[loc];
+                         if (d_cluster[neighbor] == community) { increase += d_weights[loc]; }
+                       }
 
-        if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree);
-        if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase);
-      });
+                       if (degree > weight_t{0.0}) atomicAdd(d_deg + community, degree);
+                       if (increase > weight_t{0.0}) atomicAdd(d_inc + community, increase);
+                     });
 
     weight_t Q = thrust::transform_reduce(
       rmm::exec_policy(stream_)->on(stream_),
       thrust::make_counting_iterator(0),
       thrust::make_counting_iterator(graph.number_of_vertices),
-      [d_deg, d_inc, total_edge_weight, resolution] __device__(vertex_t community) {
+      [d_deg = deg.data(), d_inc = inc.data(), total_edge_weight, resolution] __device__(
+        vertex_t community) {
         return ((d_inc[community] / total_edge_weight) - resolution *
                                                            (d_deg[community] * d_deg[community]) /
                                                            (total_edge_weight * total_edge_weight));
@@ -118,37 +138,35 @@ class Louvain {
     return Q;
   }
 
-  virtual std::pair<size_t, weight_t> operator()(vertex_t *d_cluster_vec,
-                                                 size_t max_level,
-                                                 weight_t resolution)
-  {
-    size_t num_level{0};
+  Dendrogram<vertex_t> &get_dendrogram() const { return *dendrogram_; }
 
+  std::unique_ptr<Dendrogram<vertex_t>> move_dendrogram() { return dendrogram_; }
+
+  virtual weight_t operator()(size_t max_level, weight_t resolution)
+  {
     weight_t total_edge_weight =
       thrust::reduce(rmm::exec_policy(stream_)->on(stream_), weights_v_.begin(), weights_v_.end());
 
     weight_t best_modularity = weight_t{-1};
 
-    //
-    //  Initialize every cluster to reference each vertex to itself
-    //
-    thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end());
-    thrust::copy(
-      rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec);
-
     //
     //  Our copy of the graph.  Each iteration of the outer loop will
     //  shrink this copy of the graph.
     //
-    GraphCSRView<vertex_t, edge_t, weight_t> current_graph(offsets_v_.data().get(),
-                                                           indices_v_.data().get(),
-                                                           weights_v_.data().get(),
+    GraphCSRView<vertex_t, edge_t, weight_t> current_graph(offsets_v_.data(),
+                                                           indices_v_.data(),
+                                                           weights_v_.data(),
                                                            number_of_vertices_,
                                                            number_of_edges_);
 
-    current_graph.get_source_indices(src_indices_v_.data().get());
+    current_graph.get_source_indices(src_indices_v_.data());
+
+    while (dendrogram_->num_levels() < max_level) {
+      //
+      //  Initialize every cluster to reference each vertex to itself
+      //
+      initialize_dendrogram_level(current_graph.number_of_vertices);
 
-    while (num_level < max_level) {
       compute_vertex_and_cluster_weights(current_graph);
 
       weight_t new_Q = update_clustering(total_edge_weight, resolution, current_graph);
@@ -157,14 +175,12 @@ class Louvain {
 
       best_modularity = new_Q;
 
-      shrink_graph(current_graph, d_cluster_vec);
-
-      num_level++;
+      shrink_graph(current_graph);
     }
 
     timer_display(std::cout);
 
-    return std::make_pair(num_level, best_modularity);
+    return best_modularity;
   }
 
  protected:
@@ -190,6 +206,15 @@ class Louvain {
 #endif
   }
 
+  virtual void initialize_dendrogram_level(vertex_t num_vertices)
+  {
+    dendrogram_->add_level(num_vertices);
+
+    thrust::sequence(rmm::exec_policy(stream_)->on(stream_),
+                     dendrogram_->current_level_begin(),
+                     dendrogram_->current_level_end());
+  }
+
  public:
   void compute_vertex_and_cluster_weights(graph_type const &graph)
   {
@@ -198,8 +223,8 @@ class Louvain {
     edge_t const *d_offsets     = graph.offsets;
     vertex_t const *d_indices   = graph.indices;
     weight_t const *d_weights   = graph.edge_data;
-    weight_t *d_vertex_weights  = vertex_weights_v_.data().get();
-    weight_t *d_cluster_weights = cluster_weights_v_.data().get();
+    weight_t *d_vertex_weights  = vertex_weights_v_.data();
+    weight_t *d_cluster_weights = cluster_weights_v_.data();
 
     //
     // MNMG:  copy_v_transform_reduce_out_nbr, then copy
@@ -226,21 +251,23 @@ class Louvain {
   {
     timer_start("update_clustering");
 
-    //
-    // MNMG: This is the hard one, see writeup
-    //
-    rmm::device_vector<vertex_t> next_cluster_v(cluster_v_);
-    rmm::device_vector<weight_t> delta_Q_v(graph.number_of_edges);
-    rmm::device_vector<vertex_t> cluster_hash_v(graph.number_of_edges);
-    rmm::device_vector<weight_t> old_cluster_sum_v(graph.number_of_vertices);
+    rmm::device_uvector<vertex_t> next_cluster_v(dendrogram_->current_level_size(), stream_);
+    rmm::device_uvector<weight_t> delta_Q_v(graph.number_of_edges, stream_);
+    rmm::device_uvector<vertex_t> cluster_hash_v(graph.number_of_edges, stream_);
+    rmm::device_uvector<weight_t> old_cluster_sum_v(graph.number_of_vertices, stream_);
+
+    vertex_t *d_cluster              = dendrogram_->current_level_begin();
+    weight_t const *d_vertex_weights = vertex_weights_v_.data();
+    weight_t *d_cluster_weights      = cluster_weights_v_.data();
+    weight_t *d_delta_Q              = delta_Q_v.data();
 
-    vertex_t *d_cluster_hash         = cluster_hash_v.data().get();
-    vertex_t *d_cluster              = cluster_v_.data().get();
-    weight_t const *d_vertex_weights = vertex_weights_v_.data().get();
-    weight_t *d_cluster_weights      = cluster_weights_v_.data().get();
-    weight_t *d_delta_Q              = delta_Q_v.data().get();
+    thrust::copy(rmm::exec_policy(stream_)->on(stream_),
+                 dendrogram_->current_level_begin(),
+                 dendrogram_->current_level_end(),
+                 next_cluster_v.data());
 
-    weight_t new_Q = modularity(total_edge_weight, resolution, graph, cluster_v_.data().get());
+    weight_t new_Q =
+      modularity(total_edge_weight, resolution, graph, dendrogram_->current_level_begin());
 
     weight_t cur_Q = new_Q - 1;
 
@@ -259,13 +286,13 @@ class Louvain {
 
       up_down = !up_down;
 
-      new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data().get());
+      new_Q = modularity(total_edge_weight, resolution, graph, next_cluster_v.data());
 
       if (new_Q > cur_Q) {
         thrust::copy(rmm::exec_policy(stream_)->on(stream_),
                      next_cluster_v.begin(),
                      next_cluster_v.end(),
-                     cluster_v_.begin());
+                     dendrogram_->current_level_begin());
       }
     }
 
@@ -276,45 +303,37 @@ class Louvain {
   void compute_delta_modularity(weight_t total_edge_weight,
                                 weight_t resolution,
                                 graph_type const &graph,
-                                rmm::device_vector<vertex_t> &cluster_hash_v,
-                                rmm::device_vector<weight_t> &old_cluster_sum_v,
-                                rmm::device_vector<weight_t> &delta_Q_v)
+                                rmm::device_uvector<vertex_t> &cluster_hash_v,
+                                rmm::device_uvector<weight_t> &old_cluster_sum_v,
+                                rmm::device_uvector<weight_t> &delta_Q_v)
   {
-    vertex_t const *d_src_indices     = src_indices_v_.data().get();
-    vertex_t const *d_dst_indices     = graph.indices;
     edge_t const *d_offsets           = graph.offsets;
     weight_t const *d_weights         = graph.edge_data;
-    vertex_t const *d_cluster         = cluster_v_.data().get();
-    weight_t const *d_vertex_weights  = vertex_weights_v_.data().get();
-    weight_t const *d_cluster_weights = cluster_weights_v_.data().get();
+    vertex_t const *d_cluster         = dendrogram_->current_level_begin();
+    weight_t const *d_vertex_weights  = vertex_weights_v_.data();
+    weight_t const *d_cluster_weights = cluster_weights_v_.data();
 
-    vertex_t *d_cluster_hash    = cluster_hash_v.data().get();
-    weight_t *d_delta_Q         = delta_Q_v.data().get();
-    weight_t *d_old_cluster_sum = old_cluster_sum_v.data().get();
+    vertex_t *d_cluster_hash    = cluster_hash_v.data();
+    weight_t *d_delta_Q         = delta_Q_v.data();
+    weight_t *d_old_cluster_sum = old_cluster_sum_v.data();
     weight_t *d_new_cluster_sum = d_delta_Q;
 
-    thrust::fill(cluster_hash_v.begin(), cluster_hash_v.end(), vertex_t{-1});
-    thrust::fill(delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0});
-    thrust::fill(old_cluster_sum_v.begin(), old_cluster_sum_v.end(), weight_t{0.0});
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_),
+                 cluster_hash_v.begin(),
+                 cluster_hash_v.end(),
+                 vertex_t{-1});
+    thrust::fill(
+      rmm::exec_policy(stream_)->on(stream_), delta_Q_v.begin(), delta_Q_v.end(), weight_t{0.0});
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_),
+                 old_cluster_sum_v.begin(),
+                 old_cluster_sum_v.end(),
+                 weight_t{0.0});
 
-    // MNMG:  New technique using reduce_by_key.  Would require a segmented sort
-    //        or a pair of sorts on each node, so probably slower than what's here.
-    //        This might still be faster even in MNMG...
-    //
-    //
-    // FIXME:  Eventually this should use cuCollections concurrent map
-    //         implementation, but that won't be available for a while.
-    //
-    // For each source vertex, we're going to build a hash
-    // table to the destination cluster ids.  We can use
-    // the offsets ranges to define the bounds of the hash
-    // table.
-    //
     thrust::for_each(rmm::exec_policy(stream_)->on(stream_),
                      thrust::make_counting_iterator<edge_t>(0),
                      thrust::make_counting_iterator<edge_t>(graph.number_of_edges),
-                     [d_src_indices,
-                      d_dst_indices,
+                     [d_src_indices = src_indices_v_.data(),
+                      d_dst_indices = graph.indices,
                       d_cluster,
                       d_offsets,
                       d_cluster_hash,
@@ -355,7 +374,7 @@ class Louvain {
       [total_edge_weight,
        resolution,
        d_cluster_hash,
-       d_src_indices,
+       d_src_indices = src_indices_v_.data(),
        d_cluster,
        d_vertex_weights,
        d_delta_Q,
@@ -383,33 +402,37 @@ class Louvain {
   }
 
   void assign_nodes(graph_type const &graph,
-                    rmm::device_vector<vertex_t> &cluster_hash_v,
-                    rmm::device_vector<vertex_t> &next_cluster_v,
-                    rmm::device_vector<weight_t> &delta_Q_v,
+                    rmm::device_uvector<vertex_t> &cluster_hash_v,
+                    rmm::device_uvector<vertex_t> &next_cluster_v,
+                    rmm::device_uvector<weight_t> &delta_Q_v,
                     bool up_down)
   {
-    rmm::device_vector<vertex_t> temp_vertices_v(graph.number_of_vertices);
-    rmm::device_vector<vertex_t> temp_cluster_v(graph.number_of_vertices, vertex_t{-1});
-    rmm::device_vector<weight_t> temp_delta_Q_v(graph.number_of_vertices, weight_t{0.0});
+    rmm::device_uvector<vertex_t> temp_vertices_v(graph.number_of_vertices, stream_);
+    rmm::device_uvector<vertex_t> temp_cluster_v(graph.number_of_vertices, stream_);
+    rmm::device_uvector<weight_t> temp_delta_Q_v(graph.number_of_vertices, stream_);
+
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_),
+                 temp_cluster_v.begin(),
+                 temp_cluster_v.end(),
+                 vertex_t{-1});
 
-    weight_t *d_delta_Q              = delta_Q_v.data().get();
-    vertex_t *d_next_cluster         = next_cluster_v.data().get();
-    vertex_t *d_cluster_hash         = cluster_hash_v.data().get();
-    weight_t const *d_vertex_weights = vertex_weights_v_.data().get();
-    weight_t *d_cluster_weights      = cluster_weights_v_.data().get();
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_),
+                 temp_delta_Q_v.begin(),
+                 temp_delta_Q_v.end(),
+                 weight_t{0});
 
     auto cluster_reduce_iterator =
-      thrust::make_zip_iterator(thrust::make_tuple(d_cluster_hash, d_delta_Q));
+      thrust::make_zip_iterator(thrust::make_tuple(cluster_hash_v.begin(), delta_Q_v.begin()));
 
-    auto output_edge_iterator2 = thrust::make_zip_iterator(
-      thrust::make_tuple(temp_cluster_v.data().get(), temp_delta_Q_v.data().get()));
+    auto output_edge_iterator2 =
+      thrust::make_zip_iterator(thrust::make_tuple(temp_cluster_v.begin(), temp_delta_Q_v.begin()));
 
     auto cluster_reduce_end =
       thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_),
                             src_indices_v_.begin(),
                             src_indices_v_.end(),
                             cluster_reduce_iterator,
-                            temp_vertices_v.data().get(),
+                            temp_vertices_v.data(),
                             output_edge_iterator2,
                             thrust::equal_to<vertex_t>(),
                             [] __device__(auto pair1, auto pair2) {
@@ -422,22 +445,18 @@ class Louvain {
                                 return pair2;
                             });
 
-    vertex_t final_size = thrust::distance(temp_vertices_v.data().get(), cluster_reduce_end.first);
-
-    vertex_t *d_temp_vertices = temp_vertices_v.data().get();
-    vertex_t *d_temp_clusters = temp_cluster_v.data().get();
-    weight_t *d_temp_delta_Q  = temp_delta_Q_v.data().get();
+    vertex_t final_size = thrust::distance(temp_vertices_v.data(), cluster_reduce_end.first);
 
     thrust::for_each(rmm::exec_policy(stream_)->on(stream_),
                      thrust::make_counting_iterator<vertex_t>(0),
                      thrust::make_counting_iterator<vertex_t>(final_size),
-                     [d_temp_delta_Q,
-                      up_down,
-                      d_next_cluster,
-                      d_temp_vertices,
-                      d_vertex_weights,
-                      d_temp_clusters,
-                      d_cluster_weights] __device__(vertex_t id) {
+                     [up_down,
+                      d_temp_delta_Q    = temp_delta_Q_v.data(),
+                      d_next_cluster    = next_cluster_v.data(),
+                      d_temp_vertices   = temp_vertices_v.data(),
+                      d_vertex_weights  = vertex_weights_v_.data(),
+                      d_temp_clusters   = temp_cluster_v.data(),
+                      d_cluster_weights = cluster_weights_v_.data()] __device__(vertex_t id) {
                        if ((d_temp_clusters[id] >= 0) && (d_temp_delta_Q[id] > weight_t{0.0})) {
                          vertex_t new_cluster = d_temp_clusters[id];
                          vertex_t old_cluster = d_next_cluster[d_temp_vertices[id]];
@@ -453,38 +472,38 @@ class Louvain {
                      });
   }
 
-  void shrink_graph(graph_t &graph, vertex_t *d_cluster_vec)
+  void shrink_graph(graph_t &graph)
   {
     timer_start("shrinking graph");
 
     // renumber the clusters to the range 0..(num_clusters-1)
-    vertex_t num_clusters = renumber_clusters(d_cluster_vec);
-    cluster_weights_v_.resize(num_clusters);
+    vertex_t num_clusters = renumber_clusters();
+    cluster_weights_v_.resize(num_clusters, stream_);
 
     // shrink our graph to represent the graph of supervertices
     generate_superverticies_graph(graph, num_clusters);
 
-    // assign each new vertex to its own cluster
-    thrust::sequence(rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end());
-
     timer_stop(stream_);
   }
 
-  vertex_t renumber_clusters(vertex_t *d_cluster_vec)
+  vertex_t renumber_clusters()
   {
-    vertex_t *d_tmp_array       = tmp_arr_v_.data().get();
-    vertex_t *d_cluster_inverse = cluster_inverse_v_.data().get();
-    vertex_t *d_cluster         = cluster_v_.data().get();
+    vertex_t *d_tmp_array       = tmp_arr_v_.data();
+    vertex_t *d_cluster_inverse = cluster_inverse_v_.data();
+    vertex_t *d_cluster         = dendrogram_->current_level_begin();
 
-    vertex_t old_num_clusters = cluster_v_.size();
+    vertex_t old_num_clusters = dendrogram_->current_level_size();
 
     //
     //  New technique.  Initialize cluster_inverse_v_ to 0
     //
-    thrust::fill(cluster_inverse_v_.begin(), cluster_inverse_v_.end(), vertex_t{0});
+    thrust::fill(rmm::exec_policy(stream_)->on(stream_),
+                 cluster_inverse_v_.begin(),
+                 cluster_inverse_v_.end(),
+                 vertex_t{0});
 
     //
-    // Iterate over every element c in cluster_v_ and set cluster_inverse_v to 1
+    // Iterate over every element c in the current clustering and set cluster_inverse_v to 1
     //
     auto first_1 = thrust::make_constant_iterator<vertex_t>(1);
     auto last_1  = first_1 + old_num_clusters;
@@ -492,7 +511,7 @@ class Louvain {
     thrust::scatter(rmm::exec_policy(stream_)->on(stream_),
                     first_1,
                     last_1,
-                    cluster_v_.begin(),
+                    dendrogram_->current_level_begin(),
                     cluster_inverse_v_.begin());
 
     //
@@ -506,7 +525,7 @@ class Louvain {
       [d_cluster_inverse] __device__(const vertex_t idx) { return d_cluster_inverse[idx] == 1; });
 
     vertex_t new_num_clusters = thrust::distance(tmp_arr_v_.begin(), copy_end);
-    tmp_arr_v_.resize(new_num_clusters);
+    tmp_arr_v_.resize(new_num_clusters, stream_);
 
     //
     // Now we can set each value in cluster_inverse of a cluster to its index
@@ -525,32 +544,16 @@ class Louvain {
                        d_cluster[i] = d_cluster_inverse[d_cluster[i]];
                      });
 
-    thrust::for_each(rmm::exec_policy(stream_)->on(stream_),
-                     thrust::make_counting_iterator<vertex_t>(0),
-                     thrust::make_counting_iterator<vertex_t>(number_of_vertices_),
-                     [d_cluster_vec, d_cluster] __device__(vertex_t i) {
-                       d_cluster_vec[i] = d_cluster[d_cluster_vec[i]];
-                     });
-
-    cluster_inverse_v_.resize(new_num_clusters);
-    cluster_v_.resize(new_num_clusters);
+    cluster_inverse_v_.resize(new_num_clusters, stream_);
 
     return new_num_clusters;
   }
 
   void generate_superverticies_graph(graph_t &graph, vertex_t num_clusters)
   {
-    rmm::device_vector<vertex_t> new_src_v(graph.number_of_edges);
-    rmm::device_vector<vertex_t> new_dst_v(graph.number_of_edges);
-    rmm::device_vector<weight_t> new_weight_v(graph.number_of_edges);
-
-    vertex_t *d_old_src    = src_indices_v_.data().get();
-    vertex_t *d_old_dst    = graph.indices;
-    weight_t *d_old_weight = graph.edge_data;
-    vertex_t *d_new_src    = new_src_v.data().get();
-    vertex_t *d_new_dst    = new_dst_v.data().get();
-    vertex_t *d_clusters   = cluster_v_.data().get();
-    weight_t *d_new_weight = new_weight_v.data().get();
+    rmm::device_uvector<vertex_t> new_src_v(graph.number_of_edges, stream_);
+    rmm::device_uvector<vertex_t> new_dst_v(graph.number_of_edges, stream_);
+    rmm::device_uvector<weight_t> new_weight_v(graph.number_of_edges, stream_);
 
     //
     //  Renumber the COO
@@ -558,13 +561,13 @@ class Louvain {
     thrust::for_each(rmm::exec_policy(stream_)->on(stream_),
                      thrust::make_counting_iterator<edge_t>(0),
                      thrust::make_counting_iterator<edge_t>(graph.number_of_edges),
-                     [d_old_src,
-                      d_old_dst,
-                      d_old_weight,
-                      d_new_src,
-                      d_new_dst,
-                      d_new_weight,
-                      d_clusters] __device__(edge_t e) {
+                     [d_old_src    = src_indices_v_.data(),
+                      d_old_dst    = graph.indices,
+                      d_old_weight = graph.edge_data,
+                      d_new_src    = new_src_v.data(),
+                      d_new_dst    = new_dst_v.data(),
+                      d_new_weight = new_weight_v.data(),
+                      d_clusters   = dendrogram_->current_level_begin()] __device__(edge_t e) {
                        d_new_src[e]    = d_clusters[d_old_src[e]];
                        d_new_dst[e]    = d_clusters[d_old_dst[e]];
                        d_new_weight[e] = d_old_weight[e];
@@ -572,39 +575,42 @@ class Louvain {
 
     thrust::stable_sort_by_key(
       rmm::exec_policy(stream_)->on(stream_),
-      d_new_dst,
-      d_new_dst + graph.number_of_edges,
-      thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_weight)));
+      new_dst_v.begin(),
+      new_dst_v.end(),
+      thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_weight_v.begin())));
     thrust::stable_sort_by_key(
       rmm::exec_policy(stream_)->on(stream_),
-      d_new_src,
-      d_new_src + graph.number_of_edges,
-      thrust::make_zip_iterator(thrust::make_tuple(d_new_dst, d_new_weight)));
+      new_src_v.begin(),
+      new_src_v.end(),
+      thrust::make_zip_iterator(thrust::make_tuple(new_dst_v.begin(), new_weight_v.begin())));
 
     //
     //  Now we reduce by key to combine the weights of duplicate
     //  edges.
     //
-    auto start     = thrust::make_zip_iterator(thrust::make_tuple(d_new_src, d_new_dst));
-    auto new_start = thrust::make_zip_iterator(thrust::make_tuple(d_old_src, d_old_dst));
-    auto new_end   = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_),
+    auto start =
+      thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin()));
+    auto new_start =
+      thrust::make_zip_iterator(thrust::make_tuple(src_indices_v_.data(), graph.indices));
+    auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_),
                                          start,
                                          start + graph.number_of_edges,
-                                         d_new_weight,
+                                         new_weight_v.begin(),
                                          new_start,
-                                         d_old_weight,
+                                         graph.edge_data,
                                          thrust::equal_to<thrust::tuple<vertex_t, vertex_t>>(),
                                          thrust::plus<weight_t>());
 
     graph.number_of_edges    = thrust::distance(new_start, new_end.first);
     graph.number_of_vertices = num_clusters;
 
-    detail::fill_offset(d_old_src, graph.offsets, num_clusters, graph.number_of_edges, stream_);
+    detail::fill_offset(
+      src_indices_v_.data(), graph.offsets, num_clusters, graph.number_of_edges, stream_);
     CHECK_CUDA(stream_);
 
-    src_indices_v_.resize(graph.number_of_edges);
-    indices_v_.resize(graph.number_of_edges);
-    weights_v_.resize(graph.number_of_edges);
+    src_indices_v_.resize(graph.number_of_edges, stream_);
+    indices_v_.resize(graph.number_of_edges, stream_);
+    weights_v_.resize(graph.number_of_edges, stream_);
   }
 
  protected:
@@ -613,27 +619,28 @@ class Louvain {
   edge_t number_of_edges_;
   cudaStream_t stream_;
 
+  std::unique_ptr<Dendrogram<vertex_t>> dendrogram_;
+
   //
   //  Copy of graph
   //
-  rmm::device_vector<edge_t> offsets_v_;
-  rmm::device_vector<vertex_t> indices_v_;
-  rmm::device_vector<weight_t> weights_v_;
-  rmm::device_vector<vertex_t> src_indices_v_;
+  rmm::device_uvector<edge_t> offsets_v_;
+  rmm::device_uvector<vertex_t> indices_v_;
+  rmm::device_uvector<weight_t> weights_v_;
+  rmm::device_uvector<vertex_t> src_indices_v_;
 
   //
   //  Weights and clustering across iterations of algorithm
   //
-  rmm::device_vector<weight_t> vertex_weights_v_;
-  rmm::device_vector<weight_t> cluster_weights_v_;
-  rmm::device_vector<vertex_t> cluster_v_;
+  rmm::device_uvector<weight_t> vertex_weights_v_;
+  rmm::device_uvector<weight_t> cluster_weights_v_;
 
   //
   //  Temporaries used within kernels.  Each iteration uses less
   //  of this memory
   //
-  rmm::device_vector<vertex_t> tmp_arr_v_;
-  rmm::device_vector<vertex_t> cluster_inverse_v_;
+  rmm::device_uvector<vertex_t> tmp_arr_v_;
+  rmm::device_uvector<vertex_t> cluster_inverse_v_;
 
 #ifdef TIMING
   HighResTimer hr_timer_;
diff --git a/cpp/src/components/connectivity.cu b/cpp/src/components/connectivity.cu
index 2cc1da017a9..f4c7bf1d35c 100644
--- a/cpp/src/components/connectivity.cu
+++ b/cpp/src/components/connectivity.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,8 +64,8 @@ std::enable_if_t<std::is_signed<VT>::value> connected_components_impl(
 {
   using ByteT = unsigned char;  // minimum addressable unit
 
-  CUGRAPH_EXPECTS(graph.offsets != nullptr, "Invalid API parameter: graph.offsets is nullptr");
-  CUGRAPH_EXPECTS(graph.indices != nullptr, "Invalid API parameter: graph.indices is nullptr");
+  CUGRAPH_EXPECTS(graph.offsets != nullptr, "Invalid input argument: graph.offsets is nullptr");
+  CUGRAPH_EXPECTS(graph.indices != nullptr, "Invalid input argument: graph.indices is nullptr");
 
   VT nrows = graph.number_of_vertices;
 
@@ -90,7 +90,7 @@ void connected_components(GraphCSRView<VT, ET, WT> const &graph,
 {
   cudaStream_t stream{nullptr};
 
-  CUGRAPH_EXPECTS(labels != nullptr, "Invalid API parameter: labels parameter is NULL");
+  CUGRAPH_EXPECTS(labels != nullptr, "Invalid input argument: labels parameter is NULL");
 
   return detail::connected_components_impl<VT, ET, WT>(graph, connectivity_type, labels, stream);
 }
diff --git a/cpp/src/cores/core_number.cu b/cpp/src/cores/core_number.cu
index cd2b928a81e..091ba07ccc6 100644
--- a/cpp/src/cores/core_number.cu
+++ b/cpp/src/cores/core_number.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -151,9 +151,9 @@ std::unique_ptr<GraphCOO<VT, ET, WT>> k_core(GraphCOOView<VT, ET, WT> const &in_
                                              VT num_vertex_ids,
                                              rmm::mr::device_memory_resource *mr)
 {
-  CUGRAPH_EXPECTS(vertex_id != nullptr, "Invalid API parameter: vertex_id is NULL");
-  CUGRAPH_EXPECTS(core_number != nullptr, "Invalid API parameter: core_number is NULL");
-  CUGRAPH_EXPECTS(k >= 0, "Invalid API parameter: k must be >= 0");
+  CUGRAPH_EXPECTS(vertex_id != nullptr, "Invalid input argument: vertex_id is NULL");
+  CUGRAPH_EXPECTS(core_number != nullptr, "Invalid input argument: core_number is NULL");
+  CUGRAPH_EXPECTS(k >= 0, "Invalid input argument: k must be >= 0");
 
   return detail::extract_subgraph(in_graph, vertex_id, core_number, k, num_vertex_ids, mr);
 }
diff --git a/cpp/src/experimental/bfs.cu b/cpp/src/experimental/bfs.cu
index f297587a1d6..7adfbd7fbd7 100644
--- a/cpp/src/experimental/bfs.cu
+++ b/cpp/src/experimental/bfs.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -165,8 +165,6 @@ void bfs(raft::handle_t const &handle,
     handle.get_stream()));  // this is as necessary vertex_frontier will become out-of-scope once
                             // this function returns (FIXME: should I stream sync in VertexFrontier
                             // destructor?)
-
-  return;
 }
 
 }  // namespace detail
diff --git a/cpp/src/experimental/coarsen_graph.cu b/cpp/src/experimental/coarsen_graph.cu
new file mode 100644
index 00000000000..0cd551b0d73
--- /dev/null
+++ b/cpp/src/experimental/coarsen_graph.cu
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph.hpp>
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+#include <patterns/copy_to_adj_matrix_row_col.cuh>
+#include <utilities/error.hpp>
+#include <utilities/shuffle_comm.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <tuple>
+#include <utility>
+
+namespace cugraph {
+namespace experimental {
+namespace detail {
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
+  compressed_sparse_to_edgelist(edge_t const *compressed_sparse_offsets,
+                                vertex_t const *compressed_sparse_indices,
+                                weight_t const *compressed_sparse_weights,
+                                vertex_t major_first,
+                                vertex_t major_last,
+                                cudaStream_t stream)
+{
+  edge_t number_of_edges{0};
+  raft::update_host(
+    &number_of_edges, compressed_sparse_offsets + (major_last - major_first), 1, stream);
+  CUDA_TRY(cudaStreamSynchronize(stream));
+  rmm::device_uvector<vertex_t> edgelist_major_vertices(number_of_edges, stream);
+  rmm::device_uvector<vertex_t> edgelist_minor_vertices(number_of_edges, stream);
+  rmm::device_uvector<weight_t> edgelist_weights(
+    compressed_sparse_weights != nullptr ? number_of_edges : 0, stream);
+
+  // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can
+  // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA
+  // warp per vertex, and low-degree vertices using one CUDA thread per block
+  thrust::for_each(rmm::exec_policy(stream)->on(stream),
+                   thrust::make_counting_iterator(major_first),
+                   thrust::make_counting_iterator(major_last),
+                   [compressed_sparse_offsets,
+                    major_first,
+                    p_majors = edgelist_major_vertices.begin()] __device__(auto v) {
+                     auto first = compressed_sparse_offsets[v - major_first];
+                     auto last  = compressed_sparse_offsets[v - major_first + 1];
+                     thrust::fill(thrust::seq, p_majors + first, p_majors + last, v);
+                   });
+  thrust::copy(rmm::exec_policy(stream)->on(stream),
+               compressed_sparse_indices,
+               compressed_sparse_indices + number_of_edges,
+               edgelist_minor_vertices.begin());
+  if (compressed_sparse_weights != nullptr) {
+    thrust::copy(rmm::exec_policy(stream)->on(stream),
+                 compressed_sparse_weights,
+                 compressed_sparse_weights + number_of_edges,
+                 edgelist_weights.data());
+  }
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+template <typename vertex_t, typename weight_t>
+void sort_and_coarsen_edgelist(rmm::device_uvector<vertex_t> &edgelist_major_vertices /* [INOUT] */,
+                               rmm::device_uvector<vertex_t> &edgelist_minor_vertices /* [INOUT] */,
+                               rmm::device_uvector<weight_t> &edgelist_weights /* [INOUT] */,
+                               cudaStream_t stream)
+{
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+
+  size_t number_of_edges{0};
+  if (edgelist_weights.size() > 0) {
+    thrust::sort_by_key(rmm::exec_policy(stream)->on(stream),
+                        pair_first,
+                        pair_first + edgelist_major_vertices.size(),
+                        edgelist_weights.begin());
+
+    rmm::device_uvector<vertex_t> tmp_edgelist_major_vertices(edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<vertex_t> tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(),
+                                                              stream);
+    rmm::device_uvector<weight_t> tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream);
+    auto it = thrust::reduce_by_key(
+      rmm::exec_policy(stream)->on(stream),
+      pair_first,
+      pair_first + edgelist_major_vertices.size(),
+      edgelist_weights.begin(),
+      thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(),
+                                                   tmp_edgelist_minor_vertices.begin())),
+      tmp_edgelist_weights.begin());
+    number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it));
+
+    edgelist_major_vertices = std::move(tmp_edgelist_major_vertices);
+    edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices);
+    edgelist_weights        = std::move(tmp_edgelist_weights);
+  } else {
+    thrust::sort(rmm::exec_policy(stream)->on(stream),
+                 pair_first,
+                 pair_first + edgelist_major_vertices.size());
+    auto it         = thrust::unique(rmm::exec_policy(stream)->on(stream),
+                             pair_first,
+                             pair_first + edgelist_major_vertices.size());
+    number_of_edges = thrust::distance(pair_first, it);
+  }
+
+  edgelist_major_vertices.resize(number_of_edges, stream);
+  edgelist_minor_vertices.resize(number_of_edges, stream);
+  edgelist_weights.resize(number_of_edges, stream);
+  edgelist_major_vertices.shrink_to_fit(stream);
+  edgelist_minor_vertices.shrink_to_fit(stream);
+  edgelist_weights.shrink_to_fit(stream);
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
+  compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+    edge_t const *compressed_sparse_offsets,
+    vertex_t const *compressed_sparse_indices,
+    weight_t const *compressed_sparse_weights,
+    vertex_t const *p_major_labels,
+    vertex_t const *p_minor_labels,
+    vertex_t major_first,
+    vertex_t major_last,
+    vertex_t minor_first,
+    vertex_t minor_last,
+    cudaStream_t stream)
+{
+  // FIXME: it might be possible to directly create relabled & coarsened edgelist from the
+  // compressed sparse format to save memory
+
+  rmm::device_uvector<vertex_t> edgelist_major_vertices(0, stream);
+  rmm::device_uvector<vertex_t> edgelist_minor_vertices(0, stream);
+  rmm::device_uvector<weight_t> edgelist_weights(0, stream);
+  std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) =
+    compressed_sparse_to_edgelist(compressed_sparse_offsets,
+                                  compressed_sparse_indices,
+                                  compressed_sparse_weights,
+                                  major_first,
+                                  major_last,
+                                  stream);
+
+  auto pair_first = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+  thrust::transform(
+    rmm::exec_policy(stream)->on(stream),
+    pair_first,
+    pair_first + edgelist_major_vertices.size(),
+    pair_first,
+    [p_major_labels, p_minor_labels, major_first, minor_first] __device__(auto val) {
+      return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first],
+                                p_minor_labels[thrust::get<1>(val) - minor_first]);
+    });
+
+  sort_and_coarsen_edgelist(
+    edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream);
+
+  return std::make_tuple(std::move(edgelist_major_vertices),
+                         std::move(edgelist_minor_vertices),
+                         std::move(edgelist_weights));
+}
+
+// multi-GPU version
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::enable_if_t<
+  multi_gpu,
+  std::tuple<std::unique_ptr<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>,
+             rmm::device_uvector<vertex_t>>>
+coarsen_graph(
+  raft::handle_t const &handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const &graph_view,
+  vertex_t const *labels,
+  bool do_expensive_check)
+{
+  auto &comm               = handle.get_comms();
+  auto const comm_size     = comm.get_size();
+  auto const comm_rank     = comm.get_rank();
+  auto &row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+  auto const row_comm_size = row_comm.get_size();
+  auto const row_comm_rank = row_comm.get_rank();
+  auto &col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+  auto const col_comm_size = col_comm.get_size();
+  auto const col_comm_rank = col_comm.get_rank();
+
+  if (do_expensive_check) {
+    // currently, nothing to do
+  }
+
+  // 1. locally construct coarsened edge list
+
+  // FIXME: we don't need adj_matrix_major_labels if we apply the same partitioning scheme
+  // regardless of hypergraph partitioning is applied or not
+  rmm::device_uvector<vertex_t> adj_matrix_major_labels(
+    store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols()
+                     : graph_view.get_number_of_local_adj_matrix_partition_rows(),
+    handle.get_stream());
+  rmm::device_uvector<vertex_t> adj_matrix_minor_labels(
+    store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows()
+                     : graph_view.get_number_of_local_adj_matrix_partition_cols(),
+    handle.get_stream());
+  if (store_transposed) {
+    copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_major_labels.data());
+    copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data());
+  } else {
+    copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_major_labels.data());
+    copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data());
+  }
+
+  rmm::device_uvector<vertex_t> coarsened_edgelist_major_vertices(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> coarsened_edgelist_minor_vertices(0, handle.get_stream());
+  rmm::device_uvector<weight_t> coarsened_edgelist_weights(0, handle.get_stream());
+  // FIXME: we may compare performance/memory footprint with the hash_based approach especially when
+  // cuco::dynamic_map becomes available (so we don't need to preallocate memory assuming the worst
+  // case). We may be able to limit the memory requirement close to the final coarsened edgelist
+  // with the hash based approach.
+  for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) {
+    // get edge list
+
+    rmm::device_uvector<vertex_t> edgelist_major_vertices(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> edgelist_minor_vertices(0, handle.get_stream());
+    rmm::device_uvector<weight_t> edgelist_weights(0, handle.get_stream());
+    std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) =
+      compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+        graph_view.offsets(i),
+        graph_view.indices(i),
+        graph_view.weights(i),
+        adj_matrix_major_labels.begin() +
+          (store_transposed ? graph_view.get_local_adj_matrix_partition_col_value_start_offset(i)
+                            : graph_view.get_local_adj_matrix_partition_row_value_start_offset(i)),
+        adj_matrix_minor_labels.begin(),
+        store_transposed ? graph_view.get_local_adj_matrix_partition_col_first(i)
+                         : graph_view.get_local_adj_matrix_partition_row_first(i),
+        store_transposed ? graph_view.get_local_adj_matrix_partition_col_last(i)
+                         : graph_view.get_local_adj_matrix_partition_row_last(i),
+        store_transposed ? graph_view.get_local_adj_matrix_partition_row_first(i)
+                         : graph_view.get_local_adj_matrix_partition_col_first(i),
+        store_transposed ? graph_view.get_local_adj_matrix_partition_row_last(i)
+                         : graph_view.get_local_adj_matrix_partition_col_last(i),
+        handle.get_stream());
+
+    auto cur_size = coarsened_edgelist_major_vertices.size();
+    // FIXME: this can lead to frequent costly reallocation; we may be able to avoid this if we can
+    // reserve address space to avoid expensive reallocation.
+    // https://devblogs.nvidia.com/introducing-low-level-gpu-virtual-memory-management
+    coarsened_edgelist_major_vertices.resize(cur_size + edgelist_major_vertices.size(),
+                                             handle.get_stream());
+    coarsened_edgelist_minor_vertices.resize(coarsened_edgelist_major_vertices.size(),
+                                             handle.get_stream());
+    coarsened_edgelist_weights.resize(
+      graph_view.is_weighted() ? coarsened_edgelist_major_vertices.size() : 0, handle.get_stream());
+
+    if (graph_view.is_weighted()) {
+      auto src_edge_first =
+        thrust::make_zip_iterator(thrust::make_tuple(edgelist_major_vertices.begin(),
+                                                     edgelist_minor_vertices.begin(),
+                                                     edgelist_weights.begin()));
+      auto dst_edge_first =
+        thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
+                                                     coarsened_edgelist_minor_vertices.begin(),
+                                                     coarsened_edgelist_weights.begin())) +
+        cur_size;
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   src_edge_first,
+                   src_edge_first + edgelist_major_vertices.size(),
+                   dst_edge_first);
+    } else {
+      auto src_edge_first = thrust::make_zip_iterator(
+        thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin()));
+      auto dst_edge_first =
+        thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
+                                                     coarsened_edgelist_minor_vertices.begin())) +
+        cur_size;
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   src_edge_first,
+                   src_edge_first + edgelist_major_vertices.size(),
+                   dst_edge_first);
+    }
+  }
+
+  sort_and_coarsen_edgelist(coarsened_edgelist_major_vertices,
+                            coarsened_edgelist_minor_vertices,
+                            coarsened_edgelist_weights,
+                            handle.get_stream());
+
+  // 2. globally shuffle edge list and re-coarsen
+
+  {
+    auto edge_first =
+      thrust::make_zip_iterator(thrust::make_tuple(coarsened_edgelist_major_vertices.begin(),
+                                                   coarsened_edgelist_minor_vertices.begin(),
+                                                   coarsened_edgelist_weights.begin()));
+    rmm::device_uvector<vertex_t> rx_edgelist_major_vertices(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> rx_edgelist_minor_vertices(0, handle.get_stream());
+    rmm::device_uvector<weight_t> rx_edgelist_weights(0, handle.get_stream());
+    std::forward_as_tuple(
+      std::tie(rx_edgelist_major_vertices, rx_edgelist_minor_vertices, rx_edgelist_weights),
+      std::ignore) =
+      groupby_gpuid_and_shuffle_values(
+        handle.get_comms(),
+        edge_first,
+        edge_first + coarsened_edgelist_major_vertices.size(),
+        [key_func =
+           detail::compute_gpu_id_from_edge_t<vertex_t>{graph_view.is_hypergraph_partitioned(),
+                                                        comm.get_size(),
+                                                        row_comm.get_size(),
+                                                        col_comm.get_size()}] __device__(auto val) {
+          return key_func(thrust::get<0>(val), thrust::get<1>(val));
+        },
+        handle.get_stream());
+
+    sort_and_coarsen_edgelist(rx_edgelist_major_vertices,
+                              rx_edgelist_minor_vertices,
+                              rx_edgelist_weights,
+                              handle.get_stream());
+
+    coarsened_edgelist_major_vertices = std::move(rx_edgelist_major_vertices);
+    coarsened_edgelist_minor_vertices = std::move(rx_edgelist_minor_vertices);
+    coarsened_edgelist_weights        = std::move(rx_edgelist_weights);
+  }
+
+  // 3. find unique labels for this GPU
+
+  rmm::device_uvector<vertex_t> unique_labels(graph_view.get_number_of_local_vertices(),
+                                              handle.get_stream());
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               labels,
+               labels + unique_labels.size(),
+               unique_labels.begin());
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               unique_labels.begin(),
+               unique_labels.end());
+  unique_labels.resize(
+    thrust::distance(unique_labels.begin(),
+                     thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    unique_labels.begin(),
+                                    unique_labels.end())),
+    handle.get_stream());
+
+  rmm::device_uvector<vertex_t> rx_unique_labels(0, handle.get_stream());
+  std::tie(rx_unique_labels, std::ignore) = groupby_gpuid_and_shuffle_values(
+    handle.get_comms(),
+    unique_labels.begin(),
+    unique_labels.end(),
+    [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm.get_size()}] __device__(
+      auto val) { return key_func(val); },
+    handle.get_stream());
+
+  unique_labels = std::move(rx_unique_labels);
+
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               unique_labels.begin(),
+               unique_labels.end());
+  unique_labels.resize(
+    thrust::distance(unique_labels.begin(),
+                     thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    unique_labels.begin(),
+                                    unique_labels.end())),
+    handle.get_stream());
+
+  // 4. renumber
+
+  rmm::device_uvector<vertex_t> renumber_map_labels(0, handle.get_stream());
+  partition_t<vertex_t> partition(std::vector<vertex_t>(comm_size + 1, 0),
+                                  graph_view.is_hypergraph_partitioned(),
+                                  row_comm_size,
+                                  col_comm_size,
+                                  row_comm_rank,
+                                  col_comm_rank);
+  vertex_t number_of_vertices{};
+  edge_t number_of_edges{};
+  std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) =
+    renumber_edgelist<vertex_t, edge_t, multi_gpu>(
+      handle,
+      unique_labels.data(),
+      static_cast<vertex_t>(unique_labels.size()),
+      coarsened_edgelist_major_vertices.data(),
+      coarsened_edgelist_minor_vertices.data(),
+      static_cast<edge_t>(coarsened_edgelist_major_vertices.size()),
+      graph_view.is_hypergraph_partitioned(),
+      do_expensive_check);
+
+  // 5. build a graph
+
+  std::vector<edgelist_t<vertex_t, edge_t, weight_t>> edgelists{};
+  if (graph_view.is_hypergraph_partitioned()) {
+    CUGRAPH_FAIL("unimplemented.");
+  } else {
+    edgelists.resize(1);
+    edgelists[0].p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data()
+                                                   : coarsened_edgelist_major_vertices.data();
+    edgelists[0].p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data()
+                                                   : coarsened_edgelist_minor_vertices.data();
+    edgelists[0].p_edge_weights  = coarsened_edgelist_weights.data();
+    edgelists[0].number_of_edges = static_cast<edge_t>(coarsened_edgelist_major_vertices.size());
+  }
+
+  return std::make_tuple(
+    std::make_unique<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>(
+      handle,
+      edgelists,
+      partition,
+      number_of_vertices,
+      number_of_edges,
+      graph_properties_t{graph_view.is_symmetric(), false},
+      true),
+    std::move(renumber_map_labels));
+}
+
+// single-GPU version
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::enable_if_t<
+  !multi_gpu,
+  std::tuple<std::unique_ptr<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>,
+             rmm::device_uvector<vertex_t>>>
+coarsen_graph(
+  raft::handle_t const &handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const &graph_view,
+  vertex_t const *labels,
+  bool do_expensive_check)
+{
+  if (do_expensive_check) {
+    // currently, nothing to do
+  }
+
+  rmm::device_uvector<vertex_t> coarsened_edgelist_major_vertices(0, handle.get_stream());
+  rmm::device_uvector<vertex_t> coarsened_edgelist_minor_vertices(0, handle.get_stream());
+  rmm::device_uvector<weight_t> coarsened_edgelist_weights(0, handle.get_stream());
+  std::tie(coarsened_edgelist_major_vertices,
+           coarsened_edgelist_minor_vertices,
+           coarsened_edgelist_weights) =
+    compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist(
+      graph_view.offsets(),
+      graph_view.indices(),
+      graph_view.weights(),
+      labels,
+      labels,
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      vertex_t{0},
+      graph_view.get_number_of_vertices(),
+      handle.get_stream());
+
+  rmm::device_uvector<vertex_t> unique_labels(graph_view.get_number_of_vertices(),
+                                              handle.get_stream());
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               labels,
+               labels + unique_labels.size(),
+               unique_labels.begin());
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               unique_labels.begin(),
+               unique_labels.end());
+  unique_labels.resize(
+    thrust::distance(unique_labels.begin(),
+                     thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    unique_labels.begin(),
+                                    unique_labels.end())),
+    handle.get_stream());
+
+  auto renumber_map_labels = renumber_edgelist<vertex_t, edge_t, multi_gpu>(
+    handle,
+    unique_labels.data(),
+    static_cast<vertex_t>(unique_labels.size()),
+    coarsened_edgelist_major_vertices.data(),
+    coarsened_edgelist_minor_vertices.data(),
+    static_cast<edge_t>(coarsened_edgelist_major_vertices.size()),
+    do_expensive_check);
+
+  edgelist_t<vertex_t, edge_t, weight_t> edgelist{};
+  edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data()
+                                             : coarsened_edgelist_major_vertices.data();
+  edgelist.p_dst_vertices = store_transposed ? coarsened_edgelist_major_vertices.data()
+                                             : coarsened_edgelist_minor_vertices.data();
+  edgelist.p_edge_weights  = coarsened_edgelist_weights.data();
+  edgelist.number_of_edges = static_cast<edge_t>(coarsened_edgelist_major_vertices.size());
+
+  return std::make_tuple(
+    std::make_unique<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>(
+      handle,
+      edgelist,
+      static_cast<vertex_t>(renumber_map_labels.size()),
+      graph_properties_t{graph_view.is_symmetric(), false},
+      true),
+    std::move(renumber_map_labels));
+}
+
+}  // namespace detail
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<std::unique_ptr<graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>,
+           rmm::device_uvector<vertex_t>>
+coarsen_graph(
+  raft::handle_t const &handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const &graph_view,
+  vertex_t const *labels,
+  bool do_expensive_check)
+{
+  return detail::coarsen_graph(handle, graph_view, labels, do_expensive_check);
+}
+
+// explicit instantiation
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, float, true, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, float, true, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, float, false, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, float, false, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, float, true, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, float, true, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, float, false, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, float, false, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, float, true, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, float, true, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, float, false, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, float, false, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, float, true, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, float, true, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, float, false, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, float, false, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, float, true, true>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, float, true, true> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, float, false, true>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, float, false, true> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, float, true, false>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, float, true, false> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, float, false, false>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, float, false, false> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, double, true, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, double, true, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, double, false, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, double, false, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, double, true, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, double, true, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int32_t, double, false, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int32_t, double, false, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, double, true, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, double, true, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, double, false, true>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, double, false, true> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, double, true, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, double, true, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int32_t, int64_t, double, false, false>>,
+                    rmm::device_uvector<int32_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int32_t, int64_t, double, false, false> const &graph_view,
+              int32_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, double, true, true>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, double, true, true> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, double, false, true>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, double, false, true> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, double, true, false>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, double, true, false> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+template std::tuple<std::unique_ptr<graph_t<int64_t, int64_t, double, false, false>>,
+                    rmm::device_uvector<int64_t>>
+coarsen_graph(raft::handle_t const &handle,
+              graph_view_t<int64_t, int64_t, double, false, false> const &graph_view,
+              int64_t const *labels,
+              bool do_expensive_check);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/graph.cu b/cpp/src/experimental/graph.cu
index 3a2b7126d22..5cf393bfce4 100644
--- a/cpp/src/experimental/graph.cu
+++ b/cpp/src/experimental/graph.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #include <experimental/detail/graph_utils.cuh>
 #include <experimental/graph.hpp>
 #include <partition_manager.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <raft/device_atomics.cuh>
@@ -62,25 +62,19 @@ struct out_of_range_t {
 template <bool store_transposed, typename vertex_t, typename edge_t, typename weight_t>
 std::
   tuple<rmm::device_uvector<edge_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<weight_t>>
-  edge_list_to_compressed_sparse(raft::handle_t const &handle,
-                                 edgelist_t<vertex_t, edge_t, weight_t> const &edgelist,
-                                 vertex_t major_first,
-                                 vertex_t major_last,
-                                 vertex_t minor_first,
-                                 vertex_t minor_last)
+  edgelist_to_compressed_sparse(edgelist_t<vertex_t, edge_t, weight_t> const &edgelist,
+                                vertex_t major_first,
+                                vertex_t major_last,
+                                vertex_t minor_first,
+                                vertex_t minor_last,
+                                cudaStream_t stream)
 {
-  rmm::device_uvector<edge_t> offsets((major_last - major_first) + 1, handle.get_stream());
-  rmm::device_uvector<vertex_t> indices(edgelist.number_of_edges, handle.get_stream());
+  rmm::device_uvector<edge_t> offsets((major_last - major_first) + 1, stream);
+  rmm::device_uvector<vertex_t> indices(edgelist.number_of_edges, stream);
   rmm::device_uvector<weight_t> weights(
-    edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, handle.get_stream());
-  thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-               offsets.begin(),
-               offsets.end(),
-               edge_t{0});
-  thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-               indices.begin(),
-               indices.end(),
-               vertex_t{0});
+    edgelist.p_edge_weights != nullptr ? edgelist.number_of_edges : 0, stream);
+  thrust::fill(rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), edge_t{0});
+  thrust::fill(rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), vertex_t{0});
 
   // FIXME: need to performance test this code with R-mat graphs having highly-skewed degree
   // distribution. If there is a small number of vertices with very large degrees, atomicAdd can
@@ -98,7 +92,7 @@ std::
   auto p_weights =
     edgelist.p_edge_weights != nullptr ? weights.data() : static_cast<weight_t *>(nullptr);
 
-  thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+  thrust::for_each(rmm::exec_policy(stream)->on(stream),
                    store_transposed ? edgelist.p_dst_vertices : edgelist.p_src_vertices,
                    store_transposed ? edgelist.p_dst_vertices + edgelist.number_of_edges
                                     : edgelist.p_src_vertices + edgelist.number_of_edges,
@@ -106,15 +100,13 @@ std::
                      atomicAdd(p_offsets + (v - major_first), edge_t{1});
                    });
 
-  thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                         offsets.begin(),
-                         offsets.end(),
-                         offsets.begin());
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end(), offsets.begin());
 
   if (edgelist.p_edge_weights != nullptr) {
     auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(
       edgelist.p_src_vertices, edgelist.p_dst_vertices, edgelist.p_edge_weights));
-    thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    thrust::for_each(rmm::exec_policy(stream)->on(stream),
                      edge_first,
                      edge_first + edgelist.number_of_edges,
                      [p_offsets, p_indices, p_weights, major_first] __device__(auto e) {
@@ -137,7 +129,7 @@ std::
   } else {
     auto edge_first = thrust::make_zip_iterator(
       thrust::make_tuple(edgelist.p_src_vertices, edgelist.p_dst_vertices));
-    thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    thrust::for_each(rmm::exec_policy(stream)->on(stream),
                      edge_first,
                      edge_first + edgelist.number_of_edges,
                      [p_offsets, p_indices, p_weights, major_first] __device__(auto e) {
@@ -162,42 +154,6 @@ std::
   return std::make_tuple(std::move(offsets), std::move(indices), std::move(weights));
 }
 
-template <typename vertex_t, typename DegreeIterator, typename ThresholdIterator>
-std::vector<vertex_t> segment_degree_sorted_vertex_partition(raft::handle_t const &handle,
-                                                             DegreeIterator degree_first,
-                                                             DegreeIterator degree_last,
-                                                             ThresholdIterator threshold_first,
-                                                             ThresholdIterator threshold_last)
-{
-  auto num_elements = thrust::distance(degree_first, degree_last);
-  auto num_segments = thrust::distance(threshold_first, threshold_last) + 1;
-
-  std::vector<vertex_t> h_segment_offsets(num_segments + 1);
-  h_segment_offsets[0]     = 0;
-  h_segment_offsets.back() = num_elements;
-
-  rmm::device_uvector<vertex_t> d_segment_offsets(num_segments - 1, handle.get_stream());
-
-  thrust::upper_bound(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-                      degree_first,
-                      degree_last,
-                      threshold_first,
-                      threshold_last,
-                      d_segment_offsets.begin());
-
-  raft::update_host(h_segment_offsets.begin() + 1,
-                    d_segment_offsets.begin(),
-                    d_segment_offsets.size(),
-                    handle.get_stream());
-
-  CUDA_TRY(cudaStreamSynchronize(
-    handle.get_stream()));  // this is necessary as d_segment_offsets will become out-of-scope once
-                            // this function returns and this function returns a host variable which
-                            // can be used right after return.
-
-  return h_segment_offsets;
-}
-
 }  // namespace
 
 template <typename vertex_t,
@@ -233,7 +189,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
   auto default_stream      = this->get_handle_ptr()->get_stream();
 
   CUGRAPH_EXPECTS(edgelists.size() > 0,
-                  "Invalid API parameter: edgelists.size() should be non-zero.");
+                  "Invalid input argument: edgelists.size() should be non-zero.");
 
   bool is_weighted = edgelists[0].p_edge_weights != nullptr;
 
@@ -246,14 +202,14 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
                          (is_weighted && (edgelist.p_edge_weights == nullptr)) ||
                          (!is_weighted && (edgelist.p_edge_weights != nullptr));
                 }) == false,
-    "Invalid API parameter: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not "
+    "Invalid input argument: edgelists[].p_src_vertices and edgelists[].p_dst_vertices should not "
     "be nullptr and edgelists[].p_edge_weights should be nullptr (if edgelists[0].p_edge_weights "
     "is nullptr) or should not be nullptr (otherwise).");
 
   CUGRAPH_EXPECTS((partition.is_hypergraph_partitioned() &&
                    (edgelists.size() == static_cast<size_t>(col_comm_size))) ||
                     (!(partition.is_hypergraph_partitioned()) && (edgelists.size() == 1)),
-                  "Invalid API parameter: errneous edgelists.size().");
+                  "Invalid input argument: errneous edgelists.size().");
 
   // optional expensive checks (part 1/3)
 
@@ -278,17 +234,17 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
                                        edge_first + edgelists[i].number_of_edges,
                                        out_of_range_t<vertex_t>{
                                          major_first, major_last, minor_first, minor_last}) == 0,
-                      "Invalid API parameter: edgelists[] have out-of-range values.");
+                      "Invalid input argument: edgelists[] have out-of-range values.");
     }
     number_of_local_edges_sum =
       host_scalar_allreduce(comm, number_of_local_edges_sum, default_stream);
-    CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(),
-                    "Invalid API parameter: the sum of local edges doe counts not match with "
-                    "number_of_local_edges.");
+    CUGRAPH_EXPECTS(
+      number_of_local_edges_sum == this->get_number_of_edges(),
+      "Invalid input argument: the sum of local edge counts does not match with number_of_edges.");
 
     CUGRAPH_EXPECTS(
       partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices,
-      "Invalid API parameter: vertex partition should cover [0, number_of_vertices).");
+      "Invalid input argument: vertex partition should cover [0, number_of_vertices).");
   }
 
   // convert edge list (COO) to compressed sparse format (CSR or CSC)
@@ -307,8 +263,13 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
     rmm::device_uvector<edge_t> offsets(0, default_stream);
     rmm::device_uvector<vertex_t> indices(0, default_stream);
     rmm::device_uvector<weight_t> weights(0, default_stream);
-    std::tie(offsets, indices, weights) = edge_list_to_compressed_sparse<store_transposed>(
-      *(this->get_handle_ptr()), edgelists[i], major_first, major_last, minor_first, minor_last);
+    std::tie(offsets, indices, weights) =
+      edgelist_to_compressed_sparse<store_transposed>(edgelists[i],
+                                                      major_first,
+                                                      major_last,
+                                                      minor_first,
+                                                      minor_last,
+                                                      this->get_handle_ptr()->get_stream());
     adj_matrix_partition_offsets_.push_back(std::move(offsets));
     adj_matrix_partition_indices_.push_back(std::move(indices));
     if (is_weighted) { adj_matrix_partition_weights_.push_back(std::move(weights)); }
@@ -327,7 +288,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
                                         degrees.begin(),
                                         degrees.end(),
                                         thrust::greater<edge_t>{}),
-                      "Invalid API parameter: sorted_by_global_degree_within_vertex_partition is "
+                      "Invalid input argument: sorted_by_global_degree_within_vertex_partition is "
                       "set to true, but degrees are not non-ascending.");
     }
 
@@ -427,7 +388,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
 
   CUGRAPH_EXPECTS(
     (edgelist.p_src_vertices != nullptr) && (edgelist.p_dst_vertices != nullptr),
-    "Invalid API parameter: edgelist.p_src_vertices and edgelist.p_dst_vertices should "
+    "Invalid input argument: edgelist.p_src_vertices and edgelist.p_dst_vertices should "
     "not be nullptr.");
 
   // optional expensive checks (part 1/2)
@@ -443,7 +404,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
                       edge_first + edgelist.number_of_edges,
                       out_of_range_t<vertex_t>{
                         0, this->get_number_of_vertices(), 0, this->get_number_of_vertices()}) == 0,
-                    "Invalid API parameter: edgelist have out-of-range values.");
+                    "Invalid input argument: edgelist have out-of-range values.");
 
     // FIXME: check for symmetricity may better be implemetned with transpose().
     if (this->is_symmetric()) {}
@@ -455,12 +416,12 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
   // convert edge list (COO) to compressed sparse format (CSR or CSC)
 
   std::tie(offsets_, indices_, weights_) =
-    edge_list_to_compressed_sparse<store_transposed>(*(this->get_handle_ptr()),
-                                                     edgelist,
-                                                     vertex_t{0},
-                                                     this->get_number_of_vertices(),
-                                                     vertex_t{0},
-                                                     this->get_number_of_vertices());
+    edgelist_to_compressed_sparse<store_transposed>(edgelist,
+                                                    vertex_t{0},
+                                                    this->get_number_of_vertices(),
+                                                    vertex_t{0},
+                                                    this->get_number_of_vertices(),
+                                                    this->get_handle_ptr()->get_stream());
 
   // update degree-based segment offsets (to be used for graph analytics kernel optimization)
 
@@ -472,12 +433,13 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
     // optional expensive checks (part 2/2)
 
     if (do_expensive_check) {
-      CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
-                                        degree_first,
-                                        degree_first + this->get_number_of_vertices(),
-                                        thrust::greater<edge_t>{}),
-                      "Invalid API parameter: sorted_by_degree is set to true, but degrees are not "
-                      "non-ascending.");
+      CUGRAPH_EXPECTS(
+        thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
+                          degree_first,
+                          degree_first + this->get_number_of_vertices(),
+                          thrust::greater<edge_t>{}),
+        "Invalid input argument: sorted_by_degree is set to true, but degrees are not "
+        "non-ascending.");
     }
 
     static_assert(detail::num_segments_per_vertex_partition == 3);
@@ -508,9 +470,7 @@ graph_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enable_if_
       segment_offsets_.data(), segment_offsets.data(), segment_offsets.size(), default_stream);
 
     CUDA_TRY(cudaStreamSynchronize(
-      default_stream));  // this is necessary as d_thresholds and segment_offsets will become
-                         // out-of-scpe once control flow exits this block and segment_offsets_ can
-                         // be used right after return.
+      default_stream));  // this is necessary as segment_offsets_ can be used right after return.
   }
 
   // optional expensive checks (part 3/3)
diff --git a/cpp/src/experimental/graph_view.cu b/cpp/src/experimental/graph_view.cu
index 04d2ea990df..df92fd94194 100644
--- a/cpp/src/experimental/graph_view.cu
+++ b/cpp/src/experimental/graph_view.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #include <experimental/detail/graph_utils.cuh>
 #include <experimental/graph_view.hpp>
 #include <partition_manager.hpp>
-#include <utilities/comm_utils.cuh>
 #include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
 
 #include <raft/cudart_utils.h>
 #include <rmm/thrust_rmm_allocator.h>
@@ -49,6 +49,27 @@ struct out_of_range_t {
   __device__ bool operator()(vertex_t v) { return (v < min) || (v >= max); }
 };
 
+template <typename vertex_t, typename edge_t>
+std::vector<edge_t> update_adj_matrix_partition_edge_counts(
+  std::vector<edge_t const*> const& adj_matrix_partition_offsets,
+  partition_t<vertex_t> const& partition,
+  cudaStream_t stream)
+{
+  std::vector<edge_t> adj_matrix_partition_edge_counts(partition.get_number_of_matrix_partitions(),
+                                                       0);
+  for (size_t i = 0; i < adj_matrix_partition_offsets.size(); ++i) {
+    vertex_t major_first{};
+    vertex_t major_last{};
+    std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(i);
+    raft::update_host(&(adj_matrix_partition_edge_counts[i]),
+                      adj_matrix_partition_offsets[i] + (major_last - major_first),
+                      1,
+                      stream);
+  }
+  CUDA_TRY(cudaStreamSynchronize(stream));
+  return adj_matrix_partition_edge_counts;
+}
+
 }  // namespace
 
 template <typename vertex_t,
@@ -73,6 +94,8 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
     adj_matrix_partition_offsets_(adj_matrix_partition_offsets),
     adj_matrix_partition_indices_(adj_matrix_partition_indices),
     adj_matrix_partition_weights_(adj_matrix_partition_weights),
+    adj_matrix_partition_number_of_edges_(update_adj_matrix_partition_edge_counts(
+      adj_matrix_partition_offsets, partition, handle.get_stream())),
     partition_(partition),
     vertex_partition_segment_offsets_(vertex_partition_segment_offsets)
 {
@@ -87,18 +110,18 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
                                .get_size();
 
   CUGRAPH_EXPECTS(adj_matrix_partition_offsets.size() == adj_matrix_partition_indices.size(),
-                  "Invalid API parameter: adj_matrix_partition_offsets.size() and "
+                  "Internal Error: adj_matrix_partition_offsets.size() and "
                   "adj_matrix_partition_indices.size() should coincide.");
   CUGRAPH_EXPECTS((adj_matrix_partition_weights.size() == adj_matrix_partition_offsets.size()) ||
                     (adj_matrix_partition_weights.size() == 0),
-                  "Invalid API parameter: adj_matrix_partition_weights.size() should coincide with "
+                  "Internal Error: adj_matrix_partition_weights.size() should coincide with "
                   "adj_matrix_partition_offsets.size() (if weighted) or 0 (if unweighted).");
 
   CUGRAPH_EXPECTS(
     (partition.is_hypergraph_partitioned() &&
      (adj_matrix_partition_offsets.size() == static_cast<size_t>(row_comm_size))) ||
       (!(partition.is_hypergraph_partitioned()) && (adj_matrix_partition_offsets.size() == 1)),
-    "Invalid API parameter: errneous adj_matrix_partition_offsets.size().");
+    "Internal Error: erroneous adj_matrix_partition_offsets.size().");
 
   CUGRAPH_EXPECTS((sorted_by_global_degree_within_vertex_partition &&
                    (vertex_partition_segment_offsets.size() ==
@@ -106,7 +129,7 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
                       (detail::num_segments_per_vertex_partition + 1))) ||
                     (!sorted_by_global_degree_within_vertex_partition &&
                      (vertex_partition_segment_offsets.size() == 0)),
-                  "Invalid API parameter: vertex_partition_segment_offsets.size() does not match "
+                  "Internal Error: vertex_partition_segment_offsets.size() does not match "
                   "with sorted_by_global_degree_within_vertex_partition.");
 
   // optional expensive checks
@@ -133,7 +156,7 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
         thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
                           adj_matrix_partition_offsets[i],
                           adj_matrix_partition_offsets[i] + (major_last - major_first + 1)),
-        "Invalid API parameter: adj_matrix_partition_offsets[] is not sorted.");
+        "Internal Error: adj_matrix_partition_offsets[] is not sorted.");
       edge_t number_of_local_edges{};
       raft::update_host(&number_of_local_edges,
                         adj_matrix_partition_offsets[i] + (major_last - major_first),
@@ -148,22 +171,23 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
                          adj_matrix_partition_indices[i],
                          adj_matrix_partition_indices[i] + number_of_local_edges,
                          out_of_range_t<vertex_t>{minor_first, minor_last}) == 0,
-        "Invalid API parameter: adj_matrix_partition_indices[] have out-of-range vertex IDs.");
+        "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs.");
     }
     number_of_local_edges_sum = host_scalar_allreduce(
       this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream);
     CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(),
-                    "Invalid API parameter: the sum of local edges doe counts not match with "
+                    "Internal Error: the sum of local edges counts does not match with "
                     "number_of_local_edges.");
 
     if (sorted_by_global_degree_within_vertex_partition) {
       auto degrees = detail::compute_major_degree(handle, adj_matrix_partition_offsets, partition);
-      CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
-                                        degrees.begin(),
-                                        degrees.end(),
-                                        thrust::greater<edge_t>{}),
-                      "Invalid API parameter: sorted_by_global_degree_within_vertex_partition is "
-                      "set to true, but degrees are not non-ascending.");
+      CUGRAPH_EXPECTS(
+        thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
+                          degrees.begin(),
+                          degrees.end(),
+                          thrust::greater<edge_t>{}),
+        "Invalid Invalid input argument: sorted_by_global_degree_within_vertex_partition is "
+        "set to true, but degrees are not non-ascending.");
 
       for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size);
            ++i) {
@@ -171,11 +195,11 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
                                          (detail::num_segments_per_vertex_partition + 1) * i,
                                        vertex_partition_segment_offsets.begin() +
                                          (detail::num_segments_per_vertex_partition + 1) * (i + 1)),
-                        "Invalid API parameter: erroneous vertex_partition_segment_offsets.");
+                        "Internal Error: erroneous vertex_partition_segment_offsets.");
         CUGRAPH_EXPECTS(
           vertex_partition_segment_offsets[(detail::num_segments_per_vertex_partition + 1) * i] ==
             0,
-          "Invalid API parameter: erroneous vertex_partition_segment_offsets.");
+          "Internal Error: erroneous vertex_partition_segment_offsets.");
         auto vertex_partition_idx = partition.is_hypergraph_partitioned()
                                       ? row_comm_size * i + row_comm_rank
                                       : col_comm_rank * row_comm_size + i;
@@ -183,13 +207,12 @@ graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu, std::enabl
           vertex_partition_segment_offsets[(detail::num_segments_per_vertex_partition + 1) * i +
                                            detail::num_segments_per_vertex_partition] ==
             partition.get_vertex_partition_size(vertex_partition_idx),
-          "Invalid API parameter: erroneous vertex_partition_segment_offsets.");
+          "Internal Error: erroneous vertex_partition_segment_offsets.");
       }
     }
 
-    CUGRAPH_EXPECTS(
-      partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices,
-      "Invalid API parameter: vertex partition should cover [0, number_of_vertices).");
+    CUGRAPH_EXPECTS(partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices,
+                    "Internal Error: vertex partition should cover [0, number_of_vertices).");
 
     // FIXME: check for symmetricity may better be implemetned with transpose().
     if (this->is_symmetric()) {}
@@ -229,11 +252,10 @@ graph_view_t<vertex_t,
 {
   // cheap error checks
 
-  CUGRAPH_EXPECTS(
-    (sorted_by_degree &&
-     (segment_offsets.size() == (detail::num_segments_per_vertex_partition + 1))) ||
-      (!sorted_by_degree && (segment_offsets.size() == 0)),
-    "Invalid API parameter: segment_offsets.size() does not match with sorted_by_degree.");
+  CUGRAPH_EXPECTS((sorted_by_degree &&
+                   (segment_offsets.size() == (detail::num_segments_per_vertex_partition + 1))) ||
+                    (!sorted_by_degree && (segment_offsets.size() == 0)),
+                  "Internal Error: segment_offsets.size() does not match with sorted_by_degree.");
 
   // optional expensive checks
 
@@ -243,7 +265,7 @@ graph_view_t<vertex_t,
     CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream)->on(default_stream),
                                       offsets,
                                       offsets + (this->get_number_of_vertices() + 1)),
-                    "Invalid API parameter: offsets is not sorted.");
+                    "Internal Error: offsets is not sorted.");
 
     // better use thrust::any_of once https://github.com/thrust/thrust/issues/1016 is resolved
     CUGRAPH_EXPECTS(
@@ -251,7 +273,7 @@ graph_view_t<vertex_t,
                        indices,
                        indices + this->get_number_of_edges(),
                        out_of_range_t<vertex_t>{0, this->get_number_of_vertices()}) == 0,
-      "Invalid API parameter: adj_matrix_partition_indices[] have out-of-range vertex IDs.");
+      "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs.");
 
     if (sorted_by_degree) {
       auto degree_first =
@@ -261,14 +283,14 @@ graph_view_t<vertex_t,
                                         degree_first,
                                         degree_first + this->get_number_of_vertices(),
                                         thrust::greater<edge_t>{}),
-                      "Invalid API parameter: sorted_by_degree is set to true, but degrees are not "
-                      "non-ascending.");
+                      "Internal Error: sorted_by_degree is set to true, but degrees are not "
+                      "in ascending order.");
 
       CUGRAPH_EXPECTS(std::is_sorted(segment_offsets.begin(), segment_offsets.end()),
-                      "Invalid API parameter: erroneous segment_offsets.");
-      CUGRAPH_EXPECTS(segment_offsets[0] == 0, "Invalid API parameter: segment_offsets.");
+                      "Internal Error: erroneous segment_offsets.");
+      CUGRAPH_EXPECTS(segment_offsets[0] == 0, "Invalid input argument segment_offsets.");
       CUGRAPH_EXPECTS(segment_offsets.back() == this->get_number_of_vertices(),
-                      "Invalid API parameter: segment_offsets.");
+                      "Invalid input argument: segment_offsets.");
     }
 
     // FIXME: check for symmetricity may better be implemetned with transpose().
diff --git a/cpp/src/experimental/include_cuco_static_map.cuh b/cpp/src/experimental/include_cuco_static_map.cuh
new file mode 100644
index 00000000000..9e54acef72c
--- /dev/null
+++ b/cpp/src/experimental/include_cuco_static_map.cuh
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+// "FIXME": remove the guards below and references to CUCO_STATIC_MAP_DEFINED
+//
+// cuco/static_map.cuh depends on features not supported on or before Pascal.
+//
+// If we build for sm_60 or before, the inclusion of cuco/static_map.cuh wil
+// result in compilation errors.
+//
+// If we're Pascal or before we do nothing here and will suppress including
+// some code below.  If we are later than Pascal we define CUCO_STATIC_MAP_DEFINED
+// which will result in the full implementation being pulled in.
+//
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+#else
+#define CUCO_STATIC_MAP_DEFINED
+#include <cuco/static_map.cuh>
+#endif
diff --git a/cpp/src/experimental/induced_subgraph.cu b/cpp/src/experimental/induced_subgraph.cu
new file mode 100644
index 00000000000..a88adf76ef4
--- /dev/null
+++ b/cpp/src/experimental/induced_subgraph.cu
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+#include <matrix_partition_device.cuh>
+#include <utilities/error.hpp>
+#include <vertex_partition_device.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/gather.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <tuple>
+
+namespace cugraph {
+namespace experimental {
+
+template <typename vertex_t,
+          typename edge_t,
+          typename weight_t,
+          bool store_transposed,
+          bool multi_gpu>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<weight_t>,
+           rmm::device_uvector<size_t>>
+extract_induced_subgraphs(
+  raft::handle_t const &handle,
+  graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu> const &graph_view,
+  size_t const *subgraph_offsets /* size == num_subgraphs + 1 */,
+  vertex_t const *subgraph_vertices /* size == subgraph_offsets[num_subgraphs] */,
+  size_t num_subgraphs,
+  bool do_expensive_check)
+{
+  // FIXME: this code is inefficient for the vertices with their local degrees much larger than the
+  // number of vertices in the subgraphs (in this case, searching that the subgraph vertices are
+  // included in the local neighbors is more efficient than searching the local neighbors are
+  // included in the subgraph vertices). We may later add additional code to handle such cases.
+  // FIXME: we may consider the performance (speed & memory footprint, hash based approach uses
+  // extra-memory) of hash table based and binary search based approaches
+
+  // 1. check input arguments
+
+  if (do_expensive_check) {
+    size_t should_be_zero{std::numeric_limits<size_t>::max()};
+    size_t num_aggregate_subgraph_vertices{};
+    raft::update_host(&should_be_zero, subgraph_offsets, 1, handle.get_stream());
+    raft::update_host(
+      &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+    CUGRAPH_EXPECTS(should_be_zero == 0,
+                    "Invalid input argument: subgraph_offsets[0] should be 0.");
+
+    CUGRAPH_EXPECTS(
+      thrust::is_sorted(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                        subgraph_offsets,
+                        subgraph_offsets + (num_subgraphs + 1)),
+      "Invalid input argument: subgraph_offsets is not sorted.");
+    vertex_partition_device_t<graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>
+      vertex_partition(graph_view);
+    CUGRAPH_EXPECTS(thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                     subgraph_vertices,
+                                     subgraph_vertices + num_aggregate_subgraph_vertices,
+                                     [vertex_partition] __device__(auto v) {
+                                       return !vertex_partition.is_valid_vertex(v) ||
+                                              !vertex_partition.is_local_vertex_nocheck(v);
+                                     }) == 0,
+                    "Invalid input argument: subgraph_vertices has invalid vertex IDs.");
+
+    CUGRAPH_EXPECTS(
+      thrust::count_if(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        thrust::make_counting_iterator(size_t{0}),
+        thrust::make_counting_iterator(num_subgraphs),
+        [subgraph_offsets, subgraph_vertices] __device__(auto i) {
+          // vertices are sorted and unique
+          return !thrust::is_sorted(thrust::seq,
+                                    subgraph_vertices + subgraph_offsets[i],
+                                    subgraph_vertices + subgraph_offsets[i + 1]) ||
+                 (thrust::count_if(
+                    thrust::seq,
+                    thrust::make_counting_iterator(subgraph_offsets[i]),
+                    thrust::make_counting_iterator(subgraph_offsets[i + 1]),
+                    [subgraph_vertices, last = subgraph_offsets[i + 1] - 1] __device__(auto i) {
+                      return (i != last) && (subgraph_vertices[i] == subgraph_vertices[i + 1]);
+                    }) != 0);
+        }) == 0,
+      "Invalid input argument: subgraph_vertices for each subgraph idx should be sorted in "
+      "ascending order and unique.");
+  }
+
+  // 2. extract induced subgraphs
+
+  if (multi_gpu) {
+    CUGRAPH_FAIL("Unimplemented.");
+    return std::make_tuple(rmm::device_uvector<vertex_t>(0, handle.get_stream()),
+                           rmm::device_uvector<vertex_t>(0, handle.get_stream()),
+                           rmm::device_uvector<weight_t>(0, handle.get_stream()),
+                           rmm::device_uvector<size_t>(0, handle.get_stream()));
+  } else {
+    // 2-1. Phase 1: calculate memory requirements
+
+    size_t num_aggregate_subgraph_vertices{};
+    raft::update_host(
+      &num_aggregate_subgraph_vertices, subgraph_offsets + num_subgraphs, 1, handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    rmm::device_uvector<size_t> subgraph_vertex_output_offsets(
+      num_aggregate_subgraph_vertices + 1,
+      handle.get_stream());  // for each element of subgraph_vertices
+
+    matrix_partition_device_t<graph_view_t<vertex_t, edge_t, weight_t, store_transposed, multi_gpu>>
+      matrix_partition(graph_view, 0);
+    // count the numbers of the induced subgraph edges for each vertex in the aggregate subgraph
+    // vertex list.
+    thrust::transform(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(size_t{0}),
+      thrust::make_counting_iterator(num_aggregate_subgraph_vertices),
+      subgraph_vertex_output_offsets.begin(),
+      [subgraph_offsets, subgraph_vertices, num_subgraphs, matrix_partition] __device__(auto i) {
+        auto subgraph_idx = thrust::distance(
+          subgraph_offsets + 1,
+          thrust::upper_bound(thrust::seq, subgraph_offsets, subgraph_offsets + num_subgraphs, i));
+        vertex_t const *indices{nullptr};
+        weight_t const *weights{nullptr};
+        edge_t local_degree{};
+        auto major_offset =
+          matrix_partition.get_major_offset_from_major_nocheck(subgraph_vertices[i]);
+        thrust::tie(indices, weights, local_degree) =
+          matrix_partition.get_local_edges(major_offset);
+        // FIXME: this is inefficient for high local degree vertices
+        return thrust::count_if(
+          thrust::seq,
+          indices,
+          indices + local_degree,
+          [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx],
+           vertex_last =
+             subgraph_vertices + subgraph_offsets[subgraph_idx + 1]] __device__(auto nbr) {
+            return thrust::binary_search(thrust::seq, vertex_first, vertex_last, nbr);
+          });
+      });
+    thrust::exclusive_scan(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                           subgraph_vertex_output_offsets.begin(),
+                           subgraph_vertex_output_offsets.end(),
+                           subgraph_vertex_output_offsets.begin());
+
+    size_t num_aggregate_edges{};
+    raft::update_host(&num_aggregate_edges,
+                      subgraph_vertex_output_offsets.data() + num_aggregate_subgraph_vertices,
+                      1,
+                      handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    // 2-2. Phase 2: find the edges in the induced subgraphs
+
+    rmm::device_uvector<vertex_t> edge_majors(num_aggregate_edges, handle.get_stream());
+    rmm::device_uvector<vertex_t> edge_minors(num_aggregate_edges, handle.get_stream());
+    rmm::device_uvector<weight_t> edge_weights(
+      graph_view.is_weighted() ? num_aggregate_edges : size_t{0}, handle.get_stream());
+
+    // fill the edge list buffer (to be returned) for each vetex in the aggregate subgraph vertex
+    // list (use the offsets computed in the Phase 1)
+    thrust::for_each(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(size_t{0}),
+      thrust::make_counting_iterator(num_aggregate_subgraph_vertices),
+      [subgraph_offsets,
+       subgraph_vertices,
+       num_subgraphs,
+       matrix_partition,
+       subgraph_vertex_output_offsets = subgraph_vertex_output_offsets.data(),
+       edge_majors                    = edge_majors.data(),
+       edge_minors                    = edge_minors.data(),
+       edge_weights                   = edge_weights.data()] __device__(auto i) {
+        auto subgraph_idx = thrust::distance(
+          subgraph_offsets + 1,
+          thrust::upper_bound(
+            thrust::seq, subgraph_offsets, subgraph_offsets + num_subgraphs, size_t{i}));
+        vertex_t const *indices{nullptr};
+        weight_t const *weights{nullptr};
+        edge_t local_degree{};
+        auto major_offset =
+          matrix_partition.get_major_offset_from_major_nocheck(subgraph_vertices[i]);
+        thrust::tie(indices, weights, local_degree) =
+          matrix_partition.get_local_edges(major_offset);
+        if (weights != nullptr) {
+          auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple(
+            thrust::make_constant_iterator(subgraph_vertices[i]), indices, weights));
+          // FIXME: this is inefficient for high local degree vertices
+          thrust::copy_if(
+            thrust::seq,
+            triplet_first,
+            triplet_first + local_degree,
+            thrust::make_zip_iterator(thrust::make_tuple(edge_majors, edge_minors, edge_weights)) +
+              subgraph_vertex_output_offsets[i],
+            [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx],
+             vertex_last =
+               subgraph_vertices + subgraph_offsets[subgraph_idx + 1]] __device__(auto t) {
+              return thrust::binary_search(
+                thrust::seq, vertex_first, vertex_last, thrust::get<1>(t));
+            });
+        } else {
+          auto pair_first = thrust::make_zip_iterator(
+            thrust::make_tuple(thrust::make_constant_iterator(subgraph_vertices[i]), indices));
+          // FIXME: this is inefficient for high local degree vertices
+          thrust::copy_if(thrust::seq,
+                          pair_first,
+                          pair_first + local_degree,
+                          thrust::make_zip_iterator(thrust::make_tuple(edge_majors, edge_minors)) +
+                            subgraph_vertex_output_offsets[i],
+                          [vertex_first = subgraph_vertices + subgraph_offsets[subgraph_idx],
+                           vertex_last  = subgraph_vertices +
+                                         subgraph_offsets[subgraph_idx + 1]] __device__(auto t) {
+                            return thrust::binary_search(
+                              thrust::seq, vertex_first, vertex_last, thrust::get<1>(t));
+                          });
+        }
+      });
+
+    rmm::device_uvector<size_t> subgraph_edge_offsets(num_subgraphs + 1, handle.get_stream());
+    thrust::gather(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   subgraph_offsets,
+                   subgraph_offsets + (num_subgraphs + 1),
+                   subgraph_vertex_output_offsets.begin(),
+                   subgraph_edge_offsets.begin());
+
+    return std::make_tuple(std::move(edge_majors),
+                           std::move(edge_minors),
+                           std::move(edge_weights),
+                           std::move(subgraph_edge_offsets));
+  }
+}
+
+// explicit instantiation
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int32_t, float, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int32_t, float, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int32_t, double, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int32_t, double, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int64_t, float, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int64_t, float, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int64_t, double, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int32_t, int64_t, double, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int32_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int64_t, int64_t, float, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int64_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<float>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int64_t, int64_t, float, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int64_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int64_t, int64_t, double, true, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int64_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<double>,
+                    rmm::device_uvector<size_t>>
+extract_induced_subgraphs(raft::handle_t const &handle,
+                          graph_view_t<int64_t, int64_t, double, false, false> const &graph_view,
+                          size_t const *subgraph_offsets,
+                          int64_t const *subgraph_vertices,
+                          size_t num_subgraphs,
+                          bool do_expensive_check);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/katz_centrality.cu b/cpp/src/experimental/katz_centrality.cu
index 587011da817..1ab824f1c91 100644
--- a/cpp/src/experimental/katz_centrality.cu
+++ b/cpp/src/experimental/katz_centrality.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -166,8 +166,6 @@ void katz_centrality(raft::handle_t const &handle,
                       katz_centralities,
                       [l2_norm] __device__(auto val) { return val / l2_norm; });
   }
-
-  return;
 }
 
 }  // namespace detail
diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh
index 08e52092362..f162cd17a61 100644
--- a/cpp/src/experimental/louvain.cuh
+++ b/cpp/src/experimental/louvain.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,32 +22,21 @@
 #include <rmm/thrust_rmm_allocator.h>
 #include <compute_partition.cuh>
 #include <experimental/shuffle.cuh>
-#include <utilities/comm_utils.cuh>
 #include <utilities/graph_utils.cuh>
 
 #include <raft/device_atomics.cuh>
 
+#include <experimental/graph_functions.hpp>
 #include <patterns/copy_to_adj_matrix_row_col.cuh>
 #include <patterns/copy_v_transform_reduce_in_out_nbr.cuh>
 #include <patterns/transform_reduce_e.cuh>
 #include <patterns/transform_reduce_v.cuh>
 
-// "FIXME": remove the guards below and references to CUCO_STATIC_MAP_DEFINED
-//
-// cuco/static_map.cuh depends on features not supported on or before Pascal.
-//
-// If we build for sm_60 or before, the inclusion of cuco/static_map.cuh wil
-// result in compilation errors.
-//
-// If we're Pascal or before we do nothing here and will suppress including
-// some code below.  If we are later than Pascal we define CUCO_STATIC_MAP_DEFINED
-// which will result in the full implementation being pulled in.
-//
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-#else
-#define CUCO_STATIC_MAP_DEFINED
-#include <cuco/static_map.cuh>
-#endif
+#include <experimental/include_cuco_static_map.cuh>
+
+#include <community/dendrogram.cuh>
+
+#include <numeric>
 
 //#define TIMING
 
@@ -390,9 +379,9 @@ create_graph(raft::handle_t const &handle,
 //  as above would allow us to eventually run the single GPU version of single level Louvain
 //  on the contracted graphs - which should be more efficient.
 //
-// FIXME: We should return the dendogram and let the python layer clean it up (or have a
-//  separate C++ function to flatten the dendogram).  There are customers that might
-//  like the dendogram and the implementation would be a bit cleaner if we did the
+// FIXME: We should return the dendrogram and let the python layer clean it up (or have a
+//  separate C++ function to flatten the dendrogram).  There are customers that might
+//  like the dendrogram and the implementation would be a bit cleaner if we did the
 //  collapsing as a separate step
 //
 template <typename graph_view_type>
@@ -414,6 +403,7 @@ class Louvain {
       hr_timer_(),
 #endif
       handle_(handle),
+      dendrogram_(std::make_unique<Dendrogram<vertex_t>>()),
       current_graph_view_(graph_view),
       compute_partition_(graph_view),
       local_num_vertices_(graph_view.get_number_of_local_vertices()),
@@ -422,7 +412,6 @@ class Louvain {
       local_num_edges_(graph_view.get_number_of_edges()),
       vertex_weights_v_(graph_view.get_number_of_local_vertices()),
       cluster_weights_v_(graph_view.get_number_of_local_vertices()),
-      cluster_v_(graph_view.get_number_of_local_vertices()),
       number_of_vertices_(graph_view.get_number_of_local_vertices()),
       stream_(handle.get_stream())
   {
@@ -432,11 +421,16 @@ class Louvain {
       base_src_vertex_id_ = graph_view.get_local_adj_matrix_partition_row_first(0);
       base_dst_vertex_id_ = graph_view.get_local_adj_matrix_partition_col_first(0);
 
-      raft::copy(&local_num_edges_,
-                 graph_view.offsets() + graph_view.get_local_adj_matrix_partition_row_last(0) -
-                   graph_view.get_local_adj_matrix_partition_row_first(0),
-                 1,
-                 stream_);
+      local_num_edges_ = thrust::transform_reduce(
+        thrust::host,
+        thrust::make_counting_iterator<size_t>(0),
+        thrust::make_counting_iterator<size_t>(
+          graph_view.get_number_of_local_adj_matrix_partitions()),
+        [&graph_view](auto indx) {
+          return graph_view.get_number_of_local_adj_matrix_partition_edges(indx);
+        },
+        size_t{0},
+        thrust::plus<size_t>());
 
       CUDA_TRY(cudaStreamSynchronize(stream_));
     }
@@ -456,11 +450,12 @@ class Louvain {
     }
   }
 
-  virtual std::pair<size_t, weight_t> operator()(vertex_t *d_cluster_vec,
-                                                 size_t max_level,
-                                                 weight_t resolution)
+  Dendrogram<vertex_t> &get_dendrogram() const { return *dendrogram_; }
+
+  std::unique_ptr<Dendrogram<vertex_t>> move_dendrogram() { return dendrogram_; }
+
+  virtual weight_t operator()(size_t max_level, weight_t resolution)
   {
-    size_t num_level{0};
     weight_t best_modularity = weight_t{-1};
 
 #ifdef CUCO_STATIC_MAP_DEFINED
@@ -473,17 +468,12 @@ class Louvain {
       [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; },
       weight_t{0});
 
-    //
-    //  Initialize every cluster to reference each vertex to itself
-    //
-    thrust::sequence(rmm::exec_policy(stream_)->on(stream_),
-                     cluster_v_.begin(),
-                     cluster_v_.end(),
-                     base_vertex_id_);
-    thrust::copy(
-      rmm::exec_policy(stream_)->on(stream_), cluster_v_.begin(), cluster_v_.end(), d_cluster_vec);
+    while (dendrogram_->num_levels() < max_level) {
+      //
+      //  Initialize every cluster to reference each vertex to itself
+      //
+      initialize_dendrogram_level(current_graph_view_.get_number_of_local_vertices());
 
-    while (num_level < max_level) {
       compute_vertex_and_cluster_weights();
 
       weight_t new_Q = update_clustering(total_edge_weight, resolution);
@@ -492,15 +482,13 @@ class Louvain {
 
       best_modularity = new_Q;
 
-      shrink_graph(d_cluster_vec);
-
-      num_level++;
+      shrink_graph();
     }
 
     timer_display(std::cout);
 #endif
 
-    return std::make_pair(num_level, best_modularity);
+    return best_modularity;
   }
 
  protected:
@@ -528,6 +516,17 @@ class Louvain {
 #endif
   }
 
+ protected:
+  void initialize_dendrogram_level(vertex_t num_vertices)
+  {
+    dendrogram_->add_level(num_vertices);
+
+    thrust::sequence(rmm::exec_policy(stream_)->on(stream_),
+                     dendrogram_->current_level_begin(),
+                     dendrogram_->current_level_end(),
+                     base_vertex_id_);
+  }
+
  public:
   weight_t modularity(weight_t total_edge_weight, weight_t resolution)
   {
@@ -577,23 +576,16 @@ class Louvain {
                  cluster_weights_v_.begin());
 
     cache_vertex_properties(
-      vertex_weights_v_, src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_);
+      vertex_weights_v_.begin(), src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_);
 
     cache_vertex_properties(
-      cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_);
+      cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_);
 
     timer_stop(stream_);
   }
 
-  //
-  // FIXME:  Consider returning d_src_cache and d_dst_cache
-  //         (as a pair).  This would be a nice optimization
-  //         for single GPU, as we wouldn't need to make 3 copies
-  //         of the data, could return a pair of device pointers to
-  //         local_input_v.
-  //
-  template <typename T>
-  void cache_vertex_properties(rmm::device_vector<T> const &local_input_v,
+  template <typename iterator_t, typename T>
+  void cache_vertex_properties(iterator_t const &local_input_iterator,
                                rmm::device_vector<T> &src_cache_v,
                                rmm::device_vector<T> &dst_cache_v,
                                bool src = true,
@@ -602,13 +594,13 @@ class Louvain {
     if (src) {
       src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows());
       copy_to_adj_matrix_row(
-        handle_, current_graph_view_, local_input_v.begin(), src_cache_v.begin());
+        handle_, current_graph_view_, local_input_iterator, src_cache_v.begin());
     }
 
     if (dst) {
       dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols());
       copy_to_adj_matrix_col(
-        handle_, current_graph_view_, local_input_v.begin(), dst_cache_v.begin());
+        handle_, current_graph_view_, local_input_iterator, dst_cache_v.begin());
     }
   }
 
@@ -617,9 +609,10 @@ class Louvain {
   {
     timer_start("update_clustering");
 
-    rmm::device_vector<vertex_t> next_cluster_v(cluster_v_);
+    rmm::device_vector<vertex_t> next_cluster_v(dendrogram_->current_level_begin(),
+                                                dendrogram_->current_level_end());
 
-    cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_);
+    cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_);
 
     weight_t new_Q = modularity(total_edge_weight, resolution);
     weight_t cur_Q = new_Q - 1;
@@ -636,7 +629,7 @@ class Louvain {
 
       up_down = !up_down;
 
-      cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_);
+      cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_);
 
       new_Q = modularity(total_edge_weight, resolution);
 
@@ -644,12 +637,13 @@ class Louvain {
         thrust::copy(rmm::exec_policy(stream_)->on(stream_),
                      next_cluster_v.begin(),
                      next_cluster_v.end(),
-                     cluster_v_.begin());
+                     dendrogram_->current_level_begin());
       }
     }
 
     // cache the final clustering locally on each cpu
-    cache_vertex_properties(cluster_v_, src_cluster_cache_v_, dst_cluster_cache_v_);
+    cache_vertex_properties(
+      dendrogram_->current_level_begin(), src_cluster_cache_v_, dst_cluster_cache_v_);
 
     timer_stop(stream_);
     return cur_Q;
@@ -678,7 +672,7 @@ class Louvain {
       old_cluster_sum_v.begin());
 
     cache_vertex_properties(
-      old_cluster_sum_v, src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false);
+      old_cluster_sum_v.begin(), src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false);
 
     detail::src_cluster_equality_comparator_t<vertex_t, edge_t> compare(
       src_indices_v_.data().get(),
@@ -1134,7 +1128,7 @@ class Louvain {
       });
 
     cache_vertex_properties(
-      cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_);
+      cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_);
   }
 
   template <typename hash_t, typename compare_t, typename skip_edge_t, typename count_t>
@@ -1219,475 +1213,63 @@ class Louvain {
 
     return std::make_pair(relevant_edges_v, relevant_edge_weights_v);
   }
+#endif
 
-  void shrink_graph(vertex_t *d_cluster_vec)
+  void shrink_graph()
   {
     timer_start("shrinking graph");
 
-    std::size_t capacity{static_cast<std::size_t>((local_num_rows_ + local_num_cols_) / 0.7)};
-
-    cuco::static_map<vertex_t, vertex_t> hash_map(
-      capacity, std::numeric_limits<vertex_t>::max(), std::numeric_limits<vertex_t>::max());
-
-    // renumber the clusters to the range 0..(num_clusters-1)
-    vertex_t num_clusters = renumber_clusters(hash_map);
-
-    renumber_result(hash_map, d_cluster_vec, num_clusters);
-
-    // shrink our graph to represent the graph of supervertices
-    generate_supervertices_graph(hash_map, num_clusters);
-
-    // assign each new vertex to its own cluster
-    //  MNMG:  This can be done locally with no communication required
-    thrust::sequence(rmm::exec_policy(stream_)->on(stream_),
-                     cluster_v_.begin(),
-                     cluster_v_.end(),
-                     base_vertex_id_);
-
-    timer_stop(stream_);
-  }
+    rmm::device_uvector<vertex_t> numbering_map(0, stream_);
 
-  vertex_t renumber_clusters(cuco::static_map<vertex_t, vertex_t> &hash_map)
-  {
-    rmm::device_vector<vertex_t> cluster_inverse_v(local_num_vertices_, vertex_t{0});
-
-    //
-    // FIXME:  Faster to iterate from graph_.get_vertex_partition_first()
-    //         to graph_.get_vertex_partition_last()?  That would potentially
-    //         result in adding a cluster that isn't used on this GPU,
-    //         although I don't think it would break the result in any way.
-    //
-    //         This would also eliminate this use of src_indices_v_.
-    //
-    auto it_src = thrust::make_transform_iterator(
-      src_indices_v_.begin(),
-      [base_src_vertex_id  = base_src_vertex_id_,
-       d_src_cluster_cache = src_cluster_cache_v_.data().get()] __device__(auto idx) {
-        return detail::create_cuco_pair_t<vertex_t>()(
-          d_src_cluster_cache[idx - base_src_vertex_id]);
-      });
-
-    auto it_dst = thrust::make_transform_iterator(
-      current_graph_view_.indices(),
-      [base_dst_vertex_id  = base_dst_vertex_id_,
-       d_dst_cluster_cache = dst_cluster_cache_v_.data().get()] __device__(auto idx) {
-        return detail::create_cuco_pair_t<vertex_t>()(
-          d_dst_cluster_cache[idx - base_dst_vertex_id]);
-      });
-
-    hash_map.insert(it_src, it_src + local_num_edges_);
-    hash_map.insert(it_dst, it_dst + local_num_edges_);
-
-    // Now I need to get the keys into an array and shuffle them
-    rmm::device_vector<vertex_t> used_cluster_ids_v(hash_map.get_size());
-
-    auto transform_iter = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<std::size_t>(0),
-      [d_hash_map = hash_map.get_device_view()] __device__(std::size_t idx) {
-        return d_hash_map.begin_slot()[idx].first.load();
-      });
-
-    used_cluster_ids_v = detail::remove_elements_from_vector(
-      used_cluster_ids_v,
-      transform_iter,
-      transform_iter + hash_map.get_capacity(),
-      [vmax = std::numeric_limits<vertex_t>::max()] __device__(vertex_t cluster) {
-        return cluster != vmax;
-      },
-      stream_);
-
-    auto partition_cluster_ids_iter = thrust::make_transform_iterator(
-      used_cluster_ids_v.begin(),
-      [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) {
-        return d_vertex_device_view(v);
-      });
-
-    rmm::device_vector<std::size_t> original_gpus_v;
-    rmm::device_vector<vertex_t> my_cluster_ids_v =
-      variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, used_cluster_ids_v.size(), used_cluster_ids_v.begin(), partition_cluster_ids_iter);
-
-    if (graph_view_t::is_multi_gpu) {
-      original_gpus_v = variable_shuffle<graph_view_t::is_multi_gpu, std::size_t>(
-        handle_,
-        used_cluster_ids_v.size(),
-        thrust::make_constant_iterator<std::size_t>(rank_),
-        partition_cluster_ids_iter);
-    }
-
-    //
-    //   Now my_cluster_ids contains the cluster ids that this gpu is
-    //   responsible for. I'm going to set cluster_inverse_v to one
-    //   for each cluster in this list.
-    //
-    thrust::for_each(
-      rmm::exec_policy(stream_)->on(stream_),
-      my_cluster_ids_v.begin(),
-      my_cluster_ids_v.end(),
-      [base_vertex_id    = base_vertex_id_,
-       d_cluster_inverse = cluster_inverse_v.data().get()] __device__(vertex_t cluster) {
-        d_cluster_inverse[cluster - base_vertex_id] = 1;
-      });
-
-    rmm::device_vector<vertex_t> my_cluster_ids_deduped_v = detail::remove_elements_from_vector(
-      my_cluster_ids_v,
-      thrust::make_counting_iterator<size_t>(0),
-      thrust::make_counting_iterator<size_t>(cluster_inverse_v.size()),
-      [d_cluster_inverse = cluster_inverse_v.data().get()] __device__(auto idx) {
-        return d_cluster_inverse[idx] == 1;
-      },
-      stream_);
-
-    //
-    //  Need to gather everything to be able to compute base addresses
-    //
-    vertex_t base_address{0};
-
-    if (graph_view_t::is_multi_gpu) {
-      int num_gpus{1};
-      rmm::device_vector<std::size_t> sizes_v(num_gpus + 1, my_cluster_ids_deduped_v.size());
-
-      handle_.get_comms().allgather(
-        sizes_v.data().get() + num_gpus, sizes_v.data().get(), num_gpus, stream_);
-
-      base_address = thrust::reduce(rmm::exec_policy(stream_)->on(stream_),
-                                    sizes_v.begin(),
-                                    sizes_v.begin() + rank_,
-                                    vertex_t{0});
-    }
-
-    //
-    //  Now let's update cluster_inverse_v to contain
-    //  the mapping of old cluster id to new vertex id
-    //
-    thrust::fill(
-      cluster_inverse_v.begin(), cluster_inverse_v.end(), std::numeric_limits<vertex_t>::max());
-
-    thrust::for_each_n(rmm::exec_policy(stream_)->on(stream_),
-                       thrust::make_counting_iterator<std::size_t>(0),
-                       my_cluster_ids_deduped_v.size(),
-                       [base_address,
-                        d_my_cluster_ids_deduped = my_cluster_ids_deduped_v.data().get(),
-                        d_cluster_inverse = cluster_inverse_v.data().get()] __device__(auto idx) {
-                         d_cluster_inverse[d_my_cluster_ids_deduped[idx]] = idx + base_address;
-                       });
-
-    //
-    //  Now I need to shuffle back to original gpus the
-    //  subset of my mapping that is required
-    //
-    rmm::device_vector<vertex_t> new_vertex_ids_v =
-      variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_,
-        my_cluster_ids_v.size(),
-        thrust::make_transform_iterator(my_cluster_ids_v.begin(),
-                                        [d_cluster_inverse = cluster_inverse_v.data().get(),
-                                         base_vertex_id    = base_vertex_id_] __device__(auto v) {
-                                          return d_cluster_inverse[v - base_vertex_id];
-                                        }),
-        original_gpus_v.begin());
-
-    if (graph_view_t::is_multi_gpu) {
-      my_cluster_ids_v = variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, my_cluster_ids_v.size(), my_cluster_ids_v.begin(), original_gpus_v.begin());
-    }
-
-    //
-    //  Now update the hash map with the new vertex id
-    //
-    thrust::for_each_n(rmm::exec_policy(stream_)->on(stream_),
-                       thrust::make_zip_iterator(
-                         thrust::make_tuple(my_cluster_ids_v.begin(), new_vertex_ids_v.begin())),
-                       my_cluster_ids_v.size(),
-                       [d_hash_map = hash_map.get_device_view()] __device__(auto p) mutable {
-                         auto pos = d_hash_map.find(thrust::get<0>(p));
-                         pos->second.store(thrust::get<1>(p));
-                       });
-
-    //
-    //  At this point we have a renumbered COO that is
-    //  improperly distributed around the cluster, which
-    //  will be fixed by generate_supervertices_graph
-    //
-    if (graph_t::is_multi_gpu) {
-      return host_scalar_allreduce(
-        handle_.get_comms(), static_cast<vertex_t>(my_cluster_ids_deduped_v.size()), stream_);
-    } else {
-      return static_cast<vertex_t>(my_cluster_ids_deduped_v.size());
-    }
-  }
-
-  void renumber_result(cuco::static_map<vertex_t, vertex_t> const &hash_map,
-                       vertex_t *d_cluster_vec,
-                       vertex_t num_clusters)
-  {
-    if (graph_view_t::is_multi_gpu) {
-      //
-      // FIXME: Perhaps there's a general purpose function hidden here...
-      //        Given a set of vertex_t values, and a distributed set of
-      //        vertex properties, go to the proper node and retrieve
-      //        the vertex properties and return them to this gpu.
-      //
-      std::size_t capacity{static_cast<std::size_t>((local_num_vertices_) / 0.7)};
-      cuco::static_map<vertex_t, vertex_t> result_hash_map(
-        capacity, std::numeric_limits<vertex_t>::max(), std::numeric_limits<vertex_t>::max());
-
-      auto cluster_iter = thrust::make_transform_iterator(d_cluster_vec, [] __device__(vertex_t c) {
-        return detail::create_cuco_pair_t<vertex_t>()(c);
-      });
-
-      result_hash_map.insert(cluster_iter, cluster_iter + local_num_vertices_);
-
-      rmm::device_vector<vertex_t> used_cluster_ids_v(result_hash_map.get_size());
-
-      auto transform_iter = thrust::make_transform_iterator(
-        thrust::make_counting_iterator<std::size_t>(0),
-        [d_result_hash_map = result_hash_map.get_device_view()] __device__(std::size_t idx) {
-          return d_result_hash_map.begin_slot()[idx].first.load();
-        });
-
-      used_cluster_ids_v = detail::remove_elements_from_vector(
-        used_cluster_ids_v,
-        transform_iter,
-        transform_iter + result_hash_map.get_capacity(),
-        [vmax = std::numeric_limits<vertex_t>::max()] __device__(vertex_t cluster) {
-          return cluster != vmax;
-        },
-        stream_);
-
-      auto partition_cluster_ids_iter = thrust::make_transform_iterator(
-        used_cluster_ids_v.begin(),
-        [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) {
-          return d_vertex_device_view(v);
-        });
-
-      rmm::device_vector<vertex_t> old_cluster_ids_v =
-        variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(handle_,
-                                                               used_cluster_ids_v.size(),
-                                                               used_cluster_ids_v.begin(),
-                                                               partition_cluster_ids_iter);
-
-      rmm::device_vector<std::size_t> original_gpus_v =
-        variable_shuffle<graph_view_t::is_multi_gpu, std::size_t>(
-          handle_,
-          used_cluster_ids_v.size(),
-          thrust::make_constant_iterator<std::size_t>(rank_),
-          partition_cluster_ids_iter);
-
-      // Now each GPU has old cluster ids, let's compute new cluster ids
-      rmm::device_vector<vertex_t> new_cluster_ids_v(old_cluster_ids_v.size());
-
-      thrust::transform(rmm::exec_policy(stream_)->on(stream_),
-                        old_cluster_ids_v.begin(),
-                        old_cluster_ids_v.end(),
-                        new_cluster_ids_v.begin(),
-                        [base_vertex_id = base_vertex_id_,
-                         d_cluster      = cluster_v_.data().get(),
-                         d_hash_map = hash_map.get_device_view()] __device__(vertex_t cluster_id) {
-                          vertex_t c = d_cluster[cluster_id - base_vertex_id];
-                          auto pos   = d_hash_map.find(c);
-                          return pos->second.load();
-                        });
-
-      // Shuffle everything back
-      old_cluster_ids_v = variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, old_cluster_ids_v.size(), old_cluster_ids_v.begin(), original_gpus_v.begin());
-      new_cluster_ids_v = variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, new_cluster_ids_v.size(), new_cluster_ids_v.begin(), original_gpus_v.begin());
-
-      // Update result_hash_map
-      thrust::for_each_n(
-        rmm::exec_policy(stream_)->on(stream_),
-        thrust::make_zip_iterator(
-          thrust::make_tuple(old_cluster_ids_v.begin(), new_cluster_ids_v.begin())),
-        old_cluster_ids_v.size(),
-        [d_result_hash_map = result_hash_map.get_device_view()] __device__(auto pair) mutable {
-          auto pos = d_result_hash_map.find(thrust::get<0>(pair));
-          pos->second.store(thrust::get<1>(pair));
-        });
-
-      thrust::transform(
-        rmm::exec_policy(stream_)->on(stream_),
-        d_cluster_vec,
-        d_cluster_vec + number_of_vertices_,
-        d_cluster_vec,
-        [d_result_hash_map = result_hash_map.get_device_view()] __device__(vertex_t c) {
-          auto pos = d_result_hash_map.find(c);
-          return pos->second.load();
-        });
-
-    } else {
-      thrust::transform(rmm::exec_policy(stream_)->on(stream_),
-                        d_cluster_vec,
-                        d_cluster_vec + number_of_vertices_,
-                        d_cluster_vec,
-                        [d_hash_map    = hash_map.get_device_view(),
-                         d_dst_cluster = dst_cluster_cache_v_.data()] __device__(vertex_t v) {
-                          vertex_t c = d_dst_cluster[v];
-                          auto pos   = d_hash_map.find(c);
-                          return pos->second.load();
-                        });
-    }
-  }
-
-  void generate_supervertices_graph(cuco::static_map<vertex_t, vertex_t> const &hash_map,
-                                    vertex_t num_clusters)
-  {
-    rmm::device_vector<vertex_t> new_src_v(local_num_edges_);
-    rmm::device_vector<vertex_t> new_dst_v(local_num_edges_);
-    rmm::device_vector<weight_t> new_weight_v(current_graph_view_.weights(),
-                                              current_graph_view_.weights() + local_num_edges_);
-
-    thrust::transform(rmm::exec_policy(stream_)->on(stream_),
-                      src_indices_v_.begin(),
-                      src_indices_v_.end(),
-                      new_src_v.begin(),
-                      [base_src_vertex_id = base_src_vertex_id_,
-                       d_src_cluster      = src_cluster_cache_v_.data().get(),
-                       d_hash_map         = hash_map.get_device_view()] __device__(vertex_t v) {
-                        vertex_t c = d_src_cluster[v - base_src_vertex_id];
-                        auto pos   = d_hash_map.find(c);
-                        return pos->second.load();
-                      });
-
-    thrust::transform(rmm::exec_policy(stream_)->on(stream_),
-                      current_graph_view_.indices(),
-                      current_graph_view_.indices() + local_num_edges_,
-                      new_dst_v.begin(),
-                      [base_dst_vertex_id = base_dst_vertex_id_,
-                       d_dst_cluster      = dst_cluster_cache_v_.data().get(),
-                       d_hash_map         = hash_map.get_device_view()] __device__(vertex_t v) {
-                        vertex_t c = d_dst_cluster[v - base_dst_vertex_id];
-                        auto pos   = d_hash_map.find(c);
-                        return pos->second.load();
-                      });
-
-    // Combine common edges on local gpu
-    std::tie(new_src_v, new_dst_v, new_weight_v) =
-      combine_local_edges(new_src_v, new_dst_v, new_weight_v);
-
-    if (graph_view_t::is_multi_gpu) {
-      //
-      // Shuffle the data to the proper GPU
-      //   FIXME:  This needs some performance exploration.  It is
-      //           possible (likely?) that the shrunken graph is
-      //           more dense than the original graph.  Perhaps that
-      //           changes the dynamic of partitioning efficiently.
-      //
-      // For now, we're going to keep the partitioning the same,
-      // but because we've renumbered to lower numbers, fewer
-      // partitions will actually have data.
-      //
-      rmm::device_vector<int> partition_v(new_src_v.size());
-
-      thrust::transform(
-        rmm::exec_policy(stream_)->on(stream_),
-        thrust::make_zip_iterator(thrust::make_tuple(new_src_v.begin(), new_dst_v.begin())),
-        thrust::make_zip_iterator(thrust::make_tuple(new_src_v.end(), new_dst_v.end())),
-        partition_v.begin(),
-        [d_edge_device_view = compute_partition_.edge_device_view()] __device__(
-          thrust::tuple<vertex_t, vertex_t> tuple) {
-          return d_edge_device_view(thrust::get<0>(tuple), thrust::get<1>(tuple));
-        });
-
-      new_src_v = variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, partition_v.size(), new_src_v.begin(), partition_v.begin());
-
-      new_dst_v = variable_shuffle<graph_view_t::is_multi_gpu, vertex_t>(
-        handle_, partition_v.size(), new_dst_v.begin(), partition_v.begin());
-
-      new_weight_v = variable_shuffle<graph_view_t::is_multi_gpu, weight_t>(
-        handle_, partition_v.size(), new_weight_v.begin(), partition_v.begin());
-
-      //
-      //  Now everything is on the correct node, again combine like edges
-      //
-      std::tie(new_src_v, new_dst_v, new_weight_v) =
-        combine_local_edges(new_src_v, new_dst_v, new_weight_v);
-    }
-
-    //
-    //  Now I have a COO of the new graph, distributed according to the
-    //  original clustering (eventually this likely fits on one GPU and
-    //  everything else is empty).
-    //
-    current_graph_ =
-      detail::create_graph<vertex_t,
-                           edge_t,
-                           weight_t,
-                           graph_t::is_adj_matrix_transposed,
-                           graph_t::is_multi_gpu>(handle_,
-                                                  new_src_v,
-                                                  new_dst_v,
-                                                  new_weight_v,
-                                                  num_clusters,
-                                                  experimental::graph_properties_t{true, true},
-                                                  current_graph_view_);
+    std::tie(current_graph_, numbering_map) =
+      coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin());
 
     current_graph_view_ = current_graph_->view();
 
-    src_indices_v_.resize(new_src_v.size());
-
     local_num_vertices_ = current_graph_view_.get_number_of_local_vertices();
     local_num_rows_     = current_graph_view_.get_number_of_local_adj_matrix_partition_rows();
     local_num_cols_     = current_graph_view_.get_number_of_local_adj_matrix_partition_cols();
-    local_num_edges_    = new_src_v.size();
+    base_vertex_id_     = current_graph_view_.get_local_vertex_first();
+
+    local_num_edges_ = thrust::transform_reduce(
+      thrust::host,
+      thrust::make_counting_iterator<size_t>(0),
+      thrust::make_counting_iterator<size_t>(
+        current_graph_view_.get_number_of_local_adj_matrix_partitions()),
+      [this](auto indx) {
+        return current_graph_view_.get_number_of_local_adj_matrix_partition_edges(indx);
+      },
+      size_t{0},
+      thrust::plus<size_t>());
+
+    src_indices_v_.resize(local_num_edges_);
 
     cugraph::detail::offsets_to_indices(
       current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get());
-  }
-#endif
 
-  std::
-    tuple<rmm::device_vector<vertex_t>, rmm::device_vector<vertex_t>, rmm::device_vector<weight_t>>
-    combine_local_edges(rmm::device_vector<vertex_t> &src_v,
-                        rmm::device_vector<vertex_t> &dst_v,
-                        rmm::device_vector<weight_t> &weight_v)
-  {
-    thrust::stable_sort_by_key(
-      rmm::exec_policy(stream_)->on(stream_),
-      dst_v.begin(),
-      dst_v.end(),
-      thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), weight_v.begin())));
-    thrust::stable_sort_by_key(
-      rmm::exec_policy(stream_)->on(stream_),
-      src_v.begin(),
-      src_v.end(),
-      thrust::make_zip_iterator(thrust::make_tuple(dst_v.begin(), weight_v.begin())));
+    rmm::device_uvector<vertex_t> numbering_indices(numbering_map.size(), stream_);
+    thrust::sequence(rmm::exec_policy(stream_)->on(stream_),
+                     numbering_indices.begin(),
+                     numbering_indices.end(),
+                     base_vertex_id_);
 
-    rmm::device_vector<vertex_t> combined_src_v(src_v.size());
-    rmm::device_vector<vertex_t> combined_dst_v(src_v.size());
-    rmm::device_vector<weight_t> combined_weight_v(src_v.size());
+    relabel<vertex_t, graph_view_t::is_multi_gpu>(
+      handle_,
+      std::make_tuple(static_cast<vertex_t const *>(numbering_map.begin()),
+                      static_cast<vertex_t const *>(numbering_indices.begin())),
+      local_num_vertices_,
+      dendrogram_->current_level_begin(),
+      dendrogram_->current_level_size());
 
-    //
-    //  Now we reduce by key to combine the weights of duplicate
-    //  edges.
-    //
-    auto start = thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), dst_v.begin()));
-    auto new_start =
-      thrust::make_zip_iterator(thrust::make_tuple(combined_src_v.begin(), combined_dst_v.begin()));
-    auto new_end = thrust::reduce_by_key(rmm::exec_policy(stream_)->on(stream_),
-                                         start,
-                                         start + src_v.size(),
-                                         weight_v.begin(),
-                                         new_start,
-                                         combined_weight_v.begin(),
-                                         thrust::equal_to<thrust::tuple<vertex_t, vertex_t>>(),
-                                         thrust::plus<weight_t>());
-
-    auto num_edges = thrust::distance(new_start, new_end.first);
-
-    combined_src_v.resize(num_edges);
-    combined_dst_v.resize(num_edges);
-    combined_weight_v.resize(num_edges);
-
-    return std::make_tuple(combined_src_v, combined_dst_v, combined_weight_v);
+    timer_stop(stream_);
   }
 
  protected:
   raft::handle_t const &handle_;
   cudaStream_t stream_;
 
+  std::unique_ptr<Dendrogram<vertex_t>> dendrogram_;
+
   vertex_t number_of_vertices_;
   vertex_t base_vertex_id_{0};
   vertex_t base_src_vertex_id_{0};
@@ -1723,7 +1305,6 @@ class Louvain {
   rmm::device_vector<weight_t> src_cluster_weights_cache_v_{};
   rmm::device_vector<weight_t> dst_cluster_weights_cache_v_{};
 
-  rmm::device_vector<vertex_t> cluster_v_;
   rmm::device_vector<vertex_t> src_cluster_cache_v_{};
   rmm::device_vector<vertex_t> dst_cluster_cache_v_{};
 
diff --git a/cpp/src/experimental/pagerank.cu b/cpp/src/experimental/pagerank.cu
index 1aa7f37fa6b..058cbfe5966 100644
--- a/cpp/src/experimental/pagerank.cu
+++ b/cpp/src/experimental/pagerank.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -286,8 +286,6 @@ void pagerank(raft::handle_t const& handle,
       CUGRAPH_FAIL("PageRank failed to converge.");
     }
   }
-
-  return;
 }
 
 }  // namespace detail
diff --git a/cpp/src/experimental/relabel.cu b/cpp/src/experimental/relabel.cu
new file mode 100644
index 00000000000..62bd6951f71
--- /dev/null
+++ b/cpp/src/experimental/relabel.cu
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <experimental/include_cuco_static_map.cuh>
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph.hpp>
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+#include <patterns/copy_to_adj_matrix_row_col.cuh>
+#include <utilities/error.hpp>
+#include <utilities/shuffle_comm.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <tuple>
+#include <utility>
+
+namespace cugraph {
+namespace experimental {
+
+template <typename vertex_t, bool multi_gpu>
+void relabel(raft::handle_t const& handle,
+             std::tuple<vertex_t const*, vertex_t const*> old_new_label_pairs,
+             vertex_t num_label_pairs,
+             vertex_t* labels /* [INOUT] */,
+             vertex_t num_labels,
+             bool do_expensive_check)
+{
+  double constexpr load_factor = 0.7;
+
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+
+#ifdef CUCO_STATIC_MAP_DEFINED
+  if (multi_gpu) {
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+
+    auto key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size};
+
+    // find unique old labels (to be relabeled)
+
+    rmm::device_uvector<vertex_t> unique_old_labels(num_labels, handle.get_stream());
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 labels,
+                 labels + num_labels,
+                 unique_old_labels.data());
+    thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 unique_old_labels.begin(),
+                 unique_old_labels.end());
+    unique_old_labels.resize(
+      thrust::distance(
+        unique_old_labels.begin(),
+        thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                       unique_old_labels.begin(),
+                       unique_old_labels.end())),
+      handle.get_stream());
+    unique_old_labels.shrink_to_fit(handle.get_stream());
+
+    // collect new labels for the unique old labels
+
+    rmm::device_uvector<vertex_t> new_labels_for_unique_old_labels(0, handle.get_stream());
+    {
+      // shuffle the old_new_label_pairs based on applying the compute_gpu_id_from_vertex_t functor
+      // to the old labels
+
+      rmm::device_uvector<vertex_t> rx_label_pair_old_labels(0, handle.get_stream());
+      rmm::device_uvector<vertex_t> rx_label_pair_new_labels(0, handle.get_stream());
+      {
+        rmm::device_uvector<vertex_t> label_pair_old_labels(num_label_pairs, handle.get_stream());
+        rmm::device_uvector<vertex_t> label_pair_new_labels(num_label_pairs, handle.get_stream());
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     std::get<0>(old_new_label_pairs),
+                     std::get<0>(old_new_label_pairs) + num_label_pairs,
+                     label_pair_old_labels.begin());
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     std::get<1>(old_new_label_pairs),
+                     std::get<1>(old_new_label_pairs) + num_label_pairs,
+                     label_pair_new_labels.begin());
+        auto pair_first = thrust::make_zip_iterator(
+          thrust::make_tuple(label_pair_old_labels.begin(), label_pair_new_labels.begin()));
+        std::forward_as_tuple(std::tie(rx_label_pair_old_labels, rx_label_pair_new_labels),
+                              std::ignore) =
+          groupby_gpuid_and_shuffle_values(
+            handle.get_comms(),
+            pair_first,
+            pair_first + num_label_pairs,
+            [key_func] __device__(auto val) { return key_func(thrust::get<0>(val)); },
+            handle.get_stream());
+      }
+
+      // update intermediate relabel map
+
+      CUDA_TRY(cudaStreamSynchronize(
+        handle.get_stream()));  // cuco::static_map currently does not take stream
+
+      cuco::static_map<vertex_t, vertex_t> relabel_map{
+        static_cast<size_t>(static_cast<double>(rx_label_pair_old_labels.size()) / load_factor),
+        invalid_vertex_id<vertex_t>::value,
+        invalid_vertex_id<vertex_t>::value};
+
+      auto pair_first = thrust::make_transform_iterator(
+        thrust::make_zip_iterator(
+          thrust::make_tuple(rx_label_pair_old_labels.begin(), rx_label_pair_new_labels.begin())),
+        [] __device__(auto val) {
+          return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+        });
+      relabel_map.insert(pair_first, pair_first + rx_label_pair_old_labels.size());
+
+      rx_label_pair_old_labels.resize(0, handle.get_stream());
+      rx_label_pair_new_labels.resize(0, handle.get_stream());
+      rx_label_pair_old_labels.shrink_to_fit(handle.get_stream());
+      rx_label_pair_new_labels.shrink_to_fit(handle.get_stream());
+
+      // shuffle unique_old_labels, relabel using the intermediate relabel map, and shuffle back
+
+      {
+        rmm::device_uvector<vertex_t> rx_unique_old_labels(0, handle.get_stream());
+        std::vector<size_t> rx_value_counts{};
+        std::tie(rx_unique_old_labels, rx_value_counts) = groupby_gpuid_and_shuffle_values(
+          handle.get_comms(),
+          unique_old_labels.begin(),
+          unique_old_labels.end(),
+          [key_func] __device__(auto val) { return key_func(val); },
+          handle.get_stream());
+
+        CUDA_TRY(cudaStreamSynchronize(
+          handle.get_stream()));  // cuco::static_map currently does not take stream
+
+        relabel_map.find(
+          rx_unique_old_labels.begin(),
+          rx_unique_old_labels.end(),
+          rx_unique_old_labels
+            .begin());  // now rx_unique_old_lables hold new labels for the corresponding old labels
+
+        std::tie(new_labels_for_unique_old_labels, std::ignore) = shuffle_values(
+          handle.get_comms(), rx_unique_old_labels.begin(), rx_value_counts, handle.get_stream());
+      }
+    }
+
+    cuco::static_map<vertex_t, vertex_t> relabel_map(
+      static_cast<size_t>(static_cast<double>(unique_old_labels.size()) / load_factor),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(
+        thrust::make_tuple(unique_old_labels.begin(), new_labels_for_unique_old_labels.begin())),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+
+    relabel_map.insert(pair_first, pair_first + unique_old_labels.size());
+    relabel_map.find(labels, labels + num_labels, labels);
+  } else {
+    cuco::static_map<vertex_t, vertex_t> relabel_map(
+      static_cast<size_t>(static_cast<double>(num_label_pairs) / load_factor),
+      invalid_vertex_id<vertex_t>::value,
+      invalid_vertex_id<vertex_t>::value);
+
+    auto pair_first = thrust::make_transform_iterator(
+      thrust::make_zip_iterator(
+        thrust::make_tuple(std::get<0>(old_new_label_pairs), std::get<1>(old_new_label_pairs))),
+      [] __device__(auto val) {
+        return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+      });
+
+    relabel_map.insert(pair_first, pair_first + num_label_pairs);
+    relabel_map.find(labels, labels + num_labels, labels);
+  }
+
+  if (do_expensive_check) {
+    CUGRAPH_EXPECTS(
+      thrust::count(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    labels,
+                    labels + num_labels,
+                    invalid_vertex_id<vertex_t>::value) == 0,
+      "Invalid input argument: labels include old label values missing in old_new_label_pairs.");
+  }
+#endif
+
+  return;
+}
+
+// explicit instantiation
+
+template void relabel<int32_t, true>(raft::handle_t const& handle,
+                                     std::tuple<int32_t const*, int32_t const*> old_new_label_pairs,
+                                     int32_t num_label_pairs,
+                                     int32_t* labels,
+                                     int32_t num_labels,
+                                     bool do_expensive_check);
+
+template void relabel<int32_t, false>(
+  raft::handle_t const& handle,
+  std::tuple<int32_t const*, int32_t const*> old_new_label_pairs,
+  int32_t num_label_pairs,
+  int32_t* labels,
+  int32_t num_labels,
+  bool do_expensive_check);
+
+template void relabel<int64_t, true>(raft::handle_t const& handle,
+                                     std::tuple<int64_t const*, int64_t const*> old_new_label_pairs,
+                                     int64_t num_label_pairs,
+                                     int64_t* labels,
+                                     int64_t num_labels,
+                                     bool do_expensive_check);
+
+template void relabel<int64_t, false>(
+  raft::handle_t const& handle,
+  std::tuple<int64_t const*, int64_t const*> old_new_label_pairs,
+  int64_t num_label_pairs,
+  int64_t* labels,
+  int64_t num_labels,
+  bool do_expensive_check);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/renumber_edgelist.cu b/cpp/src/experimental/renumber_edgelist.cu
new file mode 100644
index 00000000000..6a5a1c732c2
--- /dev/null
+++ b/cpp/src/experimental/renumber_edgelist.cu
@@ -0,0 +1,821 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <experimental/include_cuco_static_map.cuh>
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+#include <utilities/device_comm.cuh>
+#include <utilities/error.hpp>
+#include <utilities/host_scalar_comm.cuh>
+#include <utilities/shuffle_comm.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/copy.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+#include <tuple>
+#include <utility>
+
+namespace cugraph {
+namespace experimental {
+namespace detail {
+
+#ifdef CUCO_STATIC_MAP_DEFINED
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+rmm::device_uvector<vertex_t> compute_renumber_map(
+  raft::handle_t const& handle,
+  vertex_t const* vertices,
+  vertex_t num_local_vertices /* relevant only if vertices != nullptr */,
+  vertex_t const* edgelist_major_vertices,
+  vertex_t const* edgelist_minor_vertices,
+  edge_t num_edgelist_edges)
+{
+  // FIXME: compare this sort based approach with hash based approach in both speed and memory
+  // footprint
+
+  // 1. acquire (unique major label, count) pairs
+
+  rmm::device_uvector<vertex_t> tmp_labels(num_edgelist_edges, handle.get_stream());
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               edgelist_major_vertices,
+               edgelist_major_vertices + num_edgelist_edges,
+               tmp_labels.begin());
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               tmp_labels.begin(),
+               tmp_labels.end());
+  rmm::device_uvector<vertex_t> major_labels(tmp_labels.size(), handle.get_stream());
+  rmm::device_uvector<edge_t> major_counts(major_labels.size(), handle.get_stream());
+  auto major_pair_it =
+    thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                          tmp_labels.begin(),
+                          tmp_labels.end(),
+                          thrust::make_constant_iterator(edge_t{1}),
+                          major_labels.begin(),
+                          major_counts.begin());
+  tmp_labels.resize(0, handle.get_stream());
+  tmp_labels.shrink_to_fit(handle.get_stream());
+  major_labels.resize(thrust::distance(major_labels.begin(), thrust::get<0>(major_pair_it)),
+                      handle.get_stream());
+  major_counts.resize(major_labels.size(), handle.get_stream());
+  major_labels.shrink_to_fit(handle.get_stream());
+  major_counts.shrink_to_fit(handle.get_stream());
+
+  // 2. acquire unique minor labels
+
+  rmm::device_uvector<vertex_t> minor_labels(num_edgelist_edges, handle.get_stream());
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               edgelist_minor_vertices,
+               edgelist_minor_vertices + num_edgelist_edges,
+               minor_labels.begin());
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               minor_labels.begin(),
+               minor_labels.end());
+  auto minor_label_it =
+    thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   minor_labels.begin(),
+                   minor_labels.end());
+  minor_labels.resize(thrust::distance(minor_labels.begin(), minor_label_it), handle.get_stream());
+  minor_labels.shrink_to_fit(handle.get_stream());
+
+  // 3. merge major and minor labels and vertex labels
+
+  rmm::device_uvector<vertex_t> merged_labels(major_labels.size() + minor_labels.size(),
+                                              handle.get_stream());
+
+  rmm::device_uvector<edge_t> merged_counts(merged_labels.size(), handle.get_stream());
+  thrust::merge_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                       major_labels.begin(),
+                       major_labels.end(),
+                       minor_labels.begin(),
+                       minor_labels.end(),
+                       major_counts.begin(),
+                       thrust::make_constant_iterator(edge_t{0}),
+                       merged_labels.begin(),
+                       merged_counts.begin());
+
+  major_labels.resize(0, handle.get_stream());
+  major_counts.resize(0, handle.get_stream());
+  minor_labels.resize(0, handle.get_stream());
+  major_labels.shrink_to_fit(handle.get_stream());
+  major_counts.shrink_to_fit(handle.get_stream());
+  minor_labels.shrink_to_fit(handle.get_stream());
+
+  rmm::device_uvector<vertex_t> labels(merged_labels.size(), handle.get_stream());
+  rmm::device_uvector<edge_t> counts(labels.size(), handle.get_stream());
+  auto pair_it =
+    thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                          merged_labels.begin(),
+                          merged_labels.end(),
+                          merged_counts.begin(),
+                          labels.begin(),
+                          counts.begin());
+  merged_labels.resize(0, handle.get_stream());
+  merged_counts.resize(0, handle.get_stream());
+  merged_labels.shrink_to_fit(handle.get_stream());
+  merged_counts.shrink_to_fit(handle.get_stream());
+  labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream());
+  counts.resize(labels.size(), handle.get_stream());
+  labels.shrink_to_fit(handle.get_stream());
+  counts.shrink_to_fit(handle.get_stream());
+
+  // 4. if multi-GPU, shuffle and reduce (label, count) pairs
+
+  if (multi_gpu) {
+    auto& comm           = handle.get_comms();
+    auto const comm_size = comm.get_size();
+
+    auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(labels.begin(), counts.begin()));
+    rmm::device_uvector<vertex_t> rx_labels(0, handle.get_stream());
+    rmm::device_uvector<edge_t> rx_counts(0, handle.get_stream());
+    std::forward_as_tuple(std::tie(rx_labels, rx_counts), std::ignore) =
+      groupby_gpuid_and_shuffle_values(
+        comm,
+        pair_first,
+        pair_first + labels.size(),
+        [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(
+          auto val) { return key_func(thrust::get<0>(val)); },
+        handle.get_stream());
+
+    labels.resize(rx_labels.size(), handle.get_stream());
+    counts.resize(labels.size(), handle.get_stream());
+    thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                        rx_labels.begin(),
+                        rx_labels.end(),
+                        rx_counts.begin());
+    pair_it = thrust::reduce_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                                    rx_labels.begin(),
+                                    rx_labels.end(),
+                                    rx_counts.begin(),
+                                    labels.begin(),
+                                    counts.begin());
+    rx_labels.resize(0, handle.get_stream());
+    rx_counts.resize(0, handle.get_stream());
+    rx_labels.shrink_to_fit(handle.get_stream());
+    rx_counts.shrink_to_fit(handle.get_stream());
+    labels.resize(thrust::distance(labels.begin(), thrust::get<0>(pair_it)), handle.get_stream());
+    counts.resize(labels.size(), handle.get_stream());
+    labels.shrink_to_fit(handle.get_stream());
+    labels.shrink_to_fit(handle.get_stream());
+  }
+
+  // 5. if vertices != nullptr, add isolated vertices
+
+  rmm::device_uvector<vertex_t> isolated_vertices(0, handle.get_stream());
+  if (vertices != nullptr) {
+    auto num_isolated_vertices = thrust::count_if(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      vertices,
+      vertices + num_local_vertices,
+      [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) {
+        return !thrust::binary_search(thrust::seq, label_first, label_last, v);
+      });
+    isolated_vertices.resize(num_isolated_vertices, handle.get_stream());
+    thrust::copy_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    vertices,
+                    vertices + num_local_vertices,
+                    isolated_vertices.begin(),
+                    [label_first = labels.begin(), label_last = labels.end()] __device__(auto v) {
+                      return !thrust::binary_search(thrust::seq, label_first, label_last, v);
+                    });
+  }
+
+  if (isolated_vertices.size() > 0) {
+    labels.resize(labels.size() + isolated_vertices.size(), handle.get_stream());
+    counts.resize(labels.size(), handle.get_stream());
+    thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 isolated_vertices.begin(),
+                 isolated_vertices.end(),
+                 labels.end() - isolated_vertices.size());
+    thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                 counts.end() - isolated_vertices.size(),
+                 counts.end(),
+                 edge_t{0});
+  }
+
+  // 6. sort by degree
+
+  thrust::sort_by_key(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                      counts.begin(),
+                      counts.end(),
+                      labels.begin(),
+                      thrust::greater<edge_t>());
+
+  return std::move(labels);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+void expensive_check_edgelist(
+  raft::handle_t const& handle,
+  vertex_t const* local_vertices,
+  vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */,
+  vertex_t const* edgelist_major_vertices,
+  vertex_t const* edgelist_minor_vertices,
+  edge_t num_edgelist_edges,
+  bool is_hypergraph_partitioned /* relevant only if multi_gpu == true */)
+{
+  rmm::device_uvector<vertex_t> sorted_local_vertices(
+    local_vertices != nullptr ? num_local_vertices : vertex_t{0}, handle.get_stream());
+  thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               local_vertices,
+               local_vertices + num_local_vertices,
+               sorted_local_vertices.begin());
+  thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+               sorted_local_vertices.begin(),
+               sorted_local_vertices.end());
+
+  if (multi_gpu) {
+    auto& comm               = handle.get_comms();
+    auto const comm_size     = comm.get_size();
+    auto const comm_rank     = comm.get_rank();
+    auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+    auto const row_comm_size = row_comm.get_size();
+    auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    auto const col_comm_size = col_comm.get_size();
+
+    CUGRAPH_EXPECTS(
+      thrust::count_if(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        local_vertices,
+        local_vertices + num_local_vertices,
+        [comm_rank,
+         key_func =
+           detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(auto val) {
+          return key_func(val) != comm_rank;
+        }) == 0,
+      "Invalid input argument: local_vertices should be pre-shuffled.");
+
+    auto edge_first = thrust::make_zip_iterator(
+      thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices));
+    CUGRAPH_EXPECTS(
+      thrust::count_if(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        edge_first,
+        edge_first + num_edgelist_edges,
+        [comm_rank,
+         key_func =
+           detail::compute_gpu_id_from_edge_t<vertex_t>{is_hypergraph_partitioned,
+                                                        comm_size,
+                                                        row_comm_size,
+                                                        col_comm_size}] __device__(auto edge) {
+          return key_func(thrust::get<0>(edge), thrust::get<1>(edge)) != comm_rank;
+        }) == 0,
+      "Invalid input argument: edgelist_major_vertices & edgelist_minor_vertices should be "
+      "pre-shuffled.");
+
+    if (local_vertices != nullptr) {
+      rmm::device_uvector<vertex_t> unique_edge_vertices(num_edgelist_edges * 2,
+                                                         handle.get_stream());
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   edgelist_major_vertices,
+                   edgelist_major_vertices + num_edgelist_edges,
+                   unique_edge_vertices.begin());
+      thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   edgelist_minor_vertices,
+                   edgelist_minor_vertices + num_edgelist_edges,
+                   unique_edge_vertices.begin() + num_edgelist_edges);
+      thrust::sort(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                   unique_edge_vertices.begin(),
+                   unique_edge_vertices.end());
+      unique_edge_vertices.resize(
+        thrust::distance(
+          unique_edge_vertices.begin(),
+          thrust::unique(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                         unique_edge_vertices.begin(),
+                         unique_edge_vertices.end())),
+        handle.get_stream());
+
+      rmm::device_uvector<vertex_t> rx_unique_edge_vertices(0, handle.get_stream());
+      std::tie(rx_unique_edge_vertices, std::ignore) = groupby_gpuid_and_shuffle_values(
+        handle.get_comms(),
+        unique_edge_vertices.begin(),
+        unique_edge_vertices.end(),
+        [key_func = detail::compute_gpu_id_from_vertex_t<vertex_t>{comm_size}] __device__(
+          auto val) { return key_func(val); },
+        handle.get_stream());
+
+      unique_edge_vertices = std::move(rx_unique_edge_vertices);
+
+      CUGRAPH_EXPECTS(
+        thrust::count_if(
+          rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+          unique_edge_vertices.begin(),
+          unique_edge_vertices.end(),
+          [num_local_vertices,
+           sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) {
+            return !thrust::binary_search(
+              thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v);
+          }) == 0,
+        "Invalid input argument: edgelist_major_vertices and/or edgelist_minor_vertices have "
+        "invalid vertex ID(s).");
+    }
+  } else {
+    if (local_vertices != nullptr) {
+      CUGRAPH_EXPECTS(
+        thrust::count_if(
+          rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+          edgelist_major_vertices,
+          edgelist_major_vertices + num_edgelist_edges,
+          [num_local_vertices,
+           sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) {
+            return !thrust::binary_search(
+              thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v);
+          }) == 0,
+        "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s).");
+
+      CUGRAPH_EXPECTS(
+        thrust::count_if(
+          rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+          edgelist_major_vertices,
+          edgelist_major_vertices + num_edgelist_edges,
+          [num_local_vertices,
+           sorted_local_vertices = sorted_local_vertices.data()] __device__(auto v) {
+            return !thrust::binary_search(
+              thrust::seq, sorted_local_vertices, sorted_local_vertices + num_local_vertices, v);
+          }) == 0,
+        "Invalid input argument: edgelist_major_vertices has invalid vertex ID(s).");
+    }
+  }
+}
+#endif
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
+renumber_edgelist(raft::handle_t const& handle,
+                  vertex_t const* local_vertices,
+                  vertex_t num_local_vertices /* relevant only if local_vertices != nullptr */,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool is_hypergraph_partitioned,
+                  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+
+#ifdef CUCO_STATIC_MAP_DEFINED
+  auto& comm               = handle.get_comms();
+  auto const comm_size     = comm.get_size();
+  auto const comm_rank     = comm.get_rank();
+  auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+  auto const row_comm_size = row_comm.get_size();
+  auto const row_comm_rank = row_comm.get_rank();
+  auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+  auto const col_comm_size = col_comm.get_size();
+  auto const col_comm_rank = col_comm.get_rank();
+
+  if (do_expensive_check) {
+    expensive_check_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                          local_vertices,
+                                                          num_local_vertices,
+                                                          edgelist_major_vertices,
+                                                          edgelist_minor_vertices,
+                                                          num_edgelist_edges,
+                                                          is_hypergraph_partitioned);
+  }
+
+  // 1. compute renumber map
+
+  auto renumber_map_labels =
+    detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(handle,
+                                                              local_vertices,
+                                                              num_local_vertices,
+                                                              edgelist_major_vertices,
+                                                              edgelist_minor_vertices,
+                                                              num_edgelist_edges);
+
+  // 2. initialize partition_t object, number_of_vertices, and number_of_edges for the coarsened
+  // graph
+
+  auto vertex_partition_counts = host_scalar_allgather(
+    comm, static_cast<vertex_t>(renumber_map_labels.size()), handle.get_stream());
+  std::vector<vertex_t> vertex_partition_offsets(comm_size + 1, 0);
+  std::partial_sum(vertex_partition_counts.begin(),
+                   vertex_partition_counts.end(),
+                   vertex_partition_offsets.begin() + 1);
+
+  partition_t<vertex_t> partition(vertex_partition_offsets,
+                                  is_hypergraph_partitioned,
+                                  row_comm_size,
+                                  col_comm_size,
+                                  row_comm_rank,
+                                  col_comm_rank);
+
+  auto number_of_vertices = vertex_partition_offsets.back();
+  auto number_of_edges    = host_scalar_allreduce(comm, num_edgelist_edges, handle.get_stream());
+
+  // 3. renumber edges
+
+  if (is_hypergraph_partitioned) {
+    CUGRAPH_FAIL("unimplemented.");
+  } else {
+    double constexpr load_factor = 0.7;
+
+    // FIXME: compare this hash based approach with a binary search based approach in both memory
+    // footprint and execution time
+
+    {
+      vertex_t major_first{};
+      vertex_t major_last{};
+      std::tie(major_first, major_last) = partition.get_matrix_partition_major_range(0);
+      rmm::device_uvector<vertex_t> renumber_map_major_labels(major_last - major_first,
+                                                              handle.get_stream());
+      std::vector<size_t> recvcounts(row_comm_size);
+      for (int i = 0; i < row_comm_size; ++i) {
+        recvcounts[i] = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i);
+      }
+      std::vector<size_t> displacements(row_comm_size, 0);
+      std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1);
+      device_allgatherv(row_comm,
+                        renumber_map_labels.begin(),
+                        renumber_map_major_labels.begin(),
+                        recvcounts,
+                        displacements,
+                        handle.get_stream());
+
+      CUDA_TRY(cudaStreamSynchronize(
+        handle.get_stream()));  // cuco::static_map currently does not take stream
+
+      cuco::static_map<vertex_t, vertex_t> renumber_map{
+        static_cast<size_t>(static_cast<double>(renumber_map_major_labels.size()) / load_factor),
+        invalid_vertex_id<vertex_t>::value,
+        invalid_vertex_id<vertex_t>::value};
+      auto pair_first = thrust::make_transform_iterator(
+        thrust::make_zip_iterator(thrust::make_tuple(renumber_map_major_labels.begin(),
+                                                     thrust::make_counting_iterator(major_first))),
+        [] __device__(auto val) {
+          return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+        });
+      renumber_map.insert(pair_first, pair_first + renumber_map_major_labels.size());
+      renumber_map.find(edgelist_major_vertices,
+                        edgelist_major_vertices + num_edgelist_edges,
+                        edgelist_major_vertices);
+    }
+
+    {
+      vertex_t minor_first{};
+      vertex_t minor_last{};
+      std::tie(minor_first, minor_last) = partition.get_matrix_partition_minor_range();
+      rmm::device_uvector<vertex_t> renumber_map_minor_labels(minor_last - minor_first,
+                                                              handle.get_stream());
+
+      // FIXME: this P2P is unnecessary if we apply the partitioning scheme used with hypergraph
+      // partitioning
+      auto comm_src_rank = row_comm_rank * col_comm_size + col_comm_rank;
+      auto comm_dst_rank = (comm_rank % col_comm_size) * row_comm_size + comm_rank / col_comm_size;
+      // FIXME: this branch may be no longer necessary with NCCL backend
+      if (comm_src_rank == comm_rank) {
+        assert(comm_dst_rank == comm_rank);
+        thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     renumber_map_labels.begin(),
+                     renumber_map_labels.end(),
+                     renumber_map_minor_labels.begin() +
+                       (partition.get_vertex_partition_first(comm_src_rank) -
+                        partition.get_vertex_partition_first(row_comm_rank * col_comm_size)));
+      } else {
+        device_sendrecv(comm,
+                        renumber_map_labels.begin(),
+                        renumber_map_labels.size(),
+                        comm_dst_rank,
+                        renumber_map_minor_labels.begin() +
+                          (partition.get_vertex_partition_first(comm_src_rank) -
+                           partition.get_vertex_partition_first(row_comm_rank * col_comm_size)),
+                        static_cast<size_t>(partition.get_vertex_partition_size(comm_src_rank)),
+                        comm_src_rank,
+                        handle.get_stream());
+      }
+
+      // FIXME: these broadcast operations can be placed between ncclGroupStart() and
+      // ncclGroupEnd()
+      for (int i = 0; i < col_comm_size; ++i) {
+        auto offset = partition.get_vertex_partition_first(row_comm_rank * col_comm_size + i) -
+                      partition.get_vertex_partition_first(row_comm_rank * col_comm_size);
+        auto count = partition.get_vertex_partition_size(row_comm_rank * col_comm_size + i);
+        device_bcast(col_comm,
+                     renumber_map_minor_labels.begin() + offset,
+                     renumber_map_minor_labels.begin() + offset,
+                     count,
+                     i,
+                     handle.get_stream());
+      }
+
+      CUDA_TRY(cudaStreamSynchronize(
+        handle.get_stream()));  // cuco::static_map currently does not take stream
+
+      cuco::static_map<vertex_t, vertex_t> renumber_map{
+        static_cast<size_t>(static_cast<double>(renumber_map_minor_labels.size()) / load_factor),
+        invalid_vertex_id<vertex_t>::value,
+        invalid_vertex_id<vertex_t>::value};
+      auto pair_first = thrust::make_transform_iterator(
+        thrust::make_zip_iterator(thrust::make_tuple(renumber_map_minor_labels.begin(),
+                                                     thrust::make_counting_iterator(minor_first))),
+        [] __device__(auto val) {
+          return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+        });
+      renumber_map.insert(pair_first, pair_first + renumber_map_minor_labels.size());
+      renumber_map.find(edgelist_minor_vertices,
+                        edgelist_minor_vertices + num_edgelist_edges,
+                        edgelist_minor_vertices);
+    }
+  }
+
+  return std::make_tuple(
+    std::move(renumber_map_labels), partition, number_of_vertices, number_of_edges);
+#else
+  return std::make_tuple(
+    rmm::device_uvector<vertex_t>(0, handle.get_stream()),
+    partition_t<vertex_t>(std::vector<vertex_t>(), false, int{0}, int{0}, int{0}, int{0}),
+    vertex_t{0},
+    edge_t{0});
+#endif
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
+  raft::handle_t const& handle,
+  vertex_t const* vertices,
+  vertex_t num_vertices /* relevant only if vertices != nullptr */,
+  vertex_t* edgelist_major_vertices /* [INOUT] */,
+  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+
+#ifdef CUCO_STATIC_MAP_DEFINED
+  if (do_expensive_check) {
+    expensive_check_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                          vertices,
+                                                          num_vertices,
+                                                          edgelist_major_vertices,
+                                                          edgelist_minor_vertices,
+                                                          num_edgelist_edges,
+                                                          false);
+  }
+
+  auto renumber_map_labels =
+    detail::compute_renumber_map<vertex_t, edge_t, multi_gpu>(handle,
+                                                              vertices,
+                                                              num_vertices,
+                                                              edgelist_major_vertices,
+                                                              edgelist_minor_vertices,
+                                                              num_edgelist_edges);
+
+  double constexpr load_factor = 0.7;
+
+  // FIXME: compare this hash based approach with a binary search based approach in both memory
+  // footprint and execution time
+
+  cuco::static_map<vertex_t, vertex_t> renumber_map{
+    static_cast<size_t>(static_cast<double>(renumber_map_labels.size()) / load_factor),
+    invalid_vertex_id<vertex_t>::value,
+    invalid_vertex_id<vertex_t>::value};
+  auto pair_first = thrust::make_transform_iterator(
+    thrust::make_zip_iterator(
+      thrust::make_tuple(renumber_map_labels.begin(), thrust::make_counting_iterator(vertex_t{0}))),
+    [] __device__(auto val) {
+      return thrust::make_pair(thrust::get<0>(val), thrust::get<1>(val));
+    });
+  renumber_map.insert(pair_first, pair_first + renumber_map_labels.size());
+  renumber_map.find(
+    edgelist_major_vertices, edgelist_major_vertices + num_edgelist_edges, edgelist_major_vertices);
+  renumber_map.find(
+    edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices);
+
+  return std::move(renumber_map_labels);
+#else
+  return rmm::device_uvector<vertex_t>(0, handle.get_stream());
+#endif
+}
+
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
+renumber_edgelist(raft::handle_t const& handle,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool is_hypergraph_partitioned,
+                  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                static_cast<vertex_t*>(nullptr),
+                                                                vertex_t{0},
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                num_edgelist_edges,
+                                                                is_hypergraph_partitioned,
+                                                                do_expensive_check);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
+  raft::handle_t const& handle,
+  vertex_t* edgelist_major_vertices /* [INOUT] */,
+  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                static_cast<vertex_t*>(nullptr),
+                                                                vertex_t{0} /* dummy */,
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                num_edgelist_edges,
+                                                                do_expensive_check);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<multi_gpu,
+                 std::tuple<rmm::device_uvector<vertex_t>, partition_t<vertex_t>, vertex_t, edge_t>>
+renumber_edgelist(raft::handle_t const& handle,
+                  vertex_t const* local_vertices,
+                  vertex_t num_local_vertices,
+                  vertex_t* edgelist_major_vertices /* [INOUT] */,
+                  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+                  edge_t num_edgelist_edges,
+                  bool is_hypergraph_partitioned,
+                  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                local_vertices,
+                                                                num_local_vertices,
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                num_edgelist_edges,
+                                                                is_hypergraph_partitioned,
+                                                                do_expensive_check);
+}
+
+template <typename vertex_t, typename edge_t, bool multi_gpu>
+std::enable_if_t<!multi_gpu, rmm::device_uvector<vertex_t>> renumber_edgelist(
+  raft::handle_t const& handle,
+  vertex_t const* vertices,
+  vertex_t num_vertices,
+  vertex_t* edgelist_major_vertices /* [INOUT] */,
+  vertex_t* edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool do_expensive_check)
+{
+  // FIXME: remove this check once we drop Pascal support
+  CUGRAPH_EXPECTS(handle.get_device_properties().major >= 7,
+                  "Relabel not supported on Pascal and older architectures.");
+  return detail::renumber_edgelist<vertex_t, edge_t, multi_gpu>(handle,
+                                                                vertices,
+                                                                num_vertices,
+                                                                edgelist_major_vertices,
+                                                                edgelist_minor_vertices,
+                                                                num_edgelist_edges,
+                                                                do_expensive_check);
+}
+
+// explicit instantiation directives (EIDir's):
+//
+// instantiations for <vertex_t == int32_t, edge_t == int32_t>
+//
+template std::tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int32_t>
+renumber_edgelist<int32_t, int32_t, true>(raft::handle_t const& handle,
+                                          int32_t* edgelist_major_vertices /* [INOUT] */,
+                                          int32_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int32_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int32_t> renumber_edgelist<int32_t, int32_t, false>(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int32_t num_edgelist_edges,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int32_t>
+renumber_edgelist<int32_t, int32_t, true>(raft::handle_t const& handle,
+                                          int32_t const* local_vertices,
+                                          int32_t num_local_vertices,
+                                          int32_t* edgelist_major_vertices /* [INOUT] */,
+                                          int32_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int32_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int32_t> renumber_edgelist<int32_t, int32_t, false>(
+  raft::handle_t const& handle,
+  int32_t const* vertices,
+  int32_t num_vertices,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int32_t num_edgelist_edges,
+  bool do_expensive_check);
+
+// instantiations for <vertex_t == int32_t, edge_t == int64_t>
+//
+template std::tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int64_t>
+renumber_edgelist<int32_t, int64_t, true>(raft::handle_t const& handle,
+                                          int32_t* edgelist_major_vertices /* [INOUT] */,
+                                          int32_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int64_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int32_t> renumber_edgelist<int32_t, int64_t, false>(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int32_t>, partition_t<int32_t>, int32_t, int64_t>
+renumber_edgelist<int32_t, int64_t, true>(raft::handle_t const& handle,
+                                          int32_t const* local_vertices,
+                                          int32_t num_local_vertices,
+                                          int32_t* edgelist_major_vertices /* [INOUT] */,
+                                          int32_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int64_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int32_t> renumber_edgelist<int32_t, int64_t, false>(
+  raft::handle_t const& handle,
+  int32_t const* vertices,
+  int32_t num_vertices,
+  int32_t* edgelist_major_vertices /* [INOUT] */,
+  int32_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+// instantiations for <vertex_t == int64_t, edge_t == int64_t>
+//
+template std::tuple<rmm::device_uvector<int64_t>, partition_t<int64_t>, int64_t, int64_t>
+renumber_edgelist<int64_t, int64_t, true>(raft::handle_t const& handle,
+                                          int64_t* edgelist_major_vertices /* [INOUT] */,
+                                          int64_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int64_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int64_t> renumber_edgelist<int64_t, int64_t, false>(
+  raft::handle_t const& handle,
+  int64_t* edgelist_major_vertices /* [INOUT] */,
+  int64_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+template std::tuple<rmm::device_uvector<int64_t>, partition_t<int64_t>, int64_t, int64_t>
+renumber_edgelist<int64_t, int64_t, true>(raft::handle_t const& handle,
+                                          int64_t const* local_vertices,
+                                          int64_t num_local_vertices,
+                                          int64_t* edgelist_major_vertices /* [INOUT] */,
+                                          int64_t* edgelist_minor_vertices /* [INOUT] */,
+                                          int64_t num_edgelist_edges,
+                                          bool is_hypergraph_partitioned,
+                                          bool do_expensive_check);
+
+template rmm::device_uvector<int64_t> renumber_edgelist<int64_t, int64_t, false>(
+  raft::handle_t const& handle,
+  int64_t const* vertices,
+  int64_t num_vertices,
+  int64_t* edgelist_major_vertices /* [INOUT] */,
+  int64_t* edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool do_expensive_check);
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/sssp.cu b/cpp/src/experimental/sssp.cu
index ebcde1b1444..4996b3734cb 100644
--- a/cpp/src/experimental/sssp.cu
+++ b/cpp/src/experimental/sssp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -249,8 +249,6 @@ void sssp(raft::handle_t const &handle,
     handle.get_stream()));  // this is as necessary vertex_frontier will become out-of-scope once
                             // this function returns (FIXME: should I stream sync in VertexFrontier
                             // destructor?)
-
-  return;
 }
 
 }  // namespace detail
diff --git a/cpp/src/layout/force_atlas2.cu b/cpp/src/layout/force_atlas2.cu
index ef00f504d86..6da9b77b45d 100644
--- a/cpp/src/layout/force_atlas2.cu
+++ b/cpp/src/layout/force_atlas2.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ void force_atlas2(GraphCOOView<vertex_t, edge_t, weight_t> &graph,
                   bool verbose,
                   internals::GraphBasedDimRedCallback *callback)
 {
-  CUGRAPH_EXPECTS(pos != nullptr, "Invalid API parameter: pos array should be of size 2 * V");
+  CUGRAPH_EXPECTS(pos != nullptr, "Invalid input argument: pos array should be of size 2 * V");
   CUGRAPH_EXPECTS(graph.number_of_vertices != 0, "Invalid input: Graph is empty");
 
   if (!barnes_hut_optimize) {
diff --git a/cpp/src/linear_assignment/README-hungarian.md b/cpp/src/linear_assignment/README-hungarian.md
new file mode 100644
index 00000000000..42dabd7cfbc
--- /dev/null
+++ b/cpp/src/linear_assignment/README-hungarian.md
@@ -0,0 +1,36 @@
+# LAP
+Implementation of ***O(n^3) Alternating Tree Variant*** of Hungarian Algorithm on NVIDIA CUDA-enabled GPU.
+
+This implementation solves a batch of ***k*** **Linear Assignment Problems (LAP)**, each with ***nxn*** matrix of single floating point cost values. At optimality, the algorithm produces an assignment with ***minimum*** cost.
+
+The API can be used to query optimal primal and dual costs, optimal assignment vector, and optimal row/column dual vectors for each subproblem in the batch.
+
+cuGraph exposes the Hungarian algorithm, the actual implementation is contained in the RAFT library which contains some common tools and kernels shared between cuGraph and cuML.
+
+Following parameters can be used to tune the performance of algorithm:
+
+1. epsilon: (in raft/lap/lap_kernels.cuh) This parameter controls the tolerance on the floating point precision. Setting this too small will result in increased solution time because the algorithm will search for precise solutions. Setting it too high may cause some inaccuracies.
+
+2. BLOCKDIMX, BLOCKDIMY: (in raft/lap/lap_functions.cuh) These parameters control threads_per_block to be used along the given dimension. Set these according to the device specifications and occupancy calculation.
+
+***This library is licensed under Apache License 2.0. Please cite our paper, if this library helps you in your research.***
+
+- Harvard citation style
+
+  Date, K. and Nagi, R., 2016. GPU-accelerated Hungarian algorithms for the Linear Assignment Problem. Parallel Computing, 57, pp.52-72.
+
+- BibTeX Citation block to be used in LaTeX bibliography file:
+
+```
+@article{date2016gpu,
+  title={GPU-accelerated Hungarian algorithms for the Linear Assignment Problem},
+  author={Date, Ketan and Nagi, Rakesh},
+  journal={Parallel Computing},
+  volume={57},
+  pages={52--72},
+  year={2016},
+  publisher={Elsevier}
+}
+```
+
+The paper is available online on [ScienceDirect](https://www.sciencedirect.com/science/article/abs/pii/S016781911630045X).
diff --git a/cpp/src/linear_assignment/hungarian.cu b/cpp/src/linear_assignment/hungarian.cu
index 164a386c6dd..40f7be52c90 100644
--- a/cpp/src/linear_assignment/hungarian.cu
+++ b/cpp/src/linear_assignment/hungarian.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,9 +67,9 @@ weight_t hungarian_sparse(raft::handle_t const &handle,
                           vertex_t *assignment,
                           cudaStream_t stream)
 {
-  CUGRAPH_EXPECTS(assignment != nullptr, "Invalid API parameter: assignment pointer is NULL");
+  CUGRAPH_EXPECTS(assignment != nullptr, "Invalid input argument: assignment pointer is NULL");
   CUGRAPH_EXPECTS(graph.edge_data != nullptr,
-                  "Invalid API parameter: graph must have edge data (costs)");
+                  "Invalid input argument: graph must have edge data (costs)");
 
 #ifdef TIMING
   HighResTimer hr_timer;
diff --git a/cpp/src/link_analysis/gunrock_hits.cpp b/cpp/src/link_analysis/gunrock_hits.cpp
index 8662c3bea79..5ffaacfe7a6 100644
--- a/cpp/src/link_analysis/gunrock_hits.cpp
+++ b/cpp/src/link_analysis/gunrock_hits.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,9 +42,9 @@ void hits(cugraph::GraphCSRView<vertex_t, edge_t, weight_t> const &graph,
           weight_t *hubs,
           weight_t *authorities)
 {
-  CUGRAPH_EXPECTS(hubs != nullptr, "Invalid API parameter: hubs array should be of size V");
+  CUGRAPH_EXPECTS(hubs != nullptr, "Invalid input argument: hubs array should be of size V");
   CUGRAPH_EXPECTS(authorities != nullptr,
-                  "Invalid API parameter: authorities array should be of size V");
+                  "Invalid input argument: authorities array should be of size V");
 
   //
   //  NOTE:  gunrock doesn't support passing a starting value
diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu
deleted file mode 100644
index e5da24e328d..00000000000
--- a/cpp/src/link_analysis/pagerank.cu
+++ /dev/null
@@ -1,432 +0,0 @@
-/*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.  All rights reserved.
- *
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
- *
- */
-
-// Pagerank solver
-// Author: Alex Fender afender@nvidia.com
-
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <iomanip>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include "cub/cub.cuh"
-
-#include <raft/cudart_utils.h>
-#include <rmm/thrust_rmm_allocator.h>
-#include <utilities/error.hpp>
-
-#include <graph.hpp>
-#include "pagerank_1D.cuh"
-#include "utilities/graph_utils.cuh"
-
-#include <raft/spectral/matrix_wrappers.hpp>
-
-namespace cugraph {
-namespace detail {
-
-#ifdef DEBUG
-#define PR_VERBOSE
-#endif
-
-template <typename IndexType, typename ValueType>
-bool pagerankIteration(raft::handle_t const &handle,
-                       IndexType n,
-                       IndexType e,
-                       IndexType const *cscPtr,
-                       IndexType const *cscInd,
-                       ValueType *cscVal,
-                       ValueType alpha,
-                       ValueType *a,
-                       ValueType *b,
-                       float tolerance,
-                       int iter,
-                       int max_iter,
-                       ValueType *&tmp,
-                       void *cub_d_temp_storage,
-                       size_t cub_temp_storage_bytes,
-                       ValueType *&pr,
-                       ValueType *residual)
-{
-  ValueType dot_res;
-//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000
-#if 1
-  {
-    raft::matrix::sparse_matrix_t<IndexType, ValueType> const r_csr_m{
-      handle, cscPtr, cscInd, cscVal, n, e};
-    r_csr_m.mv(1.0, tmp, 0.0, pr);
-  }
-#else
-  CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage,
-                                  cub_temp_storage_bytes,
-                                  cscVal,
-                                  (IndexType *)cscPtr,
-                                  (IndexType *)cscInd,
-                                  tmp,
-                                  pr,
-                                  n,
-                                  n,
-                                  e));
-#endif
-  scal(n, alpha, pr);
-  dot_res = dot(n, a, tmp);
-  axpy(n, dot_res, b, pr);
-  scal(n, (ValueType)1.0 / nrm2(n, pr), pr);
-  axpy(n, (ValueType)-1.0, pr, tmp);
-  *residual = nrm2(n, tmp);
-  if (*residual < tolerance) {
-    scal(n, (ValueType)1.0 / nrm1(n, pr), pr);
-    return true;
-  } else {
-    if (iter < max_iter) {
-      // FIXME: Copy the pagerank vector results to the tmp vector, since there
-      // are still raw pointers in pagerank pointing to tmp vector locations
-      // that were std::swapped out in the solver.  A thrust::swap would
-      // probably be more efficent if the vectors were passed everywhere instead
-      // of pointers. std::swap is unsafe though. Just copying for now, as this
-      // may soon be replaced by the pattern accelerator.
-      copy(n, pr, tmp);
-    } else {
-      scal(n, (ValueType)1.0 / nrm1(n, pr), pr);
-    }
-    return false;
-  }
-}
-
-template <typename IndexType, typename ValueType>
-int pagerankSolver(raft::handle_t const &handle,
-                   IndexType n,
-                   IndexType e,
-                   IndexType const *cscPtr,
-                   IndexType const *cscInd,
-                   ValueType *cscVal,
-                   IndexType *prsVtx,
-                   ValueType *prsVal,
-                   IndexType prsLen,
-                   bool has_personalization,
-                   ValueType alpha,
-                   ValueType *a,
-                   bool has_guess,
-                   float tolerance,
-                   int max_iter,
-                   ValueType *&pagerank_vector,
-                   ValueType *&residual)
-{
-  int max_it, i = 0;
-  float tol;
-  bool converged              = false;
-  ValueType randomProbability = static_cast<ValueType>(1.0 / n);
-  ValueType *tmp_d{nullptr};
-  ValueType *b_d{nullptr};
-  void *cub_d_temp_storage      = NULL;
-  size_t cub_temp_storage_bytes = 0;
-
-  if (max_iter > 0)
-    max_it = max_iter;
-  else
-    max_it = 500;
-
-  if (tolerance == 0.0f)
-    tol = 1.0E-6f;
-  else if (tolerance < 1.0f && tolerance > 0.0f)
-    tol = tolerance;
-  else
-    return -1;
-
-  if (alpha <= 0.0f || alpha >= 1.0f) return -1;
-
-  rmm::device_vector<ValueType> b(n);
-  b_d = b.data().get();
-
-#if 1 /* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */
-  thrust::device_vector<ValueType> tmp(n);
-  tmp_d = tmp.data().get();
-#else
-  rmm::device_vector<WT> tmp(n);
-  tmp_d = pr.data().get();
-#endif
-  // FIXME: this should take a passed CUDA strema instead of default nullptr
-  CHECK_CUDA(nullptr);
-
-  if (!has_guess) {
-    fill(n, pagerank_vector, randomProbability);
-    fill(n, tmp_d, randomProbability);
-  } else {
-    copy(n, pagerank_vector, tmp_d);
-  }
-
-  if (has_personalization) {
-    ValueType sum = nrm1(prsLen, prsVal);
-    if (static_cast<ValueType>(0) == sum) {
-      fill(n, b_d, randomProbability);
-    } else {
-      scal(n, static_cast<ValueType>(1.0 / sum), prsVal);
-      fill(n, b_d, static_cast<ValueType>(0));
-      scatter(prsLen, prsVal, b_d, prsVtx);
-    }
-  } else {
-    fill(n, b_d, randomProbability);
-  }
-  update_dangling_nodes(n, a, alpha);
-
-//#if defined(CUDART_VERSION) and CUDART_VERSION >= 11000
-#if 1
-  {
-    raft::matrix::sparse_matrix_t<IndexType, ValueType> const r_csr_m{
-      handle, cscPtr, cscInd, cscVal, n, e};
-    r_csr_m.mv(1.0, tmp_d, 0.0, pagerank_vector);
-  }
-#else
-  CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage,
-                                  cub_temp_storage_bytes,
-                                  cscVal,
-                                  (IndexType *)cscPtr,
-                                  (IndexType *)cscInd,
-                                  tmp_d,
-                                  pagerank_vector,
-                                  n,
-                                  n,
-                                  e));
-#endif
-  // Allocate temporary storage
-  rmm::device_buffer cub_temp_storage(cub_temp_storage_bytes);
-  cub_d_temp_storage = cub_temp_storage.data();
-
-#ifdef PR_VERBOSE
-  std::stringstream ss;
-  ss.str(std::string());
-  ss << " ------------------PageRank------------------" << std::endl;
-  ss << " --------------------------------------------" << std::endl;
-  ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl;
-  ss << " --------------------------------------------" << std::endl;
-  std::cout << ss.str();
-#endif
-
-  while (!converged && i < max_it) {
-    i++;
-    converged = pagerankIteration<IndexType, ValueType>(handle,
-                                                        n,
-                                                        e,
-                                                        cscPtr,
-                                                        cscInd,
-                                                        cscVal,
-                                                        alpha,
-                                                        a,
-                                                        b_d,
-                                                        tol,
-                                                        i,
-                                                        max_it,
-                                                        tmp_d,
-                                                        cub_d_temp_storage,
-                                                        cub_temp_storage_bytes,
-                                                        pagerank_vector,
-                                                        residual);
-#ifdef PR_VERBOSE
-    ss.str(std::string());
-    ss << std::setw(10) << i;
-    ss.precision(3);
-    ss << std::setw(15) << std::scientific << *residual << std::endl;
-    std::cout << ss.str();
-#endif
-  }
-#ifdef PR_VERBOSE
-  std::cout << " --------------------------------------------" << std::endl;
-#endif
-
-  return converged ? 0 : 1;
-}
-
-// template int pagerankSolver<int, half> (  int n, int e, int *cscPtr, int *cscInd,half *cscVal,
-// half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half
-// * &residual);
-template int pagerankSolver<int, float>(raft::handle_t const &handle,
-                                        int n,
-                                        int e,
-                                        int const *cscPtr,
-                                        int const *cscInd,
-                                        float *cscVal,
-                                        int *prsVtx,
-                                        float *prsVal,
-                                        int prsLen,
-                                        bool has_personalization,
-                                        float alpha,
-                                        float *a,
-                                        bool has_guess,
-                                        float tolerance,
-                                        int max_iter,
-                                        float *&pagerank_vector,
-                                        float *&residual);
-template int pagerankSolver<int, double>(raft::handle_t const &handle,
-                                         int n,
-                                         int e,
-                                         const int *cscPtr,
-                                         int const *cscInd,
-                                         double *cscVal,
-                                         int *prsVtx,
-                                         double *prsVal,
-                                         int prsLen,
-                                         bool has_personalization,
-                                         double alpha,
-                                         double *a,
-                                         bool has_guess,
-                                         float tolerance,
-                                         int max_iter,
-                                         double *&pagerank_vector,
-                                         double *&residual);
-
-template <typename VT, typename ET, typename WT>
-void pagerank_impl(raft::handle_t const &handle,
-                   GraphCSCView<VT, ET, WT> const &graph,
-                   WT *pagerank,
-                   VT personalization_subset_size = 0,
-                   VT *personalization_subset     = nullptr,
-                   WT *personalization_values     = nullptr,
-                   double alpha                   = 0.85,
-                   double tolerance               = 1e-5,
-                   int64_t max_iter               = 100,
-                   bool has_guess                 = false)
-{
-  bool has_personalization = false;
-  int prsLen               = 0;
-  VT m                     = graph.number_of_vertices;
-  ET nnz                   = graph.number_of_edges;
-  int status{0};
-  WT *d_pr{nullptr}, *d_val{nullptr}, *d_leaf_vector{nullptr};
-  WT res       = 1.0;
-  WT *residual = &res;
-
-  if (personalization_subset_size != 0) {
-    CUGRAPH_EXPECTS(personalization_subset != nullptr,
-                    "Invalid API parameter: personalization_subset array should be of size "
-                    "personalization_subset_size");
-    CUGRAPH_EXPECTS(personalization_values != nullptr,
-                    "Invalid API parameter: personalization_values array should be of size "
-                    "personalization_subset_size");
-    CUGRAPH_EXPECTS(personalization_subset_size <= m,
-                    "Personalization size should be smaller than V");
-    has_personalization = true;
-    prsLen              = static_cast<VT>(personalization_subset_size);
-  }
-
-#if 1 /* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */
-  thrust::device_vector<WT> pr(m);
-  d_pr = pr.data().get();
-#else
-  rmm::device_vector<WT> pr(m);
-  d_pr = pr.data().get();
-#endif
-
-  rmm::device_vector<WT> leaf_vector(m);
-  rmm::device_vector<WT> val(nnz);
-
-  d_leaf_vector = leaf_vector.data().get();
-  d_val         = val.data().get();
-
-  //  The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type
-  HT_matrix_csc_coo(m, nnz, graph.offsets, graph.indices, d_val, d_leaf_vector);
-
-  if (has_guess) { copy<WT>(m, (WT *)pagerank, d_pr); }
-
-  status = pagerankSolver<int32_t, WT>(handle,
-                                       m,
-                                       nnz,
-                                       graph.offsets,
-                                       graph.indices,
-                                       d_val,
-                                       personalization_subset,
-                                       personalization_values,
-                                       prsLen,
-                                       has_personalization,
-                                       alpha,
-                                       d_leaf_vector,
-                                       has_guess,
-                                       tolerance,
-                                       max_iter,
-                                       d_pr,
-                                       residual);
-
-  switch (status) {
-    case 0: break;
-    case -1: CUGRAPH_FAIL("Error : bad parameters in Pagerank");
-    case 1: break;  // Warning : Pagerank did not reached the desired tolerance
-    default: CUGRAPH_FAIL("Pagerank exec failed");
-  }
-
-  copy<WT>(m, d_pr, (WT *)pagerank);
-}
-}  // namespace detail
-
-template <typename VT, typename ET, typename WT>
-void pagerank(raft::handle_t const &handle,
-              GraphCSCView<VT, ET, WT> const &graph,
-              WT *pagerank,
-              VT personalization_subset_size,
-              VT *personalization_subset,
-              WT *personalization_values,
-              double alpha,
-              double tolerance,
-              int64_t max_iter,
-              bool has_guess)
-{
-  CUGRAPH_EXPECTS(pagerank != nullptr, "Invalid API parameter: Pagerank array should be of size V");
-  // Multi-GPU
-  if (handle.comms_initialized()) {
-    CUGRAPH_EXPECTS(has_guess == false,
-                    "Invalid API parameter: Multi-GPU Pagerank does not guess, please use the "
-                    "single GPU version for this feature");
-    CUGRAPH_EXPECTS(max_iter > 0, "The number of iteration must be positive");
-    cugraph::mg::pagerank<VT, ET, WT>(handle,
-                                      graph,
-                                      pagerank,
-                                      personalization_subset_size,
-                                      personalization_subset,
-                                      personalization_values,
-                                      alpha,
-                                      max_iter,
-                                      tolerance);
-  } else  // Single GPU
-    return detail::pagerank_impl<VT, ET, WT>(handle,
-                                             graph,
-                                             pagerank,
-                                             personalization_subset_size,
-                                             personalization_subset,
-                                             personalization_values,
-                                             alpha,
-                                             tolerance,
-                                             max_iter,
-                                             has_guess);
-}
-
-// explicit instantiation
-template void pagerank<int, int, float>(raft::handle_t const &handle,
-                                        GraphCSCView<int, int, float> const &graph,
-                                        float *pagerank,
-                                        int personalization_subset_size,
-                                        int *personalization_subset,
-                                        float *personalization_values,
-                                        double alpha,
-                                        double tolerance,
-                                        int64_t max_iter,
-                                        bool has_guess);
-template void pagerank<int, int, double>(raft::handle_t const &handle,
-                                         GraphCSCView<int, int, double> const &graph,
-                                         double *pagerank,
-                                         int personalization_subset_size,
-                                         int *personalization_subset,
-                                         double *personalization_values,
-                                         double alpha,
-                                         double tolerance,
-                                         int64_t max_iter,
-                                         bool has_guess);
-
-}  // namespace cugraph
diff --git a/cpp/src/link_analysis/pagerank_1D.cu b/cpp/src/link_analysis/pagerank_1D.cu
deleted file mode 100644
index 3774a364cf1..00000000000
--- a/cpp/src/link_analysis/pagerank_1D.cu
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Author: Alex Fender afender@nvidia.com
-
-#include <algorithm>
-#include <graph.hpp>
-#include "pagerank_1D.cuh"
-#include "utilities/graph_utils.cuh"
-
-namespace cugraph {
-namespace mg {
-
-template <typename VT, typename WT>
-__global__ void transition_kernel(const size_t e, const VT *ind, const VT *degree, WT *val)
-{
-  for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
-    val[i] = 1.0 / degree[ind[i]];  // Degree contains IN degree. So all degree[ind[i]] were
-                                    // incremented by definition (no div by 0).
-}
-
-template <typename VT, typename ET, typename WT>
-Pagerank<VT, ET, WT>::Pagerank(const raft::handle_t &handle_, GraphCSCView<VT, ET, WT> const &G)
-  : comm(handle_.get_comms()),
-    bookmark(G.number_of_vertices),
-    prev_pr(G.number_of_vertices),
-    val(G.local_edges[comm.get_rank()]),
-    handle(handle_),
-    has_personalization(false)
-{
-  v_glob         = G.number_of_vertices;
-  v_loc          = G.local_vertices[comm.get_rank()];
-  e_loc          = G.local_edges[comm.get_rank()];
-  part_off       = G.local_offsets;
-  local_vertices = G.local_vertices;
-  off            = G.offsets;
-  ind            = G.indices;
-  blocks         = handle_.get_device_properties().maxGridSize[0];
-  threads        = handle_.get_device_properties().maxThreadsPerBlock;
-  sm_count       = handle_.get_device_properties().multiProcessorCount;
-
-  is_setup = false;
-}
-
-template <typename VT, typename ET, typename WT>
-Pagerank<VT, ET, WT>::~Pagerank()
-{
-}
-
-template <typename VT, typename ET, typename WT>
-void Pagerank<VT, ET, WT>::transition_vals(const VT *degree)
-{
-  if (e_loc > 0) {
-    int threads = std::min(e_loc, this->threads);
-    int blocks  = std::min(32 * sm_count, this->blocks);
-    transition_kernel<VT, WT><<<blocks, threads>>>(e_loc, ind, degree, val.data().get());
-    CHECK_CUDA(nullptr);
-  }
-}
-
-template <typename VT, typename ET, typename WT>
-void Pagerank<VT, ET, WT>::flag_leafs(const VT *degree)
-{
-  if (v_glob > 0) {
-    int threads = std::min(v_glob, this->threads);
-    int blocks  = std::min(32 * sm_count, this->blocks);
-    cugraph::detail::flag_leafs_kernel<VT, WT>
-      <<<blocks, threads>>>(v_glob, degree, bookmark.data().get());
-    CHECK_CUDA(nullptr);
-  }
-}
-
-// Artificially create the google matrix by setting val and bookmark
-template <typename VT, typename ET, typename WT>
-void Pagerank<VT, ET, WT>::setup(WT _alpha,
-                                 VT *degree,
-                                 VT personalization_subset_size,
-                                 VT *personalization_subset,
-                                 WT *personalization_values)
-{
-  if (!is_setup) {
-    alpha   = _alpha;
-    WT zero = 0.0;
-    WT one  = 1.0;
-    // Update dangling node vector
-    cugraph::detail::fill(v_glob, bookmark.data().get(), zero);
-    flag_leafs(degree);
-    cugraph::detail::update_dangling_nodes(v_glob, bookmark.data().get(), alpha);
-
-    // Transition matrix
-    transition_vals(degree);
-
-    // personalize
-    if (personalization_subset_size != 0) {
-      CUGRAPH_EXPECTS(personalization_subset != nullptr,
-                      "Invalid API parameter: personalization_subset array should be of size "
-                      "personalization_subset_size");
-      CUGRAPH_EXPECTS(personalization_values != nullptr,
-                      "Invalid API parameter: personalization_values array should be of size "
-                      "personalization_subset_size");
-      CUGRAPH_EXPECTS(personalization_subset_size <= v_glob,
-                      "Personalization size should be smaller than V");
-
-      WT sum = cugraph::detail::nrm1(personalization_subset_size, personalization_values);
-      if (sum != zero) {
-        has_personalization = true;
-        personalization_vector.resize(v_glob);
-        cugraph::detail::fill(v_glob, personalization_vector.data().get(), zero);
-        cugraph::detail::scal(v_glob, one / sum, personalization_values);
-        cugraph::detail::scatter(personalization_subset_size,
-                                 personalization_values,
-                                 personalization_vector.data().get(),
-                                 personalization_subset);
-      }
-    }
-    is_setup = true;
-  } else
-    CUGRAPH_FAIL("MG PageRank : Setup can be called only once");
-}
-
-// run the power iteration on the google matrix
-template <typename VT, typename ET, typename WT>
-int Pagerank<VT, ET, WT>::solve(int max_iter, float tolerance, WT *pagerank)
-{
-  if (is_setup) {
-    WT dot_res;
-    WT one = 1.0;
-    WT *pr = pagerank;
-    cugraph::detail::fill(v_glob, pagerank, one / v_glob);
-    cugraph::detail::fill(v_glob, prev_pr.data().get(), one / v_glob);
-    // This cuda sync was added to fix #426
-    // This should not be requiered in theory
-    // This is not needed on one GPU at this time
-    cudaDeviceSynchronize();
-    dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr);
-    MGcsrmv<VT, ET, WT> spmv_solver(
-      handle, local_vertices, part_off, off, ind, val.data().get(), pagerank);
-
-    WT residual;
-    int i;
-    for (i = 0; i < max_iter; ++i) {
-      spmv_solver.run(pagerank);
-      cugraph::detail::scal(v_glob, alpha, pr);
-
-      // personalization
-      if (has_personalization)
-        cugraph::detail::axpy(v_glob, dot_res, personalization_vector.data().get(), pr);
-      else
-        cugraph::detail::addv(v_glob, dot_res * (one / v_glob), pr);
-
-      dot_res = cugraph::detail::dot(v_glob, bookmark.data().get(), pr);
-      cugraph::detail::scal(v_glob, one / cugraph::detail::nrm2(v_glob, pr), pr);
-
-      // convergence check
-      cugraph::detail::axpy(v_glob, (WT)-1.0, pr, prev_pr.data().get());
-      residual = cugraph::detail::nrm2(v_glob, prev_pr.data().get());
-      if (residual < tolerance)
-        break;
-      else
-        cugraph::detail::copy(v_glob, pr, prev_pr.data().get());
-    }
-    cugraph::detail::scal(v_glob, one / cugraph::detail::nrm1(v_glob, pr), pr);
-    return i;
-  } else {
-    CUGRAPH_FAIL("MG PageRank : Solve was called before setup");
-  }
-}
-
-template class Pagerank<int, int, double>;
-template class Pagerank<int, int, float>;
-
-}  // namespace mg
-}  // namespace cugraph
-
-#include "utilities/eidir_graph_utils.hpp"
diff --git a/cpp/src/link_analysis/pagerank_1D.cuh b/cpp/src/link_analysis/pagerank_1D.cuh
deleted file mode 100644
index feb410daa9a..00000000000
--- a/cpp/src/link_analysis/pagerank_1D.cuh
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Author: Alex Fender afender@nvidia.com
-
-#pragma once
-
-#include <rmm/thrust_rmm_allocator.h>
-#include <numeric>
-#include <raft/handle.hpp>
-
-#include "utilities/error.hpp"
-#include "utilities/spmv_1D.cuh"
-
-namespace cugraph {
-namespace mg {
-
-template <typename VT, typename ET, typename WT>
-class Pagerank {
- private:
-  VT v_glob{};  // global number of vertices
-  VT v_loc{};   // local number of vertices
-  ET e_loc{};   // local number of edges
-  WT alpha{};   // damping factor
-  bool has_personalization;
-  // CUDA
-  const raft::comms::comms_t &comm;  // info about the mg comm setup
-  cudaStream_t stream;
-  int blocks;
-  int threads;
-  int sm_count;
-
-  // Vertex offsets for each partition.
-  VT *part_off;
-  VT *local_vertices;
-
-  // Google matrix
-  ET *off;
-  VT *ind;
-
-  rmm::device_vector<WT> val;                     // values of the substochastic matrix
-  rmm::device_vector<WT> bookmark;                // constant vector with dangling node info
-  rmm::device_vector<WT> prev_pr;                 // record the last pagerank for convergence check
-  rmm::device_vector<WT> personalization_vector;  // personalization vector after reconstruction
-
-  bool is_setup;
-  raft::handle_t const &handle;  // raft handle propagation for SpMV, etc.
-
- public:
-  Pagerank(const raft::handle_t &handle, const GraphCSCView<VT, ET, WT> &G);
-  ~Pagerank();
-
-  void transition_vals(const VT *degree);
-
-  void flag_leafs(const VT *degree);
-
-  // Artificially create the google matrix by setting val and bookmark
-  void setup(WT _alpha,
-             VT *degree,
-             VT personalization_subset_size,
-             VT *personalization_subset,
-             WT *personalization_values);
-
-  // run the power iteration on the google matrix, return the number of iterations
-  int solve(int max_iter, float tolerance, WT *pagerank);
-};
-
-template <typename VT, typename ET, typename WT>
-int pagerank(raft::handle_t const &handle,
-             const GraphCSCView<VT, ET, WT> &G,
-             WT *pagerank_result,
-             VT personalization_subset_size,
-             VT *personalization_subset,
-             WT *personalization_values,
-             const double damping_factor = 0.85,
-             const int64_t n_iter        = 100,
-             const double tolerance      = 1e-5)
-{
-  // null pointers check
-  CUGRAPH_EXPECTS(G.offsets != nullptr, "Invalid API parameter - offsets is null");
-  CUGRAPH_EXPECTS(G.indices != nullptr, "Invalid API parameter - indidices is null");
-  CUGRAPH_EXPECTS(pagerank_result != nullptr,
-                  "Invalid API parameter - pagerank output memory must be allocated");
-
-  // parameter values
-  CUGRAPH_EXPECTS(damping_factor > 0.0,
-                  "Invalid API parameter - invalid damping factor value (alpha<0)");
-  CUGRAPH_EXPECTS(damping_factor < 1.0,
-                  "Invalid API parameter - invalid damping factor value (alpha>1)");
-  CUGRAPH_EXPECTS(n_iter > 0, "Invalid API parameter - n_iter must be > 0");
-
-  rmm::device_vector<VT> degree(G.number_of_vertices);
-
-  // in-degree of CSC (equivalent to out-degree of original edge list)
-  G.degree(degree.data().get(), DegreeDirection::IN);
-
-  // Allocate and intialize Pagerank class
-  Pagerank<VT, ET, WT> pr_solver(handle, G);
-
-  // Set all constants info
-  pr_solver.setup(damping_factor,
-                  degree.data().get(),
-                  personalization_subset_size,
-                  personalization_subset,
-                  personalization_values);
-
-  // Run pagerank
-  return pr_solver.solve(n_iter, tolerance, pagerank_result);
-}
-
-}  // namespace mg
-}  // namespace cugraph
diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu
index 70952974b39..83a4ec6e713 100644
--- a/cpp/src/link_prediction/jaccard.cu
+++ b/cpp/src/link_prediction/jaccard.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -315,7 +315,7 @@ int jaccard_pairs(vertex_t n,
 template <typename VT, typename ET, typename WT>
 void jaccard(GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result)
 {
-  CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL");
+  CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL");
 
   rmm::device_vector<WT> weight_i(graph.number_of_edges);
   rmm::device_vector<WT> weight_s(graph.number_of_edges);
@@ -352,9 +352,9 @@ void jaccard_list(GraphCSRView<VT, ET, WT> const &graph,
                   VT const *second,
                   WT *result)
 {
-  CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL");
-  CUGRAPH_EXPECTS(first != nullptr, "Invalid API parameter: first is NULL");
-  CUGRAPH_EXPECTS(second != nullptr, "Invalid API parameter: second in NULL");
+  CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL");
+  CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL");
+  CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second in NULL");
 
   rmm::device_vector<WT> weight_i(num_pairs, WT{0.0});
   rmm::device_vector<WT> weight_s(num_pairs);
diff --git a/cpp/src/link_prediction/overlap.cu b/cpp/src/link_prediction/overlap.cu
index e3f80b50d9a..83fdc799649 100644
--- a/cpp/src/link_prediction/overlap.cu
+++ b/cpp/src/link_prediction/overlap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -316,7 +316,7 @@ int overlap_pairs(vertex_t n,
 template <typename VT, typename ET, typename WT>
 void overlap(GraphCSRView<VT, ET, WT> const &graph, WT const *weights, WT *result)
 {
-  CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL");
+  CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL");
 
   rmm::device_vector<WT> weight_i(graph.number_of_edges);
   rmm::device_vector<WT> weight_s(graph.number_of_edges);
@@ -353,9 +353,9 @@ void overlap_list(GraphCSRView<VT, ET, WT> const &graph,
                   VT const *second,
                   WT *result)
 {
-  CUGRAPH_EXPECTS(result != nullptr, "Invalid API parameter: result pointer is NULL");
-  CUGRAPH_EXPECTS(first != nullptr, "Invalid API parameter: first column is NULL");
-  CUGRAPH_EXPECTS(second != nullptr, "Invalid API parameter: second column is NULL");
+  CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL");
+  CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL");
+  CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL");
 
   rmm::device_vector<WT> weight_i(num_pairs);
   rmm::device_vector<WT> weight_s(num_pairs);
diff --git a/cpp/src/structure/graph.cu b/cpp/src/structure/graph.cu
index 63ef725c3b7..056ad39fefc 100644
--- a/cpp/src/structure/graph.cu
+++ b/cpp/src/structure/graph.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -148,3 +148,5 @@ template class GraphCOOView<int32_t, int32_t, double>;
 template class GraphCompressedSparseBaseView<int32_t, int32_t, float>;
 template class GraphCompressedSparseBaseView<int32_t, int32_t, double>;
 }  // namespace cugraph
+
+#include "utilities/eidir_graph_utils.hpp"
diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu
index 4018c9d9878..6ffbbbf462b 100644
--- a/cpp/src/traversal/sssp.cu
+++ b/cpp/src/traversal/sssp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -247,7 +247,7 @@ void sssp(GraphCSRView<VT, ET, WT> const &graph,
           VT *predecessors,
           const VT source_vertex)
 {
-  CUGRAPH_EXPECTS(distances || predecessors, "Invalid API parameter, both outputs are nullptr");
+  CUGRAPH_EXPECTS(distances || predecessors, "Invalid input argument, both outputs are nullptr");
 
   if (typeid(VT) != typeid(int)) CUGRAPH_FAIL("Unsupported vertex id data type, please use int");
   if (typeid(ET) != typeid(int)) CUGRAPH_FAIL("Unsupported edge id data type, please use int");
diff --git a/cpp/src/traversal/tsp.cu b/cpp/src/traversal/tsp.cu
new file mode 100644
index 00000000000..c669246bc49
--- /dev/null
+++ b/cpp/src/traversal/tsp.cu
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <numeric>
+#include <raft/spatial/knn/knn.hpp>
+
+#include "tsp.hpp"
+#include "tsp_solver.hpp"
+
+namespace cugraph {
+namespace detail {
+
+TSP::TSP(raft::handle_t &handle,
+         int const *vtx_ptr,
+         float const *x_pos,
+         float const *y_pos,
+         int nodes,
+         int restarts,
+         bool beam_search,
+         int k,
+         int nstart,
+         bool verbose,
+         int *route)
+  : handle_(handle),
+    vtx_ptr_(vtx_ptr),
+    x_pos_(x_pos),
+    y_pos_(y_pos),
+    nodes_(nodes),
+    restarts_(restarts),
+    beam_search_(beam_search),
+    k_(k),
+    nstart_(nstart),
+    verbose_(verbose),
+    route_(route),
+    stream_(handle_.get_stream()),
+    max_blocks_(handle_.get_device_properties().maxGridSize[0]),
+    max_threads_(handle_.get_device_properties().maxThreadsPerBlock),
+    warp_size_(handle_.get_device_properties().warpSize),
+    sm_count_(handle_.get_device_properties().multiProcessorCount),
+    restart_batch_(4096)
+{
+  allocate();
+}
+
+void TSP::allocate()
+{
+  // Scalars
+  mylock_    = mylock_scalar_.data();
+  best_tour_ = best_tour_scalar_.data();
+  climbs_    = climbs_scalar_.data();
+
+  // Vectors
+  neighbors_vec_.resize((k_ + 1) * nodes_);
+  // pre-allocate workspace for climbs, each block needs a separate permutation space and search
+  // buffer. We allocate a work buffer that will store the computed distances, px, py and the route.
+  // We align it on the warp size.
+  work_vec_.resize(sizeof(float) * restart_batch_ *
+                   ((4 * nodes_ + 3 + warp_size_ - 1) / warp_size_ * warp_size_));
+
+  // Pointers
+  neighbors_ = neighbors_vec_.data().get();
+  work_      = work_vec_.data().get();
+}
+
+float TSP::compute()
+{
+  float valid_coo_dist    = 0.f;
+  int num_restart_batches = (restarts_ + restart_batch_ - 1) / restart_batch_;
+  int restart_resid       = restarts_ - (num_restart_batches - 1) * restart_batch_;
+  int global_best         = INT_MAX;
+  float *soln             = nullptr;
+  int *route_sol          = nullptr;
+  int best                = 0;
+  std::vector<float> h_x_pos;
+  std::vector<float> h_y_pos;
+  h_x_pos.reserve(nodes_ + 1);
+  h_y_pos.reserve(nodes_ + 1);
+
+  // Stats
+  int n_timers      = 3;
+  long total_climbs = 0;
+  std::vector<float> h_times;
+  struct timeval starttime, endtime;
+
+  // KNN call
+  knn();
+
+  if (verbose_) {
+    std::cout << "Doing " << num_restart_batches - 1 << " batches of size " << restart_batch_
+              << ", with " << restart_resid << " tail\n";
+    std::cout << "configuration: " << nodes_ << " nodes, " << restarts_ << " restart\n";
+    std::cout << "optimizing graph with kswap = " << kswaps << "\n";
+  }
+
+  // Tell the cache how we want it to behave
+  cudaFuncSetCacheConfig(search_solution, cudaFuncCachePreferEqual);
+
+  int threads = best_thread_count(nodes_, max_threads_, sm_count_, warp_size_);
+  if (verbose_) std::cout << "Calculated best thread number = " << threads << "\n";
+
+  rmm::device_vector<float> times(n_timers * threads + n_timers);
+  h_times.reserve(n_timers * threads + n_timers);
+
+  gettimeofday(&starttime, NULL);
+  for (int b = 0; b < num_restart_batches; ++b) {
+    reset<<<1, 1, 0, stream_>>>(mylock_, best_tour_, climbs_);
+    CHECK_CUDA(stream_);
+
+    if (b == num_restart_batches - 1) restart_batch_ = restart_resid;
+
+    search_solution<<<restart_batch_, threads, sizeof(int) * threads, stream_>>>(mylock_,
+                                                                                 best_tour_,
+                                                                                 vtx_ptr_,
+                                                                                 beam_search_,
+                                                                                 k_,
+                                                                                 nodes_,
+                                                                                 neighbors_,
+                                                                                 x_pos_,
+                                                                                 y_pos_,
+                                                                                 work_,
+                                                                                 nstart_,
+                                                                                 times.data().get(),
+                                                                                 climbs_,
+                                                                                 threads);
+
+    CHECK_CUDA(stream_);
+    cudaDeviceSynchronize();
+
+    CUDA_TRY(cudaMemcpy(&best, best_tour_, sizeof(int), cudaMemcpyDeviceToHost));
+    cudaDeviceSynchronize();
+    if (verbose_) std::cout << "Best reported by kernel = " << best << "\n";
+
+    if (best < global_best) {
+      global_best = best;
+      CUDA_TRY(cudaMemcpyFromSymbol(&soln, best_soln, sizeof(void *)));
+      cudaDeviceSynchronize();
+      CUDA_TRY(cudaMemcpyFromSymbol(&route_sol, best_route, sizeof(void *)));
+      cudaDeviceSynchronize();
+    }
+    total_climbs += climbs_scalar_.value(stream_);
+  }
+  gettimeofday(&endtime, NULL);
+  double runtime =
+    endtime.tv_sec + endtime.tv_usec / 1e6 - starttime.tv_sec - starttime.tv_usec / 1e6;
+  long long moves = 1LL * total_climbs * (nodes_ - 2) * (nodes_ - 1) / 2;
+
+  raft::copy(route_, route_sol, nodes_, stream_);
+
+  CUDA_TRY(cudaMemcpy(h_x_pos.data(), soln, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize();
+  CUDA_TRY(cudaMemcpy(
+    h_y_pos.data(), soln + nodes_ + 1, sizeof(float) * (nodes_ + 1), cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize();
+
+  for (int i = 0; i < nodes_; ++i) {
+    if (verbose_) { std::cout << h_x_pos[i] << " " << h_y_pos[i] << "\n"; }
+    valid_coo_dist += euclidean_dist(h_x_pos.data(), h_y_pos.data(), i, i + 1);
+  }
+
+  CUDA_TRY(cudaMemcpy(h_times.data(),
+                      times.data().get(),
+                      sizeof(float) * n_timers * threads + n_timers,
+                      cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize();
+
+  if (verbose_) {
+    std::cout << "Search runtime = " << runtime << ", " << moves * 1e-9 / runtime << " Gmoves/s\n";
+    std::cout << "Optimized tour length = " << global_best << "\n";
+    print_times(h_times, n_timers, handle_.get_device(), threads);
+  }
+
+  return valid_coo_dist;
+}
+
+void TSP::knn()
+{
+  if (verbose_) std::cout << "Looking at " << k_ << " nearest neighbors\n";
+
+  int dim              = 2;
+  bool row_major_order = false;
+
+  rmm::device_vector<float> input(nodes_ * dim);
+  float *input_ptr = input.data().get();
+  raft::copy(input_ptr, x_pos_, nodes_, stream_);
+  raft::copy(input_ptr + nodes_, y_pos_, nodes_, stream_);
+
+  rmm::device_vector<float> search_data(nodes_ * dim);
+  float *search_data_ptr = search_data.data().get();
+  raft::copy(search_data_ptr, input_ptr, nodes_ * dim, stream_);
+
+  rmm::device_vector<float> distances(nodes_ * (k_ + 1));
+  float *distances_ptr = distances.data().get();
+
+  std::vector<float *> input_vec;
+  std::vector<int> sizes_vec;
+  input_vec.push_back(input_ptr);
+  sizes_vec.push_back(nodes_);
+
+  // k neighbors + 1 is needed because the nearest neighbor of each point is
+  // the point itself that we don't want to take into account.
+
+  raft::spatial::knn::brute_force_knn(handle_,
+                                      input_vec,
+                                      sizes_vec,
+                                      dim,
+                                      search_data_ptr,
+                                      nodes_,
+                                      neighbors_,
+                                      distances_ptr,
+                                      k_ + 1,
+                                      row_major_order,
+                                      row_major_order);
+}
+}  // namespace detail
+
+float traveling_salesperson(raft::handle_t &handle,
+                            int const *vtx_ptr,
+                            float const *x_pos,
+                            float const *y_pos,
+                            int nodes,
+                            int restarts,
+                            bool beam_search,
+                            int k,
+                            int nstart,
+                            bool verbose,
+                            int *route)
+{
+  RAFT_EXPECTS(route != nullptr, "route should equal the number of nodes");
+  RAFT_EXPECTS(nodes > 0, "nodes should be strictly positive");
+  RAFT_EXPECTS(restarts > 0, "restarts should be strictly positive");
+  RAFT_EXPECTS(nstart >= 0 && nstart < nodes, "nstart should be between 0 and nodes - 1");
+  RAFT_EXPECTS(k > 0, "k should be strictly positive");
+
+  cugraph::detail::TSP tsp(
+    handle, vtx_ptr, x_pos, y_pos, nodes, restarts, beam_search, k, nstart, verbose, route);
+  return tsp.compute();
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/traversal/tsp.hpp b/cpp/src/traversal/tsp.hpp
new file mode 100644
index 00000000000..b065b779b96
--- /dev/null
+++ b/cpp/src/traversal/tsp.hpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cudart_utils.h>
+#include <rmm/thrust_rmm_allocator.h>
+#include <algorithms.hpp>
+#include <raft/handle.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace cugraph {
+namespace detail {
+class TSP {
+ public:
+  TSP(raft::handle_t &handle,
+      int const *vtx_ptr,
+      float const *x_pos,
+      float const *y_pos,
+      int nodes,
+      int restarts,
+      bool beam_search,
+      int k,
+      int nstart,
+      bool verbose,
+      int *route);
+
+  void allocate();
+  float compute();
+  void knn();
+  ~TSP(){};
+
+ private:
+  // Config
+  raft::handle_t &handle_;
+  cudaStream_t stream_;
+  int max_blocks_;
+  int max_threads_;
+  int warp_size_;
+  int sm_count_;
+  // how large a grid we want to run, this is fixed
+  int restart_batch_;
+
+  // TSP
+  int const *vtx_ptr_;
+  int *route_;
+  float const *x_pos_;
+  float const *y_pos_;
+  int nodes_;
+  int restarts_;
+  bool beam_search_;
+  int k_;
+  int nstart_;
+  bool verbose_;
+
+  // Scalars
+  rmm::device_scalar<int> mylock_scalar_;
+  rmm::device_scalar<int> best_tour_scalar_;
+  rmm::device_scalar<int> climbs_scalar_;
+
+  int *mylock_;
+  int *best_tour_;
+  int *climbs_;
+
+  // Vectors
+  rmm::device_vector<int64_t> neighbors_vec_;
+  rmm::device_vector<int> work_vec_;
+
+  int64_t *neighbors_;
+  int *work_;
+  int *work_route_;
+};
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/traversal/tsp_solver.hpp b/cpp/src/traversal/tsp_solver.hpp
new file mode 100644
index 00000000000..20d826cac5c
--- /dev/null
+++ b/cpp/src/traversal/tsp_solver.hpp
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <curand_kernel.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <raft/cuda_utils.cuh>
+
+#include "tsp_utils.hpp"
+
+namespace cugraph {
+namespace detail {
+
+__device__ float *best_soln;
+__device__ int *best_route;
+extern __shared__ int shbuf[];
+
+__global__ void reset(int *mylock, int *best_tour, int *climbs)
+{
+  *mylock    = 0;
+  *best_tour = INT_MAX;
+  *climbs    = 0;
+  best_soln  = nullptr;
+  best_route = nullptr;
+}
+
+// random permutation kernel
+__device__ void random_init(float const *posx,
+                            float const *posy,
+                            int const *vtx_ptr,
+                            int *path,
+                            float *px,
+                            float *py,
+                            int const nstart,
+                            int const nodes)
+{
+  // Fill values
+  for (int i = threadIdx.x; i <= nodes; i += blockDim.x) {
+    px[i]   = posx[i];
+    py[i]   = posy[i];
+    path[i] = vtx_ptr[i];
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) { /* serial permutation as starting point */
+    // swap to start at nstart node
+    raft::swapVals(px[0], px[nstart]);
+    raft::swapVals(py[0], py[nstart]);
+    raft::swapVals(path[0], path[nstart]);
+
+    curandState rndstate;
+    curand_init(blockIdx.x, 0, 0, &rndstate);
+    for (int i = 1; i < nodes; i++) {
+      int j = curand(&rndstate) % (nodes - 1 - i) + i;
+      if (i == j) continue;
+      raft::swapVals(px[i], px[j]);
+      raft::swapVals(py[i], py[j]);
+      raft::swapVals(path[i], path[j]);
+    }
+    px[nodes]   = px[0]; /* close the loop now, avoid special cases later */
+    py[nodes]   = py[0];
+    path[nodes] = path[0];
+  }
+}
+
+// Use KNN as a starting solution
+__device__ void knn_init(float const *posx,
+                         float const *posy,
+                         int const *vtx_ptr,
+                         int64_t const *neighbors,
+                         int *buf,
+                         int *path,
+                         float *px,
+                         float *py,
+                         int const nstart,
+                         int const nodes,
+                         int const K)
+{
+  for (int i = threadIdx.x; i < nodes; i += blockDim.x) buf[i] = 0;
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    curandState rndstate;
+    curand_init(blockIdx.x, 0, 0, &rndstate);
+    int progress = 0;
+    int initlen  = 0;
+
+    px[0]     = posx[nstart];
+    py[0]     = posy[nstart];
+    path[0]   = vtx_ptr[nstart];
+    int head  = nstart;
+    int v     = 0;
+    buf[head] = 1;
+    while (progress < nodes - 1) {  // beam search as starting point
+      for (int i = 1; i <= progress; i++) buf[i] = 0;
+      progress      = 0;  // reset current location in path and visited array
+      initlen       = 0;
+      int randjumps = 0;
+      while (progress < nodes - 1) {
+        int nj     = curand(&rndstate) % K;
+        int linked = 0;
+        for (int nh = 0; nh < K; ++nh) {
+          // offset (idx / K) + 1 filters the points as their own nearest neighbors.
+          int offset = (K * head + nj) / K + 1;
+          v          = neighbors[K * head + nj + offset];
+          if (v < nodes && buf[v] == 0) {
+            head = v;
+            progress += 1;
+            buf[head] = 1;
+            linked    = 1;
+            break;
+          }
+          nj = (nj + 1) % K;
+        }
+        if (linked == 0) {
+          if (randjumps > nodes - 1)
+            break;  // give up on this traversal, we failed to find a next link
+          randjumps += 1;
+          int nr = (head + 1) % nodes;  // jump to next node
+          while (buf[nr] == 1) { nr = (nr + 1) % nodes; }
+          head = nr;
+          progress += 1;
+          buf[head] = 1;
+        }
+        // copy from input into beam-search order, update len
+        px[progress]   = posx[head];
+        py[progress]   = posy[head];
+        path[progress] = vtx_ptr[head];
+        initlen += __float2int_rn(euclidean_dist(px, py, progress, progress - 1));
+      }
+    }
+    px[nodes]   = px[nstart];
+    py[nodes]   = py[nstart];
+    path[nodes] = path[nstart];
+    initlen += __float2int_rn(euclidean_dist(px, py, nodes, nstart));
+  }
+}
+
+__device__ void two_opt_search(
+  int *buf, float *px, float *py, int *shbuf, int *minchange, int *mini, int *minj, int const nodes)
+{
+  __shared__ float shmem_x[tilesize];
+  __shared__ float shmem_y[tilesize];
+
+  for (int ii = 0; ii < nodes - 2; ii += blockDim.x) {
+    int i = ii + threadIdx.x;
+    float pxi0, pyi0, pxi1, pyi1, pxj1, pyj1;
+    if (i < nodes - 2) {
+      minchange[0] -= buf[i];
+      pxi0 = px[i];
+      pyi0 = py[i];
+      pxi1 = px[i + 1];
+      pyi1 = py[i + 1];
+      pxj1 = px[nodes];
+      pyj1 = py[nodes];
+    }
+    for (int jj = nodes - 1; jj >= ii + 2; jj -= tilesize) {
+      int bound = jj - tilesize + 1;
+      for (int k = threadIdx.x; k < tilesize; k += blockDim.x) {
+        if (k + bound >= ii + 2) {
+          shmem_x[k] = px[k + bound];
+          shmem_y[k] = py[k + bound];
+          shbuf[k]   = buf[k + bound];
+        }
+      }
+      __syncthreads();
+
+      int lower = bound;
+      if (lower < (i + 2)) lower = i + 2;
+      for (int j = jj; j >= lower; j--) {
+        int jm     = j - bound;
+        float pxj0 = shmem_x[jm];
+        float pyj0 = shmem_y[jm];
+        int delta =
+          shbuf[jm] +
+          __float2int_rn(sqrtf((pxi0 - pxj0) * (pxi0 - pxj0) + (pyi0 - pyj0) * (pyi0 - pyj0))) +
+          __float2int_rn(sqrtf((pxi1 - pxj1) * (pxi1 - pxj1) + (pyi1 - pyj1) * (pyi1 - pyj1)));
+        pxj1 = pxj0;
+        pyj1 = pyj0;
+
+        if (delta < minchange[0]) {
+          minchange[0] = delta;
+          mini[0]      = i;
+          minj[0]      = j;
+        }
+      }
+      __syncthreads();
+    }
+
+    if (i < nodes - 2) { minchange[0] += buf[i]; }
+  }
+}
+
+// This function being runned for each block
+__device__ void hill_climbing(
+  float *px, float *py, int *buf, int *path, int *shbuf, int const nodes, int *climbs)
+{
+  __shared__ int best_change[kswaps];
+  __shared__ int best_i[kswaps];
+  __shared__ int best_j[kswaps];
+
+  int minchange;
+  int mini;
+  int minj;
+  int kswaps_active = kswaps;
+  int myswaps       = 0;
+
+  // Hill climbing, iteratively improve from the starting guess
+  do {
+    if (threadIdx.x == 0) {
+      for (int k = 0; k < kswaps; k++) {
+        best_change[k] = 0;
+        best_i[k]      = 0;
+        best_j[k]      = 0;
+      }
+    }
+    __syncthreads();
+    for (int i = threadIdx.x; i < nodes; i += blockDim.x) {
+      buf[i] = -__float2int_rn(euclidean_dist(px, py, i, i + 1));
+    }
+    __syncthreads();
+
+    // Reset
+    minchange = 0;
+    mini      = 0;
+    minj      = 0;
+
+    // Find best indices
+    two_opt_search(buf, px, py, shbuf, &minchange, &mini, &minj, nodes);
+    __syncthreads();
+
+    // Stats only
+    if (threadIdx.x == 0) atomicAdd(climbs, 1);
+
+    shbuf[threadIdx.x] = minchange;
+
+    int j = blockDim.x;  // warp reduction to find best thread results
+    do {
+      int k = (j + 1) / 2;
+      if ((threadIdx.x + k) < j) {
+        shbuf[threadIdx.x] = min(shbuf[threadIdx.x + k], shbuf[threadIdx.x]);
+      }
+      j = k;
+      __syncthreads();
+    } while (j > 1);  // thread winner for this k is in shbuf[0]
+
+    if (threadIdx.x == 0) {
+      best_change[0] = shbuf[0];  // sort best result in shared
+    }
+    __syncthreads();
+
+    if (minchange == shbuf[0]) {  // My thread is as good as the winner
+      shbuf[1] = threadIdx.x;     // store thread ID in shbuf[1]
+    }
+    __syncthreads();
+
+    if (threadIdx.x == shbuf[1]) {  // move from thread local to shared
+      best_i[0] = mini;             // shared best indices for compatibility checks
+      best_j[0] = minj;
+    }
+    __syncthreads();
+
+    // look for more compatible swaps
+    for (int kmin = 1; kmin < kswaps_active; kmin++) {
+      // disallow swaps that conflict with ones already picked
+      for (int kchk = kmin - 1; kchk >= 0; --kchk) {
+        if ((mini < (best_j[kchk] + 1)) && (minj > (best_i[kchk] - 1))) {
+          minchange = shbuf[threadIdx.x] = 0;
+        }
+        __syncthreads();
+      }
+      shbuf[threadIdx.x] = minchange;
+
+      j = blockDim.x;
+      do {
+        int k = (j + 1) / 2;
+        if ((threadIdx.x + k) < j) {
+          shbuf[threadIdx.x] = min(shbuf[threadIdx.x + k], shbuf[threadIdx.x]);
+        }
+        j = k;
+        __syncthreads();
+      } while (j > 1);  // thread winner for this k is in shbuf[0]
+
+      if (threadIdx.x == 0) {
+        best_change[kmin] = shbuf[0];  // store best result in shared
+      }
+      __syncthreads();
+
+      if (minchange == shbuf[0]) {  // My thread is as good as the winner
+        shbuf[1] = threadIdx.x;     // store thread ID in shbuf[1]
+        __threadfence_block();
+      }
+      __syncthreads();
+
+      if (threadIdx.x == shbuf[1]) {  // move from thread local to shared
+        best_i[kmin] = mini;          // store swap targets
+        best_j[kmin] = minj;
+        __threadfence_block();
+      }
+      __syncthreads();
+      // look for the best compatible move
+    }  // end loop over kmin
+    minchange = best_change[0];
+    myswaps += 1;
+    for (int kmin = 0; kmin < kswaps_active; kmin++) {
+      int sum = best_i[kmin] + best_j[kmin] + 1;  // = mini + minj +1
+      // this is a reversal of all nodes included in the range [ i+1, j ]
+      for (int i = threadIdx.x; (i + i) < sum; i += blockDim.x) {
+        if (best_i[kmin] < i) {
+          int j = sum - i;
+          raft::swapVals(px[i], px[j]);
+          raft::swapVals(py[i], py[j]);
+          raft::swapVals(path[i], path[j]);
+        }
+      }
+      __syncthreads();
+    }
+  } while (minchange < 0 && myswaps < 2 * nodes);
+}
+
+__device__ void get_optimal_tour(
+  int *mylock, int *best_tour, float *px, float *py, int *path, int *shbuf, int const nodes)
+{
+  // Now find actual length of the last tour, result of the climb
+  int term = 0;
+  for (int i = threadIdx.x; i < nodes; i += blockDim.x) {
+    term += __float2int_rn(euclidean_dist(px, py, i, i + 1));
+  }
+  shbuf[threadIdx.x] = term;
+  __syncthreads();
+
+  int j = blockDim.x;  // block level reduction
+  do {
+    int k = (j + 1) / 2;
+    if ((threadIdx.x + k) < j) { shbuf[threadIdx.x] += shbuf[threadIdx.x + k]; }
+    j = k;  // divide active warp size in half
+    __syncthreads();
+  } while (j > 1);
+  term = shbuf[0];
+
+  if (threadIdx.x == 0) {
+    atomicMin(best_tour, term);
+    while (atomicExch(mylock, 1) != 0)
+      ;  // acquire
+    if (best_tour[0] == term) {
+      best_soln  = px;
+      best_route = path;
+    }
+    *mylock = 0;  // release
+    __threadfence();
+  }
+}
+
+__global__ __launch_bounds__(2048, 2) void search_solution(int *mylock,
+                                                           int *best_tour,
+                                                           int const *vtx_ptr,
+                                                           bool beam_search,
+                                                           int const K,
+                                                           int nodes,
+                                                           int64_t const *neighbors,
+                                                           float const *posx,
+                                                           float const *posy,
+                                                           int *work,
+                                                           int const nstart,
+                                                           float *times,
+                                                           int *climbs,
+                                                           int threads)
+{
+  int *buf  = &work[blockIdx.x * ((4 * nodes + 3 + 31) / 32 * 32)];
+  float *px = (float *)(&buf[nodes]);
+  float *py = &px[nodes + 1];
+  int *path = (int *)(&py[nodes + 1]);
+  __shared__ int shbuf[tilesize];
+  clock_t start;
+
+  start = clock64();
+  if (!beam_search)
+    random_init(posx, posy, vtx_ptr, path, px, py, nstart, nodes);
+  else
+    knn_init(posx, posy, vtx_ptr, neighbors, buf, path, px, py, nstart, nodes, K);
+  __syncthreads();
+  times[threadIdx.x] = clock64() - start;
+
+  start = clock64();
+  hill_climbing(px, py, buf, path, shbuf, nodes, climbs);
+  __syncthreads();
+  times[threads + threadIdx.x + 1] = clock64() - start;
+
+  start = clock64();
+  get_optimal_tour(mylock, best_tour, px, py, path, shbuf, nodes);
+  times[2 * threads + threadIdx.x + 1] = clock64() - start;
+}
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/traversal/tsp_utils.hpp b/cpp/src/traversal/tsp_utils.hpp
new file mode 100644
index 00000000000..3faa2efea3b
--- /dev/null
+++ b/cpp/src/traversal/tsp_utils.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define tilesize 128
+#define kswaps 4
+
+#include <sys/time.h>
+#include <string>
+#include <vector>
+
+namespace cugraph {
+namespace detail {
+
+__host__ __device__ inline float euclidean_dist(float *px, float *py, int a, int b)
+{
+  return sqrtf((px[a] - px[b]) * (px[a] - px[b]) + (py[a] - py[b]) * (py[a] - py[b]));
+}
+
+static std::vector<std::string> device_func = {"Find First", "Hill Climbing", "Retrieve Path"};
+
+void print_times(std::vector<float> &h_times, int const n_timers, int device, int threads)
+{
+  int clock_rate;
+  cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, device);
+
+  double total = 0;
+  h_times[0] /= (float)clock_rate;
+  total += h_times[0];
+  for (int i = 1; i < n_timers; ++i) {
+    h_times[i * threads + 1] /= (float)clock_rate;
+    total += h_times[i * threads + 1];
+  }
+  std::cout << "Stats: \n";
+  std::cout << device_func[0] << " time: " << h_times[0] * 1e-3 << " "
+            << (h_times[0] / total) * 100.0 << "%\n";
+  for (int i = 1; i < n_timers; ++i) {
+    std::cout << device_func[i] << " time: " << h_times[i * threads + 1] * 1e-3 << " "
+              << (h_times[i * threads + 1] / total) * 100.0 << "%\n";
+  }
+}
+
+// Get maximum number of threads we can run on based on number of nodes,
+// shared memory usage, max threads per block and SM, max blocks for SM and registers per SM.
+int best_thread_count(int nodes, int max_threads, int sm_count, int warp_size)
+{
+  int smem, blocks, thr, perf;
+  int const max_threads_sm = 2048;
+  int max                  = nodes - 2;
+  int best                 = 0;
+  int bthr                 = 4;
+
+  if (max > max_threads) max = max_threads;
+
+  for (int threads = 1; threads <= max; ++threads) {
+    smem   = sizeof(int) * threads + 2 * sizeof(float) * tilesize + sizeof(int) * tilesize;
+    blocks = (16384 * 2) / smem;
+    if (blocks > sm_count) blocks = sm_count;
+    thr = (threads + warp_size - 1) / warp_size * warp_size;
+    while (blocks * thr > max_threads_sm) blocks--;
+    perf = threads * blocks;
+    if (perf > best) {
+      best = perf;
+      bthr = threads;
+    }
+  }
+
+  return bthr;
+}
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index 6c8ef98e2e2..e95a001cb91 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,16 +15,20 @@
  */
 
 #include <algorithms.hpp>
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph_functions.hpp>
 #include <experimental/graph_view.hpp>
 #include <graph.hpp>
 #include <partition_manager.hpp>
 #include <raft/handle.hpp>
 #include <utilities/cython.hpp>
 #include <utilities/error.hpp>
+#include <utilities/shuffle_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
 #include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 
 namespace cugraph {
 namespace cython {
@@ -86,7 +90,6 @@ create_graph(raft::handle_t const& handle, graph_container_t const& graph_contai
     reinterpret_cast<vertex_t*>(graph_container.dst_vertices),
     reinterpret_cast<weight_t*>(graph_container.weights),
     static_cast<edge_t>(graph_container.num_partition_edges)};
-
   return std::make_unique<experimental::graph_t<vertex_t, edge_t, weight_t, transposed, multi_gpu>>(
     handle,
     edgelist,
@@ -123,12 +126,18 @@ void populate_graph_container(graph_container_t& graph_container,
   bool do_expensive_check{true};
   bool hypergraph_partitioned{false};
 
-  auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
-  auto const row_comm_rank = row_comm.get_rank();
-  auto const row_comm_size = row_comm.get_size();  // pcols
-  auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
-  auto const col_comm_rank = col_comm.get_rank();
-  auto const col_comm_size = col_comm.get_size();  // prows
+  if (multi_gpu) {
+    auto& row_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+    auto const row_comm_rank = row_comm.get_rank();
+    auto const row_comm_size = row_comm.get_size();  // pcols
+    auto& col_comm           = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+    auto const col_comm_rank = col_comm.get_rank();
+    auto const col_comm_size = col_comm.get_size();  // prows
+    graph_container.row_comm_size = row_comm_size;
+    graph_container.col_comm_size = col_comm_size;
+    graph_container.row_comm_rank = row_comm_rank;
+    graph_container.col_comm_rank = col_comm_rank;
+  }
 
   graph_container.vertex_partition_offsets = vertex_partition_offsets;
   graph_container.src_vertices             = src_vertices;
@@ -143,10 +152,6 @@ void populate_graph_container(graph_container_t& graph_container,
   graph_container.transposed               = transposed;
   graph_container.is_multi_gpu             = multi_gpu;
   graph_container.hypergraph_partitioned   = hypergraph_partitioned;
-  graph_container.row_comm_size            = row_comm_size;
-  graph_container.col_comm_size            = col_comm_size;
-  graph_container.row_comm_rank            = row_comm_rank;
-  graph_container.col_comm_rank            = col_comm_rank;
   graph_container.sorted_by_degree         = sorted_by_degree;
   graph_container.do_expensive_check       = do_expensive_check;
 
@@ -463,33 +468,7 @@ void call_pagerank(raft::handle_t const& handle,
                    int64_t max_iter,
                    bool has_guess)
 {
-  if (graph_container.graph_type == graphTypeEnum::GraphCSCViewFloat) {
-    pagerank(handle,
-             *(graph_container.graph_ptr_union.GraphCSCViewFloatPtr),
-             reinterpret_cast<float*>(p_pagerank),
-             static_cast<int32_t>(personalization_subset_size),
-             reinterpret_cast<int32_t*>(personalization_subset),
-             reinterpret_cast<float*>(personalization_values),
-             alpha,
-             tolerance,
-             max_iter,
-             has_guess);
-    graph_container.graph_ptr_union.GraphCSCViewFloatPtr->get_vertex_identifiers(
-      reinterpret_cast<int32_t*>(identifiers));
-  } else if (graph_container.graph_type == graphTypeEnum::GraphCSCViewDouble) {
-    pagerank(handle,
-             *(graph_container.graph_ptr_union.GraphCSCViewDoublePtr),
-             reinterpret_cast<double*>(p_pagerank),
-             static_cast<int32_t>(personalization_subset_size),
-             reinterpret_cast<int32_t*>(personalization_subset),
-             reinterpret_cast<double*>(personalization_values),
-             alpha,
-             tolerance,
-             max_iter,
-             has_guess);
-    graph_container.graph_ptr_union.GraphCSCViewDoublePtr->get_vertex_identifiers(
-      reinterpret_cast<int32_t*>(identifiers));
-  } else if (graph_container.graph_type == graphTypeEnum::graph_t) {
+  if (graph_container.is_multi_gpu) {
     if (graph_container.edgeType == numberTypeEnum::int32Type) {
       auto graph =
         detail::create_graph<int32_t, int32_t, weight_t, true, true>(handle, graph_container);
@@ -504,7 +483,7 @@ void call_pagerank(raft::handle_t const& handle,
                                       static_cast<weight_t>(tolerance),
                                       max_iter,
                                       has_guess,
-                                      false);
+                                      true);
     } else if (graph_container.edgeType == numberTypeEnum::int64Type) {
       auto graph =
         detail::create_graph<vertex_t, int64_t, weight_t, true, true>(handle, graph_container);
@@ -519,9 +498,39 @@ void call_pagerank(raft::handle_t const& handle,
                                       static_cast<weight_t>(tolerance),
                                       max_iter,
                                       has_guess,
-                                      false);
-    } else {
-      CUGRAPH_FAIL("vertexType/edgeType combination unsupported");
+                                      true);
+    }
+  } else {
+    if (graph_container.edgeType == numberTypeEnum::int32Type) {
+      auto graph =
+        detail::create_graph<int32_t, int32_t, weight_t, true, false>(handle, graph_container);
+      cugraph::experimental::pagerank(handle,
+                                      graph->view(),
+                                      static_cast<weight_t*>(nullptr),
+                                      reinterpret_cast<int32_t*>(personalization_subset),
+                                      reinterpret_cast<weight_t*>(personalization_values),
+                                      static_cast<int32_t>(personalization_subset_size),
+                                      reinterpret_cast<weight_t*>(p_pagerank),
+                                      static_cast<weight_t>(alpha),
+                                      static_cast<weight_t>(tolerance),
+                                      max_iter,
+                                      has_guess,
+                                      true);
+    } else if (graph_container.edgeType == numberTypeEnum::int64Type) {
+      auto graph =
+        detail::create_graph<vertex_t, int64_t, weight_t, true, false>(handle, graph_container);
+      cugraph::experimental::pagerank(handle,
+                                      graph->view(),
+                                      static_cast<weight_t*>(nullptr),
+                                      reinterpret_cast<vertex_t*>(personalization_subset),
+                                      reinterpret_cast<weight_t*>(personalization_values),
+                                      static_cast<vertex_t>(personalization_subset_size),
+                                      reinterpret_cast<weight_t*>(p_pagerank),
+                                      static_cast<weight_t>(alpha),
+                                      static_cast<weight_t>(tolerance),
+                                      max_iter,
+                                      has_guess,
+                                      true);
     }
   }
 }
@@ -638,6 +647,55 @@ void call_bfs(raft::handle_t const& handle,
   }
 }
 
+// Wrapper for calling extract_egonet through a graph container
+// FIXME : this should not be a legacy COO and it is not clear how to handle C++ api return type as
+// is.graph_container Need to figure out how to return edge lists
+template <typename vertex_t, typename weight_t>
+std::unique_ptr<cy_multi_edgelists_t> call_egonet(raft::handle_t const& handle,
+                                                  graph_container_t const& graph_container,
+                                                  vertex_t* source_vertex,
+                                                  vertex_t n_subgraphs,
+                                                  vertex_t radius)
+{
+  if (graph_container.edgeType == numberTypeEnum::int32Type) {
+    auto graph =
+      detail::create_graph<int32_t, int32_t, weight_t, false, false>(handle, graph_container);
+    auto g = cugraph::experimental::extract_ego(handle,
+                                                graph->view(),
+                                                reinterpret_cast<int32_t*>(source_vertex),
+                                                static_cast<int32_t>(n_subgraphs),
+                                                static_cast<int32_t>(radius));
+    cy_multi_edgelists_t coo_contents{
+      0,  // not used
+      std::get<0>(g).size(),
+      static_cast<size_t>(n_subgraphs),
+      std::make_unique<rmm::device_buffer>(std::get<0>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<1>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<2>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<3>(g).release())};
+    return std::make_unique<cy_multi_edgelists_t>(std::move(coo_contents));
+  } else if (graph_container.edgeType == numberTypeEnum::int64Type) {
+    auto graph =
+      detail::create_graph<vertex_t, int64_t, weight_t, false, false>(handle, graph_container);
+    auto g = cugraph::experimental::extract_ego(handle,
+                                                graph->view(),
+                                                reinterpret_cast<vertex_t*>(source_vertex),
+                                                static_cast<vertex_t>(n_subgraphs),
+                                                static_cast<vertex_t>(radius));
+    cy_multi_edgelists_t coo_contents{
+      0,  // not used
+      std::get<0>(g).size(),
+      static_cast<size_t>(n_subgraphs),
+      std::make_unique<rmm::device_buffer>(std::get<0>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<1>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<2>(g).release()),
+      std::make_unique<rmm::device_buffer>(std::get<3>(g).release())};
+    return std::make_unique<cy_multi_edgelists_t>(std::move(coo_contents));
+  } else {
+    CUGRAPH_FAIL("vertexType/edgeType combination unsupported");
+  }
+}
+
 // Wrapper for calling SSSP through a graph container
 template <typename vertex_t, typename weight_t>
 void call_sssp(raft::handle_t const& handle,
@@ -686,6 +744,101 @@ void call_sssp(raft::handle_t const& handle,
   }
 }
 
+// wrapper for shuffling:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::unique_ptr<major_minor_weights_t<vertex_t, weight_t>> call_shuffle(
+  raft::handle_t const& handle,
+  vertex_t*
+    edgelist_major_vertices,  // [IN / OUT]: groupby_gpuid_and_shuffle_values() sorts in-place
+  vertex_t* edgelist_minor_vertices,  // [IN / OUT]
+  weight_t* edgelist_weights,         // [IN / OUT]
+  edge_t num_edgelist_edges,
+  bool is_hypergraph_partitioned)  // = false
+{
+  auto& comm = handle.get_comms();
+
+  auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+
+  auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+
+  auto zip_edge = thrust::make_zip_iterator(
+    thrust::make_tuple(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights));
+
+  std::unique_ptr<major_minor_weights_t<vertex_t, weight_t>> ptr_ret =
+    std::make_unique<major_minor_weights_t<vertex_t, weight_t>>(handle);
+
+  std::forward_as_tuple(
+    std::tie(ptr_ret->get_major(), ptr_ret->get_minor(), ptr_ret->get_weights()),
+    std::ignore) =
+    cugraph::experimental::groupby_gpuid_and_shuffle_values(
+      comm,  // handle.get_comms(),
+      zip_edge,
+      zip_edge + num_edgelist_edges,
+      [key_func =
+         cugraph::experimental::detail::compute_gpu_id_from_edge_t<vertex_t>{
+           is_hypergraph_partitioned,
+           comm.get_size(),
+           row_comm.get_size(),
+           col_comm.get_size()}] __device__(auto val) {
+        return key_func(thrust::get<0>(val), thrust::get<1>(val));
+      },
+      handle.get_stream());
+
+  return ptr_ret;  // RVO-ed
+}
+
+// Wrapper for calling renumber_edeglist() inplace:
+// TODO: check if return type needs further handling...
+//
+template <typename vertex_t, typename edge_t>
+std::unique_ptr<renum_quad_t<vertex_t, edge_t>> call_renumber(
+  raft::handle_t const& handle,
+  vertex_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  vertex_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  edge_t num_edgelist_edges,
+  bool is_hypergraph_partitioned,
+  bool do_expensive_check,
+  bool multi_gpu)  // bc. cython cannot take non-type template params
+{
+  // caveat: return values have different types on the 2 branches below:
+  //
+  std::unique_ptr<renum_quad_t<vertex_t, edge_t>> p_ret =
+    std::make_unique<renum_quad_t<vertex_t, edge_t>>(handle);
+
+  if (multi_gpu) {
+    std::tie(
+      p_ret->get_dv(), p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges()) =
+      cugraph::experimental::renumber_edgelist<vertex_t, edge_t, true>(
+        handle,
+        shuffled_edgelist_major_vertices,
+        shuffled_edgelist_minor_vertices,
+        num_edgelist_edges,
+        is_hypergraph_partitioned,
+        do_expensive_check);
+  } else {
+    auto ret_f = cugraph::experimental::renumber_edgelist<vertex_t, edge_t, false>(
+      handle,
+      shuffled_edgelist_major_vertices,
+      shuffled_edgelist_minor_vertices,
+      num_edgelist_edges,
+      do_expensive_check);
+
+    auto tot_vertices = static_cast<vertex_t>(ret_f.size());
+
+    p_ret->get_dv() = std::move(ret_f);
+    cugraph::experimental::partition_t<vertex_t> part_sg(
+      std::vector<vertex_t>{0, tot_vertices}, false, 1, 1, 0, 0);
+
+    p_ret->get_partition() = std::move(part_sg);
+
+    p_ret->get_num_vertices() = tot_vertices;
+    p_ret->get_num_edges()    = num_edgelist_edges;
+  }
+
+  return p_ret;  // RVO-ed (copy ellision)
+}
+
 // Helper for setting up subcommunicators
 void init_subcomms(raft::handle_t& handle, size_t row_comm_size)
 {
@@ -836,6 +989,33 @@ template void call_bfs<int64_t, double>(raft::handle_t const& handle,
                                         double* sp_counters,
                                         const int64_t start_vertex,
                                         bool directed);
+template std::unique_ptr<cy_multi_edgelists_t> call_egonet<int32_t, float>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int32_t* source_vertex,
+  int32_t n_subgraphs,
+  int32_t radius);
+
+template std::unique_ptr<cy_multi_edgelists_t> call_egonet<int32_t, double>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int32_t* source_vertex,
+  int32_t n_subgraphs,
+  int32_t radius);
+
+template std::unique_ptr<cy_multi_edgelists_t> call_egonet<int64_t, float>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int64_t* source_vertex,
+  int64_t n_subgraphs,
+  int64_t radius);
+
+template std::unique_ptr<cy_multi_edgelists_t> call_egonet<int64_t, double>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int64_t* source_vertex,
+  int64_t n_subgraphs,
+  int64_t radius);
 
 template void call_sssp(raft::handle_t const& handle,
                         graph_container_t const& graph_container,
@@ -865,5 +1045,82 @@ template void call_sssp(raft::handle_t const& handle,
                         int64_t* predecessors,
                         const int64_t source_vertex);
 
+template std::unique_ptr<major_minor_weights_t<int32_t, float>> call_shuffle(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices,
+  int32_t* edgelist_minor_vertices,
+  float* edgelist_weights,
+  int32_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+template std::unique_ptr<major_minor_weights_t<int32_t, float>> call_shuffle(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices,
+  int32_t* edgelist_minor_vertices,
+  float* edgelist_weights,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+template std::unique_ptr<major_minor_weights_t<int32_t, double>> call_shuffle(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices,
+  int32_t* edgelist_minor_vertices,
+  double* edgelist_weights,
+  int32_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+template std::unique_ptr<major_minor_weights_t<int32_t, double>> call_shuffle(
+  raft::handle_t const& handle,
+  int32_t* edgelist_major_vertices,
+  int32_t* edgelist_minor_vertices,
+  double* edgelist_weights,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+template std::unique_ptr<major_minor_weights_t<int64_t, float>> call_shuffle(
+  raft::handle_t const& handle,
+  int64_t* edgelist_major_vertices,
+  int64_t* edgelist_minor_vertices,
+  float* edgelist_weights,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+template std::unique_ptr<major_minor_weights_t<int64_t, double>> call_shuffle(
+  raft::handle_t const& handle,
+  int64_t* edgelist_major_vertices,
+  int64_t* edgelist_minor_vertices,
+  double* edgelist_weights,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned);
+
+// TODO: add the remaining relevant EIDIr's:
+//
+template std::unique_ptr<renum_quad_t<int32_t, int32_t>> call_renumber(
+  raft::handle_t const& handle,
+  int32_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  int32_t num_edgelist_edges,
+  bool is_hypergraph_partitioned,
+  bool do_expensive_check,
+  bool multi_gpu);
+
+template std::unique_ptr<renum_quad_t<int32_t, int64_t>> call_renumber(
+  raft::handle_t const& handle,
+  int32_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  int32_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned,
+  bool do_expensive_check,
+  bool multi_gpu);
+
+template std::unique_ptr<renum_quad_t<int64_t, int64_t>> call_renumber(
+  raft::handle_t const& handle,
+  int64_t* shuffled_edgelist_major_vertices /* [INOUT] */,
+  int64_t* shuffled_edgelist_minor_vertices /* [INOUT] */,
+  int64_t num_edgelist_edges,
+  bool is_hypergraph_partitioned,
+  bool do_expensive_check,
+  bool multi_gpu);
+
 }  // namespace cython
 }  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 9b57ad4557c..5425c68e896 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
 ﻿#=============================================================================
 #
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 ###################################################################################################
 # - compiler function -----------------------------------------------------------------------------
 
-function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS)
+function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC)
     add_executable(${CMAKE_TEST_NAME}
         ${CMAKE_TEST_SRC})
 
@@ -30,21 +30,36 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS)
 	"${CUCO_INCLUDE_DIR}"
 	"${LIBCUDACXX_INCLUDE_DIR}"
         "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
-        "${GTEST_INCLUDE_DIR}"
         "${RMM_INCLUDE}"
         "${CUDF_INCLUDE}"
         "${CUDF_INCLUDE}/libcudf/libcudacxx"
         "${NCCL_INCLUDE_DIRS}"
-        "${CMAKE_SOURCE_DIR}/../thirdparty/mmio"
-        "${CMAKE_SOURCE_DIR}/include"
-        "${CMAKE_SOURCE_DIR}/src"
+        "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio"
+        "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+        "${CMAKE_CURRENT_SOURCE_DIR}/../src"
         "${CMAKE_CURRENT_SOURCE_DIR}"
         "${RAFT_DIR}/cpp/include"
     )
 
+    target_link_directories(${CMAKE_TEST_NAME}
+        PRIVATE
+        # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported
+        # variable containing the link directories for nvcc.
+        "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+
     target_link_libraries(${CMAKE_TEST_NAME}
         PRIVATE
-        gtest gmock_main gmock cugraph ${CUDF_LIBRARY} ${CMAKE_EXTRA_LIBS}  ${NCCL_LIBRARIES} cudart cuda cublas cusparse cusolver curand)
+        cugraph
+        GTest::GTest
+        GTest::Main
+        ${CUDF_LIBRARY}
+        ${NCCL_LIBRARIES}
+        cudart
+        cuda
+        cublas
+        cusparse
+        cusolver
+        curand)
 
     if(OpenMP_CXX_FOUND)
         target_link_libraries(${CMAKE_TEST_NAME} PRIVATE
@@ -96,8 +111,17 @@ function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC CMAKE_EXTRA_LIBS)
 ###       BUILD_RPATH "${TARGET_BUILD_RPATH}")
         ${OpenMP_CXX_LIB_NAMES})
     endif(OpenMP_CXX_FOUND)
+
+    # CUDA_ARCHITECTURES=OFF implies cmake will not pass arch flags to the
+    # compiler. CUDA_ARCHITECTURES must be set to a non-empty value to prevent
+    # cmake warnings about policy CMP0104. With this setting, arch flags must be
+    # manually set! ("evaluate_gpu_archs(GPU_ARCHS)" is the current mechanism
+    # used in cpp/CMakeLists.txt for setting arch options).
+    # Run "cmake --help-policy CMP0104" for policy details.
+    # NOTE: the CUDA_ARCHITECTURES=OFF setting may be removed after migrating to
+    # the findcudatoolkit features in cmake 3.17+
     set_target_properties(${CMAKE_TEST_NAME} PROPERTIES
-        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests/")
+        CUDA_ARCHITECTURES OFF)
 
     add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME})
 endfunction()
@@ -116,206 +140,288 @@ endif(RAPIDS_DATASET_ROOT_DIR)
 ### test sources ##################################################################################
 ###################################################################################################
 
+# FIXME: consider adding a "add_library(cugraph_testing SHARED ...) instead of
+# adding the same test utility sources to each test target. There may need to be
+# an additional cugraph_mg_testing lib due to the optional inclusion of MPI.
+
 ###################################################################################################
 # - katz centrality tests -------------------------------------------------------------------------
 
 set(KATZ_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/centrality/katz_centrality_test.cu")
 
-  ConfigureTest(KATZ_TEST "${KATZ_TEST_SRC}" "")
+  ConfigureTest(KATZ_TEST "${KATZ_TEST_SRC}")
 
 ###################################################################################################
 # - betweenness centrality tests ------------------------------------------------------------------
 
 set(BETWEENNESS_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/centrality/betweenness_centrality_test.cu")
 
-  ConfigureTest(BETWEENNESS_TEST "${BETWEENNESS_TEST_SRC}" "")
+  ConfigureTest(BETWEENNESS_TEST "${BETWEENNESS_TEST_SRC}")
 
 set(EDGE_BETWEENNESS_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/centrality/edge_betweenness_centrality_test.cu")
 
-  ConfigureTest(EDGE_BETWEENNESS_TEST "${EDGE_BETWEENNESS_TEST_SRC}" "")
-
-###################################################################################################
-# - pagerank tests --------------------------------------------------------------------------------
-
-set(PAGERANK_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
-    "${CMAKE_CURRENT_SOURCE_DIR}/pagerank/pagerank_test.cpp")
-
-ConfigureTest(PAGERANK_TEST "${PAGERANK_TEST_SRC}" "")
+  ConfigureTest(EDGE_BETWEENNESS_TEST "${EDGE_BETWEENNESS_TEST_SRC}")
 
 ###################################################################################################
 # - SSSP tests ------------------------------------------------------------------------------------
 
 set(SSSP_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/traversal/sssp_test.cu")
 
-ConfigureTest(SSSP_TEST "${SSSP_TEST_SRCS}" "")
+ConfigureTest(SSSP_TEST "${SSSP_TEST_SRCS}")
 
 ###################################################################################################
 # - BFS tests -------------------------------------------------------------------------------------
 
 set(BFS_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/traversal/bfs_test.cu")
 
-ConfigureTest(BFS_TEST "${BFS_TEST_SRCS}" "")
+ConfigureTest(BFS_TEST "${BFS_TEST_SRCS}")
 
 ###################################################################################################
 # - LOUVAIN tests ---------------------------------------------------------------------------------
 
 set(LOUVAIN_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
-    "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_test.cu")
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_test.cpp")
 
-ConfigureTest(LOUVAIN_TEST "${LOUVAIN_TEST_SRC}" "")
+ConfigureTest(LOUVAIN_TEST "${LOUVAIN_TEST_SRC}")
 
 ###################################################################################################
 # - LEIDEN tests ---------------------------------------------------------------------------------
 
 set(LEIDEN_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/community/leiden_test.cpp")
 
-ConfigureTest(LEIDEN_TEST "${LEIDEN_TEST_SRC}" "")
+ConfigureTest(LEIDEN_TEST "${LEIDEN_TEST_SRC}")
 
 ###################################################################################################
 # - ECG tests ---------------------------------------------------------------------------------
 
 set(ECG_TEST_SRC
-    "${CMAKE_CURRENT_SOURCE_DIR}/community/ecg_test.cu")
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/community/ecg_test.cpp")
 
-ConfigureTest(ECG_TEST "${ECG_TEST_SRC}" "")
+ConfigureTest(ECG_TEST "${ECG_TEST_SRC}")
 
 ###################################################################################################
 # - Balanced cut clustering tests -----------------------------------------------------------------
 
 set(BALANCED_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/community/balanced_edge_test.cpp")
 
-ConfigureTest(BALANCED_TEST "${BALANCED_TEST_SRC}" "")
+ConfigureTest(BALANCED_TEST "${BALANCED_TEST_SRC}")
 
 ###################################################################################################
 # - TRIANGLE tests --------------------------------------------------------------------------------
 
 set(TRIANGLE_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/community/triangle_test.cu")
 
-ConfigureTest(TRIANGLE_TEST "${TRIANGLE_TEST_SRC}" "")
+ConfigureTest(TRIANGLE_TEST "${TRIANGLE_TEST_SRC}")
 
+###################################################################################################
+# - EGO tests --------------------------------------------------------------------------------
+
+set(EGO_TEST_SRC
+    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/community/egonet_test.cu")
+
+ConfigureTest(EGO_TEST "${EGO_TEST_SRC}" "")
 ###################################################################################################
 # - RENUMBERING tests -----------------------------------------------------------------------------
 
 set(RENUMBERING_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/renumber/renumber_test.cu")
 
-ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}" "")
+ConfigureTest(RENUMBERING_TEST "${RENUMBERING_TEST_SRC}")
 
 ###################################################################################################
 # - FORCE ATLAS 2  tests --------------------------------------------------------------------------
 
 set(FA2_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/layout/force_atlas2_test.cu")
 
-ConfigureTest(FA2_TEST "${FA2_TEST_SRC}" "")
+ConfigureTest(FA2_TEST "${FA2_TEST_SRC}")
+
+###################################################################################################
+# - TSP  tests --------------------------------------------------------------------------
+
+set(TSP_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/traversal/tsp_test.cu")
+
+  ConfigureTest(TSP_TEST "${TSP_TEST_SRC}" "")
 
 ###################################################################################################
 # - CONNECTED COMPONENTS  tests -------------------------------------------------------------------
 
 set(CONNECT_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/components/con_comp_test.cu")
 
-ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}" "")
+ConfigureTest(CONNECT_TEST "${CONNECT_TEST_SRC}")
 
 ###################################################################################################
 # - STRONGLY CONNECTED COMPONENTS  tests ----------------------------------------------------------
 
 set(SCC_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/components/scc_test.cu")
 
-ConfigureTest(SCC_TEST "${SCC_TEST_SRC}" "")
+ConfigureTest(SCC_TEST "${SCC_TEST_SRC}")
 
 ###################################################################################################
 #-Hungarian (Linear Assignment Problem)  tests ---------------------------------------------------------------------
 
 set(HUNGARIAN_TEST_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/linear_assignment/hungarian_test.cu")
 
-ConfigureTest(HUNGARIAN_TEST "${HUNGARIAN_TEST_SRC}" "")
+ConfigureTest(HUNGARIAN_TEST "${HUNGARIAN_TEST_SRC}")
 
 ###################################################################################################
 # - MST tests ----------------------------------------------------------------------------
 
 set(MST_TEST_SRC
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/tree/mst_test.cu")
 
-ConfigureTest(MST_TEST "${MST_TEST_SRC}" "")
+ConfigureTest(MST_TEST "${MST_TEST_SRC}")
 
 
 ###################################################################################################
 # - Experimental Graph tests ----------------------------------------------------------------------
 
 set(EXPERIMENTAL_GRAPH_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/graph_test.cpp")
 
-ConfigureTest(EXPERIMENTAL_GRAPH_TEST "${EXPERIMENTAL_GRAPH_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_GRAPH_TEST "${EXPERIMENTAL_GRAPH_TEST_SRCS}")
+
+###################################################################################################
+# - Experimental coarsening tests -----------------------------------------------------------------
+
+set(EXPERIMENTAL_COARSEN_GRAPH_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/coarsen_graph_test.cpp")
+
+ConfigureTest(EXPERIMENTAL_COARSEN_GRAPH_TEST "${EXPERIMENTAL_COARSEN_GRAPH_TEST_SRCS}")
+
+###################################################################################################
+# - Experimental induced subgraph tests -----------------------------------------------------------
+
+set(EXPERIMENTAL_INDUCED_SUBGRAPH_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/induced_subgraph_test.cpp")
+
+ConfigureTest(EXPERIMENTAL_INDUCED_SUBGRAPH_TEST "${EXPERIMENTAL_INDUCED_SUBGRAPH_TEST_SRCS}")
 
 ###################################################################################################
 # - Experimental BFS tests ------------------------------------------------------------------------
 
 set(EXPERIMENTAL_BFS_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/bfs_test.cpp")
 
-ConfigureTest(EXPERIMENTAL_BFS_TEST "${EXPERIMENTAL_BFS_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_BFS_TEST "${EXPERIMENTAL_BFS_TEST_SRCS}")
 
 ###################################################################################################
 # - Experimental SSSP tests -----------------------------------------------------------------------
 
 set(EXPERIMENTAL_SSSP_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/sssp_test.cpp")
 
-ConfigureTest(EXPERIMENTAL_SSSP_TEST "${EXPERIMENTAL_SSSP_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_SSSP_TEST "${EXPERIMENTAL_SSSP_TEST_SRCS}")
 
 ###################################################################################################
 # - Experimental PAGERANK tests -------------------------------------------------------------------
 
 set(EXPERIMENTAL_PAGERANK_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/pagerank_test.cpp")
 
-ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}")
 
 ###################################################################################################
 # - Experimental LOUVAIN tests -------------------------------------------------------------------
 
 set(EXPERIMENTAL_LOUVAIN_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/louvain_test.cu")
 
-ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}")
 
 ###################################################################################################
 # - Experimental KATZ_CENTRALITY tests ------------------------------------------------------------
 
 set(EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS
-    "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/experimental/katz_centrality_test.cpp")
 
-ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}" "")
+ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}")
+
+
+###################################################################################################
+# - MG tests --------------------------------------------------------------------------------------
+if(BUILD_CUGRAPH_MG_TESTS)
+    if(MPI_CXX_FOUND)
+        ###########################################################################################
+        # - MG PAGERANK tests ---------------------------------------------------------------------
+
+        set(MG_PAGERANK_TEST_SRCS
+            "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c"
+            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp"
+            "${CMAKE_CURRENT_SOURCE_DIR}/utilities/mg_test_utilities.cu"
+            "${CMAKE_CURRENT_SOURCE_DIR}/pagerank/pagerank_mg_test.cpp")
+
+        ConfigureTest(MG_PAGERANK_TEST "${MG_PAGERANK_TEST_SRCS}")
+        target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX)
+
+    else(MPI_CXX_FOUND)
+       message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.")
+    endif(MPI_CXX_FOUND)
+endif(BUILD_CUGRAPH_MG_TESTS)
 
 ###################################################################################################
 ### enable testing ################################################################################
diff --git a/cpp/tests/README.md b/cpp/tests/README.md
new file mode 100644
index 00000000000..b5808822467
--- /dev/null
+++ b/cpp/tests/README.md
@@ -0,0 +1,31 @@
+# libcugraph C++ tests
+
+## Prerequisites
+### Datasets
+```
+/path/to/cuGraph> ./datasets/get_test_data.sh
+/path/to/cuGraph> export RAPIDS_DATASET_ROOT_DIR=/path/to/cuGraph/datasets
+```
+### System Requirements
+* MPI (multi-GPU tests only)
+   ```
+   conda install -c conda-forge openmpi
+   ```
+
+## Building
+```
+/path/to/cuGraph> ./build.sh libcugraph
+```
+To build the multi-GPU tests:
+```
+/path/to/cuGraph> ./build.sh libcugraph cpp-mgtests
+```
+
+## Running
+```
+<example here>
+```
+To run the multi-GPU tests (example using 2 GPUs):
+```
+/path/to/cuGraph> mpirun -n 2 ./cpp/build/gtests/MG_PAGERANK_TEST
+```
diff --git a/cpp/tests/community/ecg_test.cu b/cpp/tests/community/ecg_test.cpp
similarity index 73%
rename from cpp/tests/community/ecg_test.cu
rename to cpp/tests/community/ecg_test.cpp
index 85b80b1610b..a13ee2fe360 100644
--- a/cpp/tests/community/ecg_test.cu
+++ b/cpp/tests/community/ecg_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -77,6 +77,10 @@ TEST(ecg, success)
 
 TEST(ecg, dolphin)
 {
+  raft::handle_t handle;
+
+  auto stream = handle.get_stream();
+
   std::vector<int> off_h = {0,   6,   14,  18,  21,  22,  26,  32,  37,  43,  50,  55,  56,
                             57,  65,  77,  84,  90,  99,  106, 110, 119, 125, 126, 129, 135,
                             138, 141, 146, 151, 160, 165, 166, 169, 179, 184, 185, 192, 203,
@@ -103,38 +107,55 @@ TEST(ecg, dolphin)
   int num_verts = off_h.size() - 1;
   int num_edges = ind_h.size();
 
-  thrust::host_vector<int> cluster_id(num_verts, -1);
+  std::vector<int> cluster_id(num_verts, -1);
 
-  rmm::device_vector<int> offsets_v(off_h);
-  rmm::device_vector<int> indices_v(ind_h);
-  rmm::device_vector<float> weights_v(w_h);
-  rmm::device_vector<int> result_v(cluster_id);
+  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
+  rmm::device_uvector<int> indices_v(num_edges, stream);
+  rmm::device_uvector<float> weights_v(num_edges, stream);
+  rmm::device_uvector<int> result_v(num_verts, stream);
+
+  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
+  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
+  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
 
   cugraph::GraphCSRView<int, int, float> graph_csr(
-    offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges);
+    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
 
-  raft::handle_t handle;
-  cugraph::ecg<int32_t, int32_t, float>(handle, graph_csr, .05, 16, result_v.data().get());
+  // "FIXME": remove this check once we drop support for Pascal
+  //
+  // Calling louvain on Pascal will throw an exception, we'll check that
+  // this is the behavior while we still support Pascal (device_prop.major < 7)
+  //
+  if (handle.get_device_properties().major < 7) {
+    EXPECT_THROW(
+      (cugraph::ecg<int32_t, int32_t, float>(handle, graph_csr, .05, 16, result_v.data())),
+      cugraph::logic_error);
+  } else {
+    cugraph::ecg<int32_t, int32_t, float>(handle, graph_csr, .05, 16, result_v.data());
 
-  cluster_id = result_v;
-  int max    = *max_element(cluster_id.begin(), cluster_id.end());
-  int min    = *min_element(cluster_id.begin(), cluster_id.end());
+    raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream);
 
-  ASSERT_EQ((min >= 0), 1);
+    CUDA_TRY(cudaDeviceSynchronize());
 
-  std::set<int> cluster_ids;
-  for (auto c : cluster_id) { cluster_ids.insert(c); }
+    int max = *max_element(cluster_id.begin(), cluster_id.end());
+    int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  ASSERT_EQ(cluster_ids.size(), size_t(max + 1));
+    ASSERT_EQ((min >= 0), 1);
 
-  float modularity{0.0};
+    std::set<int> cluster_ids;
+    for (auto c : cluster_id) { cluster_ids.insert(c); }
 
-  cugraph::ext_raft::analyzeClustering_modularity(
-    graph_csr, max + 1, result_v.data().get(), &modularity);
+    ASSERT_EQ(cluster_ids.size(), size_t(max + 1));
+
+    float modularity{0.0};
+
+    cugraph::ext_raft::analyzeClustering_modularity(
+      graph_csr, max + 1, result_v.data(), &modularity);
 
-  float random_modularity{0.95 * 0.4962422251701355};
+    float random_modularity{0.95 * 0.4962422251701355};
 
-  ASSERT_GT(modularity, random_modularity);
+    ASSERT_GT(modularity, random_modularity);
+  }
 }
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/egonet_test.cu b/cpp/tests/community/egonet_test.cu
new file mode 100644
index 00000000000..ec031228998
--- /dev/null
+++ b/cpp/tests/community/egonet_test.cu
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <algorithms.hpp>
+#include <experimental/graph.hpp>
+#include <experimental/graph_view.hpp>
+#include <graph.hpp>
+
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <raft/cudart_utils.h>
+#include <rmm/thrust_rmm_allocator.h>
+#include <algorithm>
+#include <tuple>
+#include <vector>
+
+typedef struct InducedEgo_Usecase_t {
+  std::string graph_file_full_path{};
+  std::vector<int32_t> ego_sources{};
+  int32_t radius;
+  bool test_weighted{false};
+
+  InducedEgo_Usecase_t(std::string const& graph_file_path,
+                       std::vector<int32_t> const& ego_sources,
+                       int32_t radius,
+                       bool test_weighted)
+    : ego_sources(ego_sources), radius(radius), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} InducedEgo_Usecase;
+
+class Tests_InducedEgo : public ::testing::TestWithParam<InducedEgo_Usecase> {
+ public:
+  Tests_InducedEgo() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+  void run_current_test(InducedEgo_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    auto graph = cugraph::test::
+      read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, store_transposed>(
+        handle, configuration.graph_file_full_path, configuration.test_weighted);
+    auto graph_view = graph.view();
+
+    rmm::device_uvector<vertex_t> d_ego_sources(configuration.ego_sources.size(),
+                                                handle.get_stream());
+
+    raft::update_device(d_ego_sources.data(),
+                        configuration.ego_sources.data(),
+                        configuration.ego_sources.size(),
+                        handle.get_stream());
+
+    rmm::device_uvector<vertex_t> d_ego_edgelist_src(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> d_ego_edgelist_dst(0, handle.get_stream());
+    rmm::device_uvector<weight_t> d_ego_edgelist_weights(0, handle.get_stream());
+    rmm::device_uvector<size_t> d_ego_edge_offsets(0, handle.get_stream());
+
+    std::tie(d_ego_edgelist_src, d_ego_edgelist_dst, d_ego_edgelist_weights, d_ego_edge_offsets) =
+      cugraph::experimental::extract_ego(handle,
+                                         graph_view,
+                                         d_ego_sources.data(),
+                                         static_cast<vertex_t>(configuration.ego_sources.size()),
+                                         configuration.radius);
+
+    std::vector<size_t> h_cugraph_ego_edge_offsets(d_ego_edge_offsets.size());
+    std::vector<vertex_t> h_cugraph_ego_edgelist_src(d_ego_edgelist_src.size());
+    std::vector<vertex_t> h_cugraph_ego_edgelist_dst(d_ego_edgelist_dst.size());
+    raft::update_host(h_cugraph_ego_edgelist_src.data(),
+                      d_ego_edgelist_src.data(),
+                      d_ego_edgelist_src.size(),
+                      handle.get_stream());
+    raft::update_host(h_cugraph_ego_edgelist_dst.data(),
+                      d_ego_edgelist_dst.data(),
+                      d_ego_edgelist_dst.size(),
+                      handle.get_stream());
+    raft::update_host(h_cugraph_ego_edge_offsets.data(),
+                      d_ego_edge_offsets.data(),
+                      d_ego_edge_offsets.size(),
+                      handle.get_stream());
+    ASSERT_TRUE(d_ego_edge_offsets.size() == (configuration.ego_sources.size() + 1));
+    ASSERT_TRUE(d_ego_edgelist_src.size() == d_ego_edgelist_dst.size());
+    if (configuration.test_weighted)
+      ASSERT_TRUE(d_ego_edgelist_src.size() == d_ego_edgelist_weights.size());
+    ASSERT_TRUE(h_cugraph_ego_edge_offsets[configuration.ego_sources.size()] ==
+                d_ego_edgelist_src.size());
+    for (size_t i = 0; i < configuration.ego_sources.size(); i++)
+      ASSERT_TRUE(h_cugraph_ego_edge_offsets[i] < h_cugraph_ego_edge_offsets[i + 1]);
+    auto n_vertices = graph_view.get_number_of_vertices();
+    for (size_t i = 0; i < d_ego_edgelist_src.size(); i++) {
+      ASSERT_TRUE(h_cugraph_ego_edgelist_src[i] >= 0);
+      ASSERT_TRUE(h_cugraph_ego_edgelist_src[i] < n_vertices);
+      ASSERT_TRUE(h_cugraph_ego_edgelist_dst[i] >= 0);
+      ASSERT_TRUE(h_cugraph_ego_edgelist_dst[i] < n_vertices);
+    }
+
+    /*
+    // For inspecting data
+    std::vector<weight_t> h_cugraph_ego_edgelist_weights(d_ego_edgelist_weights.size());
+    if (configuration.test_weighted) {
+      raft::update_host(h_cugraph_ego_edgelist_weights.data(),
+                        d_ego_edgelist_weights.data(),
+                        d_ego_edgelist_weights.size(),
+                        handle.get_stream());
+    }
+    raft::print_host_vector("offsets",
+                            &h_cugraph_ego_edge_offsets[0],
+                            h_cugraph_ego_edge_offsets.size(),
+                            std::cout);
+    raft::print_host_vector("src",
+                            &h_cugraph_ego_edgelist_src[0],
+                            h_cugraph_ego_edgelist_src.size(),
+                            std::cout);
+    raft::print_host_vector("dst",
+                            &h_cugraph_ego_edgelist_dst[0],
+                            h_cugraph_ego_edgelist_dst.size(),
+                            std::cout);
+    raft::print_host_vector("weights",
+                            &h_cugraph_ego_edgelist_weights[0],
+                            h_cugraph_ego_edgelist_weights.size(),
+                            std::cout);
+    */
+  }
+};
+
+TEST_P(Tests_InducedEgo, CheckInt32Int32FloatUntransposed)
+{
+  run_current_test<int32_t, int32_t, float, false>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_InducedEgo,
+  ::testing::Values(
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{0}, 1, false),
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{0}, 2, false),
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{1}, 3, false),
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{10, 0, 5}, 2, false),
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{9, 3, 10}, 2, false),
+    InducedEgo_Usecase("test/datasets/karate.mtx", std::vector<int32_t>{5, 12, 13}, 2, true)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp
index 764ab8bf6cb..9083400f85c 100644
--- a/cpp/tests/community/leiden_test.cpp
+++ b/cpp/tests/community/leiden_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -19,6 +19,10 @@
 
 TEST(leiden_karate, success)
 {
+  raft::handle_t handle;
+
+  auto stream = handle.get_stream();
+
   std::vector<int> off_h = {0,  16,  25,  35,  41,  44,  48,  52,  56,  61,  63, 66,
                             67, 69,  74,  76,  78,  80,  82,  84,  87,  89,  91, 93,
                             98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156};
@@ -46,27 +50,38 @@ TEST(leiden_karate, success)
 
   std::vector<int> cluster_id(num_verts, -1);
 
-  rmm::device_vector<int> offsets_v(off_h);
-  rmm::device_vector<int> indices_v(ind_h);
-  rmm::device_vector<float> weights_v(w_h);
-  rmm::device_vector<int> result_v(cluster_id);
+  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
+  rmm::device_uvector<int> indices_v(num_edges, stream);
+  rmm::device_uvector<float> weights_v(num_edges, stream);
+  rmm::device_uvector<int> result_v(num_verts, stream);
+
+  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
+  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
+  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
 
   cugraph::GraphCSRView<int, int, float> G(
-    offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges);
+    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
 
   float modularity{0.0};
   size_t num_level = 40;
 
-  raft::handle_t handle;
-  std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data().get());
+  // "FIXME": remove this check once we drop support for Pascal
+  //
+  // Calling louvain on Pascal will throw an exception, we'll check that
+  // this is the behavior while we still support Pascal (device_prop.major < 7)
+  //
+  if (handle.get_device_properties().major < 7) {
+    EXPECT_THROW(cugraph::leiden(handle, G, result_v.data()), cugraph::logic_error);
+  } else {
+    std::tie(num_level, modularity) = cugraph::leiden(handle, G, result_v.data());
+
+    raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream);
 
-  cudaMemcpy((void*)&(cluster_id[0]),
-             result_v.data().get(),
-             sizeof(int) * num_verts,
-             cudaMemcpyDeviceToHost);
+    CUDA_TRY(cudaDeviceSynchronize());
 
-  int min = *min_element(cluster_id.begin(), cluster_id.end());
+    int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  ASSERT_GE(min, 0);
-  ASSERT_GE(modularity, 0.41116042 * 0.99);
+    ASSERT_GE(min, 0);
+    ASSERT_GE(modularity, 0.41116042 * 0.99);
+  }
 }
diff --git a/cpp/tests/community/louvain_test.cu b/cpp/tests/community/louvain_test.cpp
similarity index 62%
rename from cpp/tests/community/louvain_test.cu
rename to cpp/tests/community/louvain_test.cpp
index 2bac0097212..d3024282be3 100644
--- a/cpp/tests/community/louvain_test.cu
+++ b/cpp/tests/community/louvain_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
  *
  * NVIDIA CORPORATION and its licensors retain all intellectual property
  * and proprietary rights in and to this software, related documentation
@@ -15,10 +15,14 @@
 
 #include <thrust/extrema.h>
 
-#include <rmm/thrust_rmm_allocator.h>
+#include <rmm/device_uvector.hpp>
 
 TEST(louvain, success)
 {
+  raft::handle_t handle;
+
+  auto stream = handle.get_stream();
+
   std::vector<int> off_h = {0,  16,  25,  35,  41,  44,  48,  52,  56,  61,  63, 66,
                             67, 69,  74,  76,  78,  80,  82,  84,  87,  89,  91, 93,
                             98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156};
@@ -49,42 +53,54 @@ TEST(louvain, success)
 
   std::vector<int> cluster_id(num_verts, -1);
 
-  rmm::device_vector<int> offsets_v(off_h);
-  rmm::device_vector<int> indices_v(ind_h);
-  rmm::device_vector<float> weights_v(w_h);
-  rmm::device_vector<int> result_v(cluster_id);
+  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
+  rmm::device_uvector<int> indices_v(num_edges, stream);
+  rmm::device_uvector<float> weights_v(num_edges, stream);
+  rmm::device_uvector<int> result_v(num_verts, stream);
+
+  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
+  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
+  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
 
   cugraph::GraphCSRView<int, int, float> G(
-    offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges);
+    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
 
   float modularity{0.0};
   size_t num_level = 40;
 
-  raft::handle_t handle;
+  // "FIXME": remove this check once we drop support for Pascal
+  //
+  // Calling louvain on Pascal will throw an exception, we'll check that
+  // this is the behavior while we still support Pascal (device_prop.major < 7)
+  //
+  if (handle.get_device_properties().major < 7) {
+    EXPECT_THROW(cugraph::louvain(handle, G, result_v.data()), cugraph::logic_error);
+  } else {
+    std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data());
 
-  std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data().get());
+    raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream);
 
-  cudaMemcpy((void*)&(cluster_id[0]),
-             result_v.data().get(),
-             sizeof(int) * num_verts,
-             cudaMemcpyDeviceToHost);
+    CUDA_TRY(cudaDeviceSynchronize());
 
-  int min = *min_element(cluster_id.begin(), cluster_id.end());
+    int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  std::cout << "modularity = " << modularity << std::endl;
+    std::cout << "modularity = " << modularity << std::endl;
 
-  ASSERT_GE(min, 0);
-  ASSERT_GE(modularity, 0.402777 * 0.95);
-  ASSERT_EQ(result_v, result_h);
+    ASSERT_GE(min, 0);
+    ASSERT_GE(modularity, 0.402777 * 0.95);
+    ASSERT_EQ(cluster_id, result_h);
+  }
 }
 
 TEST(louvain_renumbered, success)
 {
+  raft::handle_t handle;
+
+  auto stream = handle.get_stream();
+
   std::vector<int> off_h = {0,   16,  25,  30,  34,  38,  42,  44,  46,  48,  50,  52,
                             54,  56,  73,  85,  95,  101, 107, 112, 117, 121, 125, 129,
-                            132, 135, 138, 141, 144, 147, 149, 151, 153, 155, 156
-
-  };
+                            132, 135, 138, 141, 144, 147, 149, 151, 153, 155, 156};
   std::vector<int> ind_h = {
     1,  3,  7,  11, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 30, 33, 0,  5,  11, 15, 16, 19, 21,
     25, 30, 4,  13, 14, 22, 27, 0,  9,  20, 24, 2,  13, 15, 26, 1,  13, 14, 18, 13, 15, 0,  16,
@@ -110,32 +126,42 @@ TEST(louvain_renumbered, success)
 
   std::vector<int> cluster_id(num_verts, -1);
 
-  rmm::device_vector<int> offsets_v(off_h);
-  rmm::device_vector<int> indices_v(ind_h);
-  rmm::device_vector<float> weights_v(w_h);
-  rmm::device_vector<int> result_v(cluster_id);
+  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
+  rmm::device_uvector<int> indices_v(num_edges, stream);
+  rmm::device_uvector<float> weights_v(num_edges, stream);
+  rmm::device_uvector<int> result_v(num_verts, stream);
+
+  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
+  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
+  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
 
   cugraph::GraphCSRView<int, int, float> G(
-    offsets_v.data().get(), indices_v.data().get(), weights_v.data().get(), num_verts, num_edges);
+    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
 
   float modularity{0.0};
   size_t num_level = 40;
 
-  raft::handle_t handle;
+  // "FIXME": remove this check once we drop support for Pascal
+  //
+  // Calling louvain on Pascal will throw an exception, we'll check that
+  // this is the behavior while we still support Pascal (device_prop.major < 7)
+  //
+  if (handle.get_device_properties().major < 7) {
+    EXPECT_THROW(cugraph::louvain(handle, G, result_v.data()), cugraph::logic_error);
+  } else {
+    std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data());
 
-  std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data().get());
+    raft::update_host(cluster_id.data(), result_v.data(), num_verts, stream);
 
-  cudaMemcpy((void*)&(cluster_id[0]),
-             result_v.data().get(),
-             sizeof(int) * num_verts,
-             cudaMemcpyDeviceToHost);
+    CUDA_TRY(cudaDeviceSynchronize());
 
-  int min = *min_element(cluster_id.begin(), cluster_id.end());
+    int min = *min_element(cluster_id.begin(), cluster_id.end());
 
-  std::cout << "modularity = " << modularity << std::endl;
+    std::cout << "modularity = " << modularity << std::endl;
 
-  ASSERT_GE(min, 0);
-  ASSERT_GE(modularity, 0.402777 * 0.95);
+    ASSERT_GE(min, 0);
+    ASSERT_GE(modularity, 0.402777 * 0.95);
+  }
 }
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/coarsen_graph_test.cpp b/cpp/tests/experimental/coarsen_graph_test.cpp
new file mode 100644
index 00000000000..b790dfffa69
--- /dev/null
+++ b/cpp/tests/experimental/coarsen_graph_test.cpp
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <algorithms.hpp>
+#include <experimental/graph.hpp>
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <map>
+#include <random>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+template <typename vertex_t>
+std::enable_if_t<std::is_signed<vertex_t>::value, bool> is_valid_vertex(vertex_t num_vertices,
+                                                                        vertex_t v)
+{
+  return (v >= 0) && (v < num_vertices);
+}
+
+template <typename vertex_t>
+std::enable_if_t<std::is_unsigned<vertex_t>::value, bool> is_valid_vertex(vertex_t num_vertices,
+                                                                          vertex_t v)
+{
+  return v < num_vertices;
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+void check_coarsened_graph_results(edge_t* org_offsets,
+                                   vertex_t* org_indices,
+                                   weight_t* org_weights,
+                                   vertex_t* org_labels,
+                                   edge_t* coarse_offsets,
+                                   vertex_t* coarse_indices,
+                                   weight_t* coarse_weights,
+                                   vertex_t* coarse_vertex_labels,
+                                   vertex_t num_org_vertices,
+                                   vertex_t num_coarse_vertices)
+{
+  ASSERT_TRUE(((org_weights == nullptr) && (coarse_weights == nullptr)) ||
+              ((org_weights != nullptr) && (coarse_weights != nullptr)));
+  ASSERT_TRUE(std::is_sorted(org_offsets, org_offsets + num_org_vertices));
+  ASSERT_TRUE(std::count_if(org_indices,
+                            org_indices + org_offsets[num_org_vertices],
+                            [num_org_vertices](auto nbr) {
+                              return !is_valid_vertex(num_org_vertices, nbr);
+                            }) == 0);
+  ASSERT_TRUE(std::is_sorted(coarse_offsets, coarse_offsets + num_coarse_vertices));
+  ASSERT_TRUE(std::count_if(coarse_indices,
+                            coarse_indices + coarse_offsets[num_coarse_vertices],
+                            [num_coarse_vertices](auto nbr) {
+                              return !is_valid_vertex(num_coarse_vertices, nbr);
+                            }) == 0);
+  ASSERT_TRUE(num_coarse_vertices <= num_org_vertices);
+
+  std::vector<vertex_t> org_unique_labels(num_org_vertices);
+  std::iota(org_unique_labels.begin(), org_unique_labels.end(), vertex_t{0});
+  std::transform(org_unique_labels.begin(),
+                 org_unique_labels.end(),
+                 org_unique_labels.begin(),
+                 [org_labels](auto v) { return org_labels[v]; });
+  std::sort(org_unique_labels.begin(), org_unique_labels.end());
+  org_unique_labels.resize(std::distance(
+    org_unique_labels.begin(), std::unique(org_unique_labels.begin(), org_unique_labels.end())));
+
+  ASSERT_TRUE(org_unique_labels.size() == static_cast<size_t>(num_coarse_vertices));
+
+  {
+    std::vector<vertex_t> tmp_coarse_vertex_labels(coarse_vertex_labels,
+                                                   coarse_vertex_labels + num_coarse_vertices);
+    std::sort(tmp_coarse_vertex_labels.begin(), tmp_coarse_vertex_labels.end());
+    ASSERT_TRUE(std::unique(tmp_coarse_vertex_labels.begin(), tmp_coarse_vertex_labels.end()) ==
+                tmp_coarse_vertex_labels.end());
+    ASSERT_TRUE(std::equal(
+      org_unique_labels.begin(), org_unique_labels.end(), tmp_coarse_vertex_labels.begin()));
+  }
+
+  std::vector<std::tuple<vertex_t, vertex_t>> label_org_vertex_pairs(num_org_vertices);
+  for (vertex_t i = 0; i < num_org_vertices; ++i) {
+    label_org_vertex_pairs[i] = std::make_tuple(org_labels[i], i);
+  }
+  std::sort(label_org_vertex_pairs.begin(), label_org_vertex_pairs.end());
+
+  std::map<vertex_t, vertex_t> label_to_coarse_vertex_map{};
+  for (vertex_t i = 0; i < num_coarse_vertices; ++i) {
+    label_to_coarse_vertex_map[coarse_vertex_labels[i]] = i;
+  }
+
+  auto threshold_ratio = (org_weights == nullptr) ? weight_t{1.0} /* irrelevant */ : weight_t{1e-4};
+  auto threshold_magnitude =
+    (org_weights == nullptr)
+      ? weight_t{1.0} /* irrelevant */
+      : (std::accumulate(
+           coarse_weights, coarse_weights + coarse_offsets[num_coarse_vertices], weight_t{0.0}) /
+         static_cast<weight_t>(coarse_offsets[num_coarse_vertices])) *
+          threshold_ratio;
+
+  for (size_t i = 0; i < org_unique_labels.size(); ++i) {  // for each vertex in the coarse graph
+    auto lb = std::lower_bound(
+      label_org_vertex_pairs.begin(),
+      label_org_vertex_pairs.end(),
+      std::make_tuple(org_unique_labels[i],
+                      cugraph::experimental::invalid_vertex_id<vertex_t>::value /* dummy */),
+      [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
+    auto ub = std::upper_bound(
+      label_org_vertex_pairs.begin(),
+      label_org_vertex_pairs.end(),
+      std::make_tuple(org_unique_labels[i],
+                      cugraph::experimental::invalid_vertex_id<vertex_t>::value /* dummy */),
+      [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
+    auto count  = std::distance(lb, ub);
+    auto offset = std::distance(label_org_vertex_pairs.begin(), lb);
+    if (org_weights == nullptr) {
+      std::vector<vertex_t> coarse_nbrs0{};
+      std::for_each(
+        lb,
+        ub,
+        [org_offsets, org_indices, org_labels, &label_to_coarse_vertex_map, &coarse_nbrs0](auto t) {
+          auto org_vertex = std::get<1>(t);
+          std::vector<vertex_t> tmp_nbrs(org_offsets[org_vertex + 1] - org_offsets[org_vertex]);
+          std::transform(org_indices + org_offsets[org_vertex],
+                         org_indices + org_offsets[org_vertex + 1],
+                         tmp_nbrs.begin(),
+                         [org_labels, &label_to_coarse_vertex_map](auto nbr) {
+                           return label_to_coarse_vertex_map[org_labels[nbr]];
+                         });
+          coarse_nbrs0.insert(coarse_nbrs0.end(), tmp_nbrs.begin(), tmp_nbrs.end());
+        });
+      std::sort(coarse_nbrs0.begin(), coarse_nbrs0.end());
+      coarse_nbrs0.resize(
+        std::distance(coarse_nbrs0.begin(), std::unique(coarse_nbrs0.begin(), coarse_nbrs0.end())));
+
+      auto coarse_vertex = label_to_coarse_vertex_map[org_unique_labels[i]];
+      auto coarse_offset = coarse_offsets[coarse_vertex];
+      auto coarse_count  = coarse_offsets[coarse_vertex + 1] - coarse_offset;
+      std::vector<vertex_t> coarse_nbrs1(coarse_indices + coarse_offset,
+                                         coarse_indices + coarse_offset + coarse_count);
+      std::sort(coarse_nbrs1.begin(), coarse_nbrs1.end());
+
+      ASSERT_TRUE(coarse_nbrs0.size() == coarse_nbrs1.size());
+      ASSERT_TRUE(std::equal(coarse_nbrs0.begin(), coarse_nbrs0.end(), coarse_nbrs1.begin()));
+    } else {
+      std::vector<std::tuple<vertex_t, weight_t>> coarse_nbr_weight_pairs0{};
+      std::for_each(lb,
+                    ub,
+                    [org_offsets,
+                     org_indices,
+                     org_weights,
+                     org_labels,
+                     &label_to_coarse_vertex_map,
+                     &coarse_nbr_weight_pairs0](auto t) {
+                      auto org_vertex = std::get<1>(t);
+                      std::vector<std::tuple<vertex_t, weight_t>> tmp_pairs(
+                        org_offsets[org_vertex + 1] - org_offsets[org_vertex]);
+                      for (auto j = org_offsets[org_vertex]; j < org_offsets[org_vertex + 1]; ++j) {
+                        tmp_pairs[j - org_offsets[org_vertex]] = std::make_tuple(
+                          label_to_coarse_vertex_map[org_labels[org_indices[j]]], org_weights[j]);
+                      }
+                      coarse_nbr_weight_pairs0.insert(
+                        coarse_nbr_weight_pairs0.end(), tmp_pairs.begin(), tmp_pairs.end());
+                    });
+      std::sort(coarse_nbr_weight_pairs0.begin(), coarse_nbr_weight_pairs0.end());
+      // reduce by key
+      {
+        size_t run_start_idx = 0;
+        for (size_t j = 1; j < coarse_nbr_weight_pairs0.size(); ++j) {
+          auto& start = coarse_nbr_weight_pairs0[run_start_idx];
+          auto& cur   = coarse_nbr_weight_pairs0[j];
+          if (std::get<0>(start) == std::get<0>(cur)) {
+            std::get<1>(start) += std::get<1>(cur);
+            std::get<0>(cur) = cugraph::experimental::invalid_vertex_id<vertex_t>::value;
+          } else {
+            run_start_idx = j;
+          }
+        }
+        coarse_nbr_weight_pairs0.erase(
+          std::remove_if(coarse_nbr_weight_pairs0.begin(),
+                         coarse_nbr_weight_pairs0.end(),
+                         [](auto t) {
+                           return std::get<0>(t) ==
+                                  cugraph::experimental::invalid_vertex_id<vertex_t>::value;
+                         }),
+          coarse_nbr_weight_pairs0.end());
+      }
+
+      auto coarse_vertex = label_to_coarse_vertex_map[org_unique_labels[i]];
+      std::vector<std::tuple<vertex_t, weight_t>> coarse_nbr_weight_pairs1(
+        coarse_offsets[coarse_vertex + 1] - coarse_offsets[coarse_vertex]);
+      for (auto j = coarse_offsets[coarse_vertex]; j < coarse_offsets[coarse_vertex + 1]; ++j) {
+        coarse_nbr_weight_pairs1[j - coarse_offsets[coarse_vertex]] =
+          std::make_tuple(coarse_indices[j], coarse_weights[j]);
+      }
+      std::sort(coarse_nbr_weight_pairs1.begin(), coarse_nbr_weight_pairs1.end());
+
+      ASSERT_TRUE(coarse_nbr_weight_pairs0.size() == coarse_nbr_weight_pairs1.size());
+      ASSERT_TRUE(std::equal(
+        coarse_nbr_weight_pairs0.begin(),
+        coarse_nbr_weight_pairs0.end(),
+        coarse_nbr_weight_pairs1.begin(),
+        [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+          return std::get<0>(lhs) == std::get<0>(rhs)
+                   ? (std::abs(std::get<1>(lhs) - std::get<1>(rhs)) <=
+                      std::max(std::max(std::abs(std::get<1>(lhs)), std::abs(std::get<1>(rhs))) *
+                                 threshold_ratio,
+                               threshold_magnitude))
+                   : false;
+        }));
+    }
+  }
+
+  return;
+}
+
+typedef struct CoarsenGraph_Usecase_t {
+  std::string graph_file_full_path{};
+  double coarsen_ratio{0.0};
+  bool test_weighted{false};
+
+  CoarsenGraph_Usecase_t(std::string const& graph_file_path,
+                         double coarsen_ratio,
+                         bool test_weighted)
+    : coarsen_ratio(coarsen_ratio), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} CoarsenGraph_Usecase;
+
+class Tests_CoarsenGraph : public ::testing::TestWithParam<CoarsenGraph_Usecase> {
+ public:
+  Tests_CoarsenGraph() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+  void run_current_test(CoarsenGraph_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    // FIXME: remove this once we drop Pascal support
+    if (handle.get_device_properties().major < 7) {  // Pascal is not supported, skip testing
+      return;
+    }
+
+    auto graph = cugraph::test::
+      read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, store_transposed>(
+        handle, configuration.graph_file_full_path, configuration.test_weighted);
+    auto graph_view = graph.view();
+
+    if (graph_view.get_number_of_vertices() == 0) { return; }
+
+    std::vector<vertex_t> h_labels(graph_view.get_number_of_vertices());
+    auto num_labels =
+      std::max(static_cast<vertex_t>(h_labels.size() * configuration.coarsen_ratio), vertex_t{1});
+
+    std::default_random_engine generator{};
+    std::uniform_int_distribution<vertex_t> distribution{0, num_labels - 1};
+
+    std::for_each(h_labels.begin(), h_labels.end(), [&distribution, &generator](auto& label) {
+      label = distribution(generator);
+    });
+
+    rmm::device_uvector<vertex_t> d_labels(h_labels.size(), handle.get_stream());
+    raft::update_device(d_labels.data(), h_labels.data(), h_labels.size(), handle.get_stream());
+
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    std::unique_ptr<
+      cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>>
+      coarse_graph{};
+    rmm::device_uvector<vertex_t> coarse_vertices_to_labels(0, handle.get_stream());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    std::tie(coarse_graph, coarse_vertices_to_labels) =
+      cugraph::experimental::coarsen_graph(handle, graph_view, d_labels.begin());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    std::vector<edge_t> h_org_offsets(graph_view.get_number_of_vertices() + 1);
+    std::vector<vertex_t> h_org_indices(graph_view.get_number_of_edges());
+    std::vector<weight_t> h_org_weights{};
+    raft::update_host(h_org_offsets.data(),
+                      graph_view.offsets(),
+                      graph_view.get_number_of_vertices() + 1,
+                      handle.get_stream());
+    raft::update_host(h_org_indices.data(),
+                      graph_view.indices(),
+                      graph_view.get_number_of_edges(),
+                      handle.get_stream());
+    if (graph_view.is_weighted()) {
+      h_org_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0});
+      raft::update_host(h_org_weights.data(),
+                        graph_view.weights(),
+                        graph_view.get_number_of_edges(),
+                        handle.get_stream());
+    }
+
+    auto coarse_graph_view = coarse_graph->view();
+
+    std::vector<edge_t> h_coarse_offsets(coarse_graph_view.get_number_of_vertices() + 1);
+    std::vector<vertex_t> h_coarse_indices(coarse_graph_view.get_number_of_edges());
+    std::vector<weight_t> h_coarse_weights{};
+    raft::update_host(h_coarse_offsets.data(),
+                      coarse_graph_view.offsets(),
+                      coarse_graph_view.get_number_of_vertices() + 1,
+                      handle.get_stream());
+    raft::update_host(h_coarse_indices.data(),
+                      coarse_graph_view.indices(),
+                      coarse_graph_view.get_number_of_edges(),
+                      handle.get_stream());
+    if (graph_view.is_weighted()) {
+      h_coarse_weights.resize(coarse_graph_view.get_number_of_edges());
+      raft::update_host(h_coarse_weights.data(),
+                        coarse_graph_view.weights(),
+                        coarse_graph_view.get_number_of_edges(),
+                        handle.get_stream());
+    }
+
+    std::vector<vertex_t> h_coarse_vertices_to_labels(coarse_vertices_to_labels.size());
+    raft::update_host(h_coarse_vertices_to_labels.data(),
+                      coarse_vertices_to_labels.data(),
+                      coarse_vertices_to_labels.size(),
+                      handle.get_stream());
+
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    check_coarsened_graph_results(h_org_offsets.data(),
+                                  h_org_indices.data(),
+                                  h_org_weights.data(),
+                                  h_labels.data(),
+                                  h_coarse_offsets.data(),
+                                  h_coarse_indices.data(),
+                                  h_coarse_weights.data(),
+                                  h_coarse_vertices_to_labels.data(),
+                                  graph_view.get_number_of_vertices(),
+                                  coarse_graph_view.get_number_of_vertices());
+  }
+};
+
+// FIXME: add tests for type combinations
+
+TEST_P(Tests_CoarsenGraph, CheckInt32Int32FloatTransposed)
+{
+  run_current_test<int32_t, int32_t, float, true>(GetParam());
+}
+
+TEST_P(Tests_CoarsenGraph, CheckInt32Int32FloatUntransposed)
+{
+  run_current_test<int32_t, int32_t, float, false>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_CoarsenGraph,
+  ::testing::Values(CoarsenGraph_Usecase("test/datasets/karate.mtx", 0.2, false),
+                    CoarsenGraph_Usecase("test/datasets/karate.mtx", 0.2, true),
+                    CoarsenGraph_Usecase("test/datasets/web-Google.mtx", 0.1, false),
+                    CoarsenGraph_Usecase("test/datasets/web-Google.mtx", 0.1, true),
+                    CoarsenGraph_Usecase("test/datasets/ljournal-2008.mtx", 0.1, false),
+                    CoarsenGraph_Usecase("test/datasets/ljournal-2008.mtx", 0.1, true),
+                    CoarsenGraph_Usecase("test/datasets/webbase-1M.mtx", 0.1, false),
+                    CoarsenGraph_Usecase("test/datasets/webbase-1M.mtx", 0.1, true)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/induced_subgraph_test.cpp b/cpp/tests/experimental/induced_subgraph_test.cpp
new file mode 100644
index 00000000000..72894a9349f
--- /dev/null
+++ b/cpp/tests/experimental/induced_subgraph_test.cpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governin_from_mtxg permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <experimental/graph.hpp>
+#include <experimental/graph_functions.hpp>
+#include <experimental/graph_view.hpp>
+
+#include <raft/cudart_utils.h>
+#include <raft/handle.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <random>
+#include <tuple>
+#include <vector>
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::tuple<std::vector<vertex_t>, std::vector<vertex_t>, std::vector<weight_t>, std::vector<size_t>>
+extract_induced_subgraph_reference(edge_t const* offsets,
+                                   vertex_t const* indices,
+                                   weight_t const* weights,
+                                   size_t const* subgraph_offsets,
+                                   vertex_t const* subgraph_vertices,
+                                   vertex_t num_vertices,
+                                   size_t num_subgraphs)
+{
+  std::vector<vertex_t> edgelist_majors{};
+  std::vector<vertex_t> edgelist_minors{};
+  std::vector<weight_t> edgelist_weights{};
+  std::vector<size_t> subgraph_edge_offsets{0};
+
+  for (size_t i = 0; i < num_subgraphs; ++i) {
+    std::for_each(subgraph_vertices + subgraph_offsets[i],
+                  subgraph_vertices + subgraph_offsets[i + 1],
+                  [offsets,
+                   indices,
+                   weights,
+                   subgraph_vertices,
+                   subgraph_offsets,
+                   &edgelist_majors,
+                   &edgelist_minors,
+                   &edgelist_weights,
+                   i](auto v) {
+                    auto first = offsets[v];
+                    auto last  = offsets[v + 1];
+                    for (auto j = first; j < last; ++j) {
+                      if (std::binary_search(subgraph_vertices + subgraph_offsets[i],
+                                             subgraph_vertices + subgraph_offsets[i + 1],
+                                             indices[j])) {
+                        edgelist_majors.push_back(v);
+                        edgelist_minors.push_back(indices[j]);
+                        if (weights != nullptr) { edgelist_weights.push_back(weights[j]); }
+                      }
+                    }
+                  });
+    subgraph_edge_offsets.push_back(edgelist_majors.size());
+  }
+
+  return std::make_tuple(edgelist_majors, edgelist_minors, edgelist_weights, subgraph_edge_offsets);
+}
+
+typedef struct InducedSubgraph_Usecase_t {
+  std::string graph_file_full_path{};
+  std::vector<size_t> subgraph_sizes{};
+  bool test_weighted{false};
+
+  InducedSubgraph_Usecase_t(std::string const& graph_file_path,
+                            std::vector<size_t> const& subgraph_sizes,
+                            bool test_weighted)
+    : subgraph_sizes(subgraph_sizes), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} InducedSubgraph_Usecase;
+
+class Tests_InducedSubgraph : public ::testing::TestWithParam<InducedSubgraph_Usecase> {
+ public:
+  Tests_InducedSubgraph() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+  void run_current_test(InducedSubgraph_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    auto graph = cugraph::test::
+      read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, store_transposed>(
+        handle, configuration.graph_file_full_path, configuration.test_weighted);
+    auto graph_view = graph.view();
+
+    std::vector<edge_t> h_offsets(graph_view.get_number_of_vertices() + 1);
+    std::vector<vertex_t> h_indices(graph_view.get_number_of_edges());
+    std::vector<weight_t> h_weights{};
+    raft::update_host(h_offsets.data(),
+                      graph_view.offsets(),
+                      graph_view.get_number_of_vertices() + 1,
+                      handle.get_stream());
+    raft::update_host(h_indices.data(),
+                      graph_view.indices(),
+                      graph_view.get_number_of_edges(),
+                      handle.get_stream());
+    if (graph_view.is_weighted()) {
+      h_weights.assign(graph_view.get_number_of_edges(), weight_t{0.0});
+      raft::update_host(h_weights.data(),
+                        graph_view.weights(),
+                        graph_view.get_number_of_edges(),
+                        handle.get_stream());
+    }
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    std::vector<size_t> h_subgraph_offsets(configuration.subgraph_sizes.size() + 1, 0);
+    std::partial_sum(configuration.subgraph_sizes.begin(),
+                     configuration.subgraph_sizes.end(),
+                     h_subgraph_offsets.begin() + 1);
+    std::vector<vertex_t> h_subgraph_vertices(
+      h_subgraph_offsets.back(), cugraph::experimental::invalid_vertex_id<vertex_t>::value);
+    std::default_random_engine generator{};
+    std::uniform_int_distribution<vertex_t> distribution{0,
+                                                         graph_view.get_number_of_vertices() - 1};
+
+    for (size_t i = 0; i < configuration.subgraph_sizes.size(); ++i) {
+      auto start = h_subgraph_offsets[i];
+      auto last  = h_subgraph_offsets[i + 1];
+      ASSERT_TRUE(last - start <= graph_view.get_number_of_vertices()) << "Invalid subgraph size.";
+      // this is inefficient if last - start << graph_view.get_number_of_vertices() but this is for
+      // the test puspose only and the time & memory cost is only linear to
+      // graph_view.get_number_of_vertices(), so this may not matter.
+      std::vector<vertex_t> vertices(graph_view.get_number_of_vertices());
+      std::iota(vertices.begin(), vertices.end(), vertex_t{0});
+      std::random_shuffle(vertices.begin(), vertices.end());
+      std::copy(
+        vertices.begin(), vertices.begin() + (last - start), h_subgraph_vertices.begin() + start);
+      std::sort(h_subgraph_vertices.begin() + start, h_subgraph_vertices.begin() + last);
+    }
+
+    rmm::device_uvector<size_t> d_subgraph_offsets(h_subgraph_offsets.size(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_subgraph_vertices(h_subgraph_vertices.size(),
+                                                      handle.get_stream());
+    raft::update_device(d_subgraph_offsets.data(),
+                        h_subgraph_offsets.data(),
+                        h_subgraph_offsets.size(),
+                        handle.get_stream());
+    raft::update_device(d_subgraph_vertices.data(),
+                        h_subgraph_vertices.data(),
+                        h_subgraph_vertices.size(),
+                        handle.get_stream());
+
+    std::vector<vertex_t> h_reference_subgraph_edgelist_majors{};
+    std::vector<vertex_t> h_reference_subgraph_edgelist_minors{};
+    std::vector<weight_t> h_reference_subgraph_edgelist_weights{};
+    std::vector<size_t> h_reference_subgraph_edge_offsets{};
+    std::tie(h_reference_subgraph_edgelist_majors,
+             h_reference_subgraph_edgelist_minors,
+             h_reference_subgraph_edgelist_weights,
+             h_reference_subgraph_edge_offsets) =
+      extract_induced_subgraph_reference(
+        h_offsets.data(),
+        h_indices.data(),
+        h_weights.size() > 0 ? h_weights.data() : static_cast<weight_t*>(nullptr),
+        h_subgraph_offsets.data(),
+        h_subgraph_vertices.data(),
+        graph_view.get_number_of_vertices(),
+        configuration.subgraph_sizes.size());
+
+    rmm::device_uvector<vertex_t> d_subgraph_edgelist_majors(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> d_subgraph_edgelist_minors(0, handle.get_stream());
+    rmm::device_uvector<weight_t> d_subgraph_edgelist_weights(0, handle.get_stream());
+    rmm::device_uvector<size_t> d_subgraph_edge_offsets(0, handle.get_stream());
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    // FIXME: turn-off do_expensive_check once verified.
+    std::tie(d_subgraph_edgelist_majors,
+             d_subgraph_edgelist_minors,
+             d_subgraph_edgelist_weights,
+             d_subgraph_edge_offsets) =
+      cugraph::experimental::extract_induced_subgraphs(handle,
+                                                       graph_view,
+                                                       d_subgraph_offsets.data(),
+                                                       d_subgraph_vertices.data(),
+                                                       configuration.subgraph_sizes.size(),
+                                                       true);
+
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    std::vector<vertex_t> h_cugraph_subgraph_edgelist_majors(d_subgraph_edgelist_majors.size());
+    std::vector<vertex_t> h_cugraph_subgraph_edgelist_minors(d_subgraph_edgelist_minors.size());
+    std::vector<weight_t> h_cugraph_subgraph_edgelist_weights(d_subgraph_edgelist_weights.size());
+    std::vector<size_t> h_cugraph_subgraph_edge_offsets(d_subgraph_edge_offsets.size());
+
+    raft::update_host(h_cugraph_subgraph_edgelist_majors.data(),
+                      d_subgraph_edgelist_majors.data(),
+                      d_subgraph_edgelist_majors.size(),
+                      handle.get_stream());
+    raft::update_host(h_cugraph_subgraph_edgelist_minors.data(),
+                      d_subgraph_edgelist_minors.data(),
+                      d_subgraph_edgelist_minors.size(),
+                      handle.get_stream());
+    if (configuration.test_weighted) {
+      raft::update_host(h_cugraph_subgraph_edgelist_weights.data(),
+                        d_subgraph_edgelist_weights.data(),
+                        d_subgraph_edgelist_weights.size(),
+                        handle.get_stream());
+    }
+    raft::update_host(h_cugraph_subgraph_edge_offsets.data(),
+                      d_subgraph_edge_offsets.data(),
+                      d_subgraph_edge_offsets.size(),
+                      handle.get_stream());
+    CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
+
+    ASSERT_TRUE(h_reference_subgraph_edge_offsets.size() == h_cugraph_subgraph_edge_offsets.size())
+      << "Returned subgraph edge offset vector has an invalid size.";
+    ASSERT_TRUE(std::equal(h_reference_subgraph_edge_offsets.begin(),
+                           h_reference_subgraph_edge_offsets.end(),
+                           h_cugraph_subgraph_edge_offsets.begin()))
+      << "Returned subgraph edge offset values do not match with the reference values.";
+
+    for (size_t i = 0; i < configuration.subgraph_sizes.size(); ++i) {
+      auto start = h_reference_subgraph_edge_offsets[i];
+      auto last  = h_reference_subgraph_edge_offsets[i + 1];
+      if (configuration.test_weighted) {
+        std::vector<std::tuple<vertex_t, vertex_t, weight_t>> reference_tuples(last - start);
+        std::vector<std::tuple<vertex_t, vertex_t, weight_t>> cugraph_tuples(last - start);
+        for (auto j = start; j < last; ++j) {
+          reference_tuples[j - start] = std::make_tuple(h_reference_subgraph_edgelist_majors[j],
+                                                        h_reference_subgraph_edgelist_minors[j],
+                                                        h_reference_subgraph_edgelist_weights[j]);
+          cugraph_tuples[j - start]   = std::make_tuple(h_cugraph_subgraph_edgelist_majors[j],
+                                                      h_cugraph_subgraph_edgelist_minors[j],
+                                                      h_cugraph_subgraph_edgelist_weights[j]);
+        }
+        ASSERT_TRUE(
+          std::equal(reference_tuples.begin(), reference_tuples.end(), cugraph_tuples.begin()))
+          << "Extracted subgraph edges do not match with the edges extracted by the reference "
+             "implementation.";
+      } else {
+        std::vector<std::tuple<vertex_t, vertex_t>> reference_tuples(last - start);
+        std::vector<std::tuple<vertex_t, vertex_t>> cugraph_tuples(last - start);
+        for (auto j = start; j < last; ++j) {
+          reference_tuples[j - start] = std::make_tuple(h_reference_subgraph_edgelist_majors[j],
+                                                        h_reference_subgraph_edgelist_minors[j]);
+          cugraph_tuples[j - start]   = std::make_tuple(h_cugraph_subgraph_edgelist_majors[j],
+                                                      h_cugraph_subgraph_edgelist_minors[j]);
+        }
+        ASSERT_TRUE(
+          std::equal(reference_tuples.begin(), reference_tuples.end(), cugraph_tuples.begin()))
+          << "Extracted subgraph edges do not match with the edges extracted by the reference "
+             "implementation.";
+      }
+    }
+  }
+};
+
+// FIXME: add tests for type combinations
+
+TEST_P(Tests_InducedSubgraph, CheckInt32Int32FloatTransposed)
+{
+  run_current_test<int32_t, int32_t, float, true>(GetParam());
+}
+
+TEST_P(Tests_InducedSubgraph, CheckInt32Int32FloatUntransposed)
+{
+  run_current_test<int32_t, int32_t, float, false>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_InducedSubgraph,
+  ::testing::Values(
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{0}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{1}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{10}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{34}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{10, 0, 5}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{9, 3, 10}, false),
+    InducedSubgraph_Usecase("test/datasets/karate.mtx", std::vector<size_t>{5, 12, 13}, true),
+    InducedSubgraph_Usecase("test/datasets/web-Google.mtx",
+                            std::vector<size_t>{250, 130, 15},
+                            false),
+    InducedSubgraph_Usecase("test/datasets/web-Google.mtx",
+                            std::vector<size_t>{125, 300, 70},
+                            true),
+    InducedSubgraph_Usecase("test/datasets/ljournal-2008.mtx",
+                            std::vector<size_t>{300, 20, 400},
+                            false),
+    InducedSubgraph_Usecase("test/datasets/ljournal-2008.mtx",
+                            std::vector<size_t>{9130, 1200, 300},
+                            true),
+    InducedSubgraph_Usecase("test/datasets/webbase-1M.mtx", std::vector<size_t>{700}, false),
+    InducedSubgraph_Usecase("test/datasets/webbase-1M.mtx", std::vector<size_t>{500}, true)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/katz_centrality_test.cpp b/cpp/tests/experimental/katz_centrality_test.cpp
index cdbe3688248..3e9f0b478a0 100644
--- a/cpp/tests/experimental/katz_centrality_test.cpp
+++ b/cpp/tests/experimental/katz_centrality_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -198,8 +198,8 @@ class Tests_KatzCentrality : public ::testing::TestWithParam<KatzCentrality_Usec
       (1.0 / static_cast<result_t>(graph_view.get_number_of_vertices())) *
       threshold_ratio;  // skip comparison for low Katz Centrality verties (lowly ranked vertices)
     auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
-      auto diff = std::abs(lhs - rhs);
-      return (diff < std::max(lhs, rhs) * threshold_ratio) || (diff < threshold_magnitude);
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
     };
 
     ASSERT_TRUE(std::equal(h_reference_katz_centralities.begin(),
diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu
index 4a47b1a1aca..35a26923df6 100644
--- a/cpp/tests/experimental/louvain_test.cu
+++ b/cpp/tests/experimental/louvain_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/experimental/pagerank_test.cpp b/cpp/tests/experimental/pagerank_test.cpp
index 70c83ef8192..53143bf0bf3 100644
--- a/cpp/tests/experimental/pagerank_test.cpp
+++ b/cpp/tests/experimental/pagerank_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -274,8 +274,8 @@ class Tests_PageRank : public ::testing::TestWithParam<PageRank_Usecase> {
       (1.0 / static_cast<result_t>(graph_view.get_number_of_vertices())) *
       threshold_ratio;  // skip comparison for low PageRank verties (lowly ranked vertices)
     auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
-      auto diff = std::abs(lhs - rhs);
-      return (diff < std::max(lhs, rhs) * threshold_ratio) || (diff < threshold_magnitude);
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
     };
 
     ASSERT_TRUE(std::equal(h_reference_pageranks.begin(),
diff --git a/cpp/tests/pagerank/pagerank_mg_test.cpp b/cpp/tests/pagerank/pagerank_mg_test.cpp
new file mode 100644
index 00000000000..7f789226bf1
--- /dev/null
+++ b/cpp/tests/pagerank/pagerank_mg_test.cpp
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/mg_test_utilities.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <algorithms.hpp>
+#include <partition_manager.hpp>
+
+#include <gtest/gtest.h>
+
+#include <random>
+
+////////////////////////////////////////////////////////////////////////////////
+// Test param object. This defines the input and expected output for a test, and
+// will be instantiated as the parameter to the tests defined below using
+// INSTANTIATE_TEST_CASE_P()
+//
+typedef struct Pagerank_Testparams_t {
+  std::string graph_file_full_path{};
+  double personalization_ratio{0.0};
+  bool test_weighted{false};
+
+  Pagerank_Testparams_t(std::string const& graph_file_path,
+                        double personalization_ratio,
+                        bool test_weighted)
+    : personalization_ratio(personalization_ratio), test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+} Pagerank_Testparams_t;
+
+////////////////////////////////////////////////////////////////////////////////
+// Parameterized test fixture, to be used with TEST_P().  This defines common
+// setup and teardown steps as well as common utilities used by each E2E MG
+// test.  In this case, each test is identical except for the inputs and
+// expected outputs, so the entire test is defined in the run_test() method.
+//
+class Pagerank_E2E_MG_Testfixture_t : public cugraph::test::MG_TestFixture_t,
+                                      public ::testing::WithParamInterface<Pagerank_Testparams_t> {
+ public:
+  Pagerank_E2E_MG_Testfixture_t() {}
+
+  // Run once for each test instance
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  // Return the results of running pagerank on a single GPU for the dataset in
+  // graph_file_path.
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  std::vector<result_t> get_sg_results(raft::handle_t& handle,
+                                       const std::string& graph_file_path,
+                                       const result_t alpha,
+                                       const result_t epsilon)
+  {
+    auto graph =
+      cugraph::test::read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, true>(
+        handle, graph_file_path, true);  // FIXME: should use param.test_weighted instead of true
+
+    auto graph_view     = graph.view();
+    cudaStream_t stream = handle.get_stream();
+    rmm::device_uvector<result_t> d_pageranks(graph_view.get_number_of_vertices(), stream);
+
+    cugraph::experimental::pagerank(
+      handle,
+      graph_view,
+      static_cast<weight_t*>(nullptr),     // adj_matrix_row_out_weight_sums
+      static_cast<vertex_t*>(nullptr),     // personalization_vertices
+      static_cast<result_t*>(nullptr),     // personalization_values
+      static_cast<vertex_t>(0),            // personalization_vector_size
+      d_pageranks.begin(),                 // pageranks
+      alpha,                               // alpha (damping factor)
+      epsilon,                             // error tolerance for convergence
+      std::numeric_limits<size_t>::max(),  // max_iterations
+      false,                               // has_initial_guess
+      true);                               // do_expensive_check
+
+    std::vector<result_t> h_pageranks(graph_view.get_number_of_vertices());
+    raft::update_host(h_pageranks.data(), d_pageranks.data(), d_pageranks.size(), stream);
+
+    return h_pageranks;
+  }
+
+  // Compare the results of running pagerank on multiple GPUs to that of a
+  // single-GPU run for the configuration in param.
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_test(const Pagerank_Testparams_t& param)
+  {
+    result_t constexpr alpha{0.85};
+    result_t constexpr epsilon{1e-6};
+
+    raft::handle_t handle;
+    raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
+    const auto& comm = handle.get_comms();
+
+    cudaStream_t stream = handle.get_stream();
+
+    // Assuming 2 GPUs which means 1 row, 2 cols. 2 cols = row_comm_size of 2.
+    // FIXME: DO NOT ASSUME 2 GPUs, add code to compute prows, pcols
+    size_t row_comm_size{2};
+    cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
+      subcomm_factory(handle, row_comm_size);
+
+    int my_rank = comm.get_rank();
+
+    // FIXME: graph must be weighted!
+    std::unique_ptr<cugraph::experimental::
+                      graph_t<vertex_t, edge_t, weight_t, true, true>>  // store_transposed=true,
+                                                                        // multi_gpu=true
+      mg_graph_ptr{};
+    rmm::device_uvector<vertex_t> d_renumber_map_labels(0, handle.get_stream());
+
+    std::tie(mg_graph_ptr, d_renumber_map_labels) = cugraph::test::
+      create_graph_for_gpu<vertex_t, edge_t, weight_t, true>  // store_transposed=true
+      (handle, param.graph_file_full_path);
+
+    auto mg_graph_view = mg_graph_ptr->view();
+
+    rmm::device_uvector<result_t> d_mg_pageranks(mg_graph_view.get_number_of_vertices(), stream);
+    CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+
+    cugraph::experimental::pagerank(
+      handle,
+      mg_graph_view,
+      static_cast<weight_t*>(nullptr),     // adj_matrix_row_out_weight_sums
+      static_cast<vertex_t*>(nullptr),     // personalization_vertices
+      static_cast<result_t*>(nullptr),     // personalization_values
+      static_cast<vertex_t>(0),            // personalization_vector_size
+      d_mg_pageranks.begin(),              // pageranks
+      alpha,                               // alpha (damping factor)
+      epsilon,                             // error tolerance for convergence
+      std::numeric_limits<size_t>::max(),  // max_iterations
+      false,                               // has_initial_guess
+      true);                               // do_expensive_check
+
+    std::vector<result_t> h_mg_pageranks(mg_graph_view.get_number_of_vertices());
+
+    raft::update_host(h_mg_pageranks.data(), d_mg_pageranks.data(), d_mg_pageranks.size(), stream);
+
+    std::vector<vertex_t> h_renumber_map_labels(mg_graph_view.get_number_of_vertices());
+    raft::update_host(h_renumber_map_labels.data(),
+                      d_renumber_map_labels.data(),
+                      d_renumber_map_labels.size(),
+                      stream);
+
+    // Compare MG to SG
+    // Each GPU will have pagerank values for their range, so ech GPU must
+    // compare to specific SG results for their respective range.
+
+    auto h_sg_pageranks = get_sg_results<vertex_t, edge_t, weight_t, result_t>(
+      handle, param.graph_file_full_path, alpha, epsilon);
+
+    // For this test, each GPU will have the full set of vertices and
+    // therefore the pageranks vectors should be equal in size.
+    ASSERT_EQ(h_sg_pageranks.size(), h_mg_pageranks.size());
+
+    auto threshold_ratio = 1e-3;
+    auto threshold_magnitude =
+      (1.0 / static_cast<result_t>(mg_graph_view.get_number_of_vertices())) *
+      threshold_ratio;  // skip comparison for low PageRank verties (lowly ranked vertices)
+    auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
+      return std::abs(lhs - rhs) <
+             std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
+    };
+
+    vertex_t mapped_vertex{0};
+    for (vertex_t i = 0;
+         i + mg_graph_view.get_local_vertex_first() < mg_graph_view.get_local_vertex_last();
+         ++i) {
+      mapped_vertex = h_renumber_map_labels[i];
+      ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex]))
+        << "MG PageRank value for vertex: " << i << " in rank: " << my_rank
+        << " has value: " << h_mg_pageranks[i]
+        << " which exceeds the error margin for comparing to SG value: " << h_sg_pageranks[i];
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+TEST_P(Pagerank_E2E_MG_Testfixture_t, CheckInt32Int32FloatFloat)
+{
+  run_test<int32_t, int32_t, float, float>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  e2e,
+  Pagerank_E2E_MG_Testfixture_t,
+
+  // FIXME: the personalization_ratio and use_weighted boo are not used
+  // (personilization vectors are not used, and all datasets are assumed
+  // weighted). update this to use personilization vectors and non-weighted
+  // graphs.
+  ::testing::Values(Pagerank_Testparams_t("test/datasets/karate.mtx", 0.0, true),
+                    // FIXME: The commented datasets contain isolate vertices
+                    // which result in a different number of vertices in the
+                    // renumbered MG graph (because the renumbering function
+                    // does not include them) vs. the SG graph object used for
+                    // the pagerank comparison because the SG graph reads the
+                    // COO as-is without renumbering.  Update the utility that
+                    // reads a .mtx and constructs a SG graph object to also
+                    // renumber and return the renumber vertices vector. This
+                    // will result in a comparison of an equal number of
+                    // pagerank values.
+                    //
+                    // Pagerank_Testparams_t("test/datasets/web-Google.mtx", 0.0, true),
+                    // Pagerank_Testparams_t("test/datasets/ljournal-2008.mtx", 0.0, true),
+                    Pagerank_Testparams_t("test/datasets/webbase-1M.mtx", 0.0, true)));
+
+// FIXME: Enable proper RMM configuration by using CUGRAPH_TEST_PROGRAM_MAIN().
+//        Currently seeing a RMM failure during init, need to investigate.
+// CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/pagerank/pagerank_test.cpp b/cpp/tests/pagerank/pagerank_test.cpp
deleted file mode 100644
index 48705f7f324..00000000000
--- a/cpp/tests/pagerank/pagerank_test.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.  All rights reserved.
- *
- * NVIDIA CORPORATION and its licensors retain all intellectual property
- * and proprietary rights in and to this software, related documentation
- * and any modifications thereto.  Any use, reproduction, disclosure or
- * distribution of this software and related documentation without an express
- * license agreement from NVIDIA CORPORATION is strictly prohibited.
- *
- */
-
-// Pagerank solver tests
-// Author: Alex Fender afender@nvidia.com
-
-#include <utilities/high_res_clock.h>
-#include <utilities/base_fixture.hpp>
-#include <utilities/test_utilities.hpp>
-
-#include <algorithms.hpp>
-#include <graph.hpp>
-
-#include <raft/error.hpp>
-#include <raft/handle.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-#include <cuda_profiler_api.h>
-
-#include <cmath>
-
-// do the perf measurements
-// enabled by command line parameter s'--perf'
-static int PERF = 0;
-
-// iterations for perf tests
-// enabled by command line parameter '--perf-iters"
-static int PERF_MULTIPLIER = 5;
-
-typedef struct Pagerank_Usecase_t {
-  std::string matrix_file;
-  std::string result_file;
-  Pagerank_Usecase_t(const std::string& a, const std::string& b)
-  {
-    // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR
-    const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir();
-    if ((a != "") && (a[0] != '/')) {
-      matrix_file = rapidsDatasetRootDir + "/" + a;
-    } else {
-      matrix_file = a;
-    }
-    if ((b != "") && (b[0] != '/')) {
-      result_file = rapidsDatasetRootDir + "/" + b;
-    } else {
-      result_file = b;
-    }
-  }
-  Pagerank_Usecase_t& operator=(const Pagerank_Usecase_t& rhs)
-  {
-    matrix_file = rhs.matrix_file;
-    result_file = rhs.result_file;
-    return *this;
-  }
-} Pagerank_Usecase;
-
-class Tests_Pagerank : public ::testing::TestWithParam<Pagerank_Usecase> {
- public:
-  Tests_Pagerank() {}
-  static void SetupTestCase() {}
-  static void TearDownTestCase()
-  {
-    if (PERF) {
-      for (unsigned int i = 0; i < pagerank_time.size(); ++i) {
-        std::cout << pagerank_time[i] / PERF_MULTIPLIER << std::endl;
-      }
-    }
-  }
-  virtual void SetUp() {}
-  virtual void TearDown() {}
-
-  static std::vector<double> pagerank_time;
-
-  template <typename T>
-  void run_current_test(const Pagerank_Usecase& param)
-  {
-    const ::testing::TestInfo* const test_info =
-      ::testing::UnitTest::GetInstance()->current_test_info();
-    std::stringstream ss;
-    std::string test_id = std::string(test_info->test_case_name()) + std::string(".") +
-                          std::string(test_info->name()) + std::string("_") +
-                          cugraph::test::getFileName(param.matrix_file) + std::string("_") +
-                          ss.str().c_str();
-
-    int m, k, nnz;
-    MM_typecode mc;
-
-    float tol = 1E-5f;
-
-    // Default parameters
-    /*
-    float alpha = 0.85;
-    int max_iter = 500;
-    bool has_guess = false;
-    */
-
-    HighResClock hr_clock;
-    double time_tmp;
-
-    FILE* fpin = fopen(param.matrix_file.c_str(), "r");
-    ASSERT_NE(fpin, nullptr) << "fopen (" << param.matrix_file << ") failure.";
-
-    ASSERT_EQ(cugraph::test::mm_properties<int>(fpin, 1, &mc, &m, &k, &nnz), 0)
-      << "could not read Matrix Market file properties"
-      << "\n";
-    ASSERT_TRUE(mm_is_matrix(mc));
-    ASSERT_TRUE(mm_is_coordinate(mc));
-    ASSERT_FALSE(mm_is_complex(mc));
-    ASSERT_FALSE(mm_is_skew(mc));
-
-    // Allocate memory on host
-    std::vector<int> cooRowInd(nnz), cooColInd(nnz);
-    std::vector<T> cooVal(nnz), pagerank(m);
-
-    // device alloc
-    rmm::device_uvector<T> pagerank_vector(static_cast<size_t>(m), nullptr);
-    T* d_pagerank = pagerank_vector.data();
-
-    // Read
-    ASSERT_EQ((cugraph::test::mm_to_coo<int, T>(
-                fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)),
-              0)
-      << "could not read matrix data"
-      << "\n";
-    ASSERT_EQ(fclose(fpin), 0);
-
-    //  Pagerank runs on CSC, so feed COOtoCSR the row/col backwards.
-    raft::handle_t handle;
-    cugraph::GraphCOOView<int, int, T> G_coo(&cooColInd[0], &cooRowInd[0], &cooVal[0], m, nnz);
-    auto G_unique = cugraph::coo_to_csr(G_coo);
-    cugraph::GraphCSCView<int, int, T> G(G_unique->view().offsets,
-                                         G_unique->view().indices,
-                                         G_unique->view().edge_data,
-                                         G_unique->view().number_of_vertices,
-                                         G_unique->view().number_of_edges);
-
-    cudaDeviceSynchronize();
-    if (PERF) {
-      hr_clock.start();
-      for (int i = 0; i < PERF_MULTIPLIER; ++i) {
-        cugraph::pagerank<int, int, T>(handle, G, d_pagerank);
-        cudaDeviceSynchronize();
-      }
-      hr_clock.stop(&time_tmp);
-      pagerank_time.push_back(time_tmp);
-    } else {
-      cudaProfilerStart();
-      cugraph::pagerank<int, int, T>(handle, G, d_pagerank);
-      cudaProfilerStop();
-      cudaDeviceSynchronize();
-    }
-
-    // Check vs golden data
-    if (param.result_file.length() > 0) {
-      std::vector<T> calculated_res(m);
-
-      CUDA_TRY(cudaMemcpy(&calculated_res[0], d_pagerank, sizeof(T) * m, cudaMemcpyDeviceToHost));
-      std::sort(calculated_res.begin(), calculated_res.end());
-      fpin = fopen(param.result_file.c_str(), "rb");
-      ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file
-                                << std::endl;
-      std::vector<T> expected_res(m);
-      ASSERT_EQ(cugraph::test::read_binary_vector(fpin, m, expected_res), 0);
-      fclose(fpin);
-      T err;
-      int n_err = 0;
-      for (int i = 0; i < m; i++) {
-        err = fabs(expected_res[i] - calculated_res[i]);
-        if (err > tol * 1.1) {
-          n_err++;  // count the number of mismatches
-        }
-      }
-      if (n_err) {
-        EXPECT_LE(n_err, 0.001 * m);  // we tolerate 0.1% of values with a litte difference
-      }
-    }
-  }
-};
-
-std::vector<double> Tests_Pagerank::pagerank_time;
-
-TEST_P(Tests_Pagerank, CheckFP32_T) { run_current_test<float>(GetParam()); }
-
-TEST_P(Tests_Pagerank, CheckFP64_T) { run_current_test<double>(GetParam()); }
-
-// --gtest_filter=*simple_test*
-INSTANTIATE_TEST_CASE_P(
-  simple_test,
-  Tests_Pagerank,
-  ::testing::Values(Pagerank_Usecase("test/datasets/karate.mtx", ""),
-                    Pagerank_Usecase("test/datasets/web-Google.mtx",
-                                     "test/ref/pagerank/web-Google.pagerank_val_0.85.bin"),
-                    Pagerank_Usecase("test/datasets/ljournal-2008.mtx",
-                                     "test/ref/pagerank/ljournal-2008.pagerank_val_0.85.bin"),
-                    Pagerank_Usecase("test/datasets/webbase-1M.mtx",
-                                     "test/ref/pagerank/webbase-1M.pagerank_val_0.85.bin")));
-
-CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/traversal/tsp_test.cu b/cpp/tests/traversal/tsp_test.cu
new file mode 100644
index 00000000000..383427a56cf
--- /dev/null
+++ b/cpp/tests/traversal/tsp_test.cu
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+// TSP solver tests
+// Author: Hugo Linsenmaier hlinsenmaier@nvidia.com
+
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+// TSP solver tests
+// Author: Hugo Linsenmaier hlinsenmaier@nvidia.com
+
+#include <utilities/high_res_clock.h>
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <algorithms.hpp>
+#include <graph.hpp>
+
+#include <cuda_profiler_api.h>
+
+#include <raft/error.hpp>
+#include <raft/handle.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <fstream>
+#include <set>
+#include <vector>
+
+typedef struct Tsp_Usecase_t {
+  std::string tsp_file;
+  float ref_cost;
+  Tsp_Usecase_t(const std::string& a, const float c)
+  {
+    // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR
+    const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir();
+    if ((a != "") && (a[0] != '/')) {
+      tsp_file = rapidsDatasetRootDir + "/" + a;
+    } else {
+      tsp_file = a;
+    }
+    ref_cost = c;
+  }
+  Tsp_Usecase_t& operator=(const Tsp_Usecase_t& rhs)
+  {
+    tsp_file = rhs.tsp_file;
+    ref_cost = rhs.ref_cost;
+    return *this;
+  }
+} Tsp_Usecase;
+
+static std::vector<Tsp_Usecase_t> euc_2d{
+  {"tsplib/datasets/a280.tsp", 2579},      {"tsplib/datasets/berlin52.tsp", 7542},
+  {"tsplib/datasets/bier127.tsp", 118282}, {"tsplib/datasets/ch130.tsp", 6110},
+  {"tsplib/datasets/ch150.tsp", 6528},     {"tsplib/datasets/d1291.tsp", 50801},
+  {"tsplib/datasets/d1655.tsp", 62128},    {"tsplib/datasets/d198.tsp", 15780},
+  {"tsplib/datasets/d2103.tsp", 80450},    {"tsplib/datasets/d493.tsp", 35002},
+  {"tsplib/datasets/d657.tsp", 48912},     {"tsplib/datasets/eil101.tsp", 629},
+  {"tsplib/datasets/eil51.tsp", 426},      {"tsplib/datasets/eil76.tsp", 538},
+  {"tsplib/datasets/fl1400.tsp", 20127},   {"tsplib/datasets/fl1577.tsp", 22249},
+  {"tsplib/datasets/fl417.tsp", 11861},    {"tsplib/datasets/gil262.tsp", 2378},
+  {"tsplib/datasets/kroA100.tsp", 21282},  {"tsplib/datasets/kroA150.tsp", 26524},
+  {"tsplib/datasets/kroA200.tsp", 29368},  {"tsplib/datasets/kroB100.tsp", 22141},
+  {"tsplib/datasets/kroB150.tsp", 26130},  {"tsplib/datasets/kroB200.tsp", 29437},
+  {"tsplib/datasets/kroC100.tsp", 20749},  {"tsplib/datasets/kroD100.tsp", 21294},
+  {"tsplib/datasets/kroE100.tsp", 22068},  {"tsplib/datasets/lin105.tsp", 14379},
+  {"tsplib/datasets/lin318.tsp", 42029},   {"tsplib/datasets/nrw1379.tsp", 56638},
+  {"tsplib/datasets/p654.tsp", 34643},     {"tsplib/datasets/pcb1173.tsp", 56892},
+  {"tsplib/datasets/pcb442.tsp", 50778},   {"tsplib/datasets/pr1002.tsp", 259045},
+  {"tsplib/datasets/pr107.tsp", 44303},    {"tsplib/datasets/pr136.tsp", 96772},
+  {"tsplib/datasets/pr144.tsp", 58537},    {"tsplib/datasets/pr152.tsp", 73682},
+  {"tsplib/datasets/pr226.tsp", 80369},    {"tsplib/datasets/pr264.tsp", 49135},
+  {"tsplib/datasets/pr299.tsp", 48191},    {"tsplib/datasets/pr439.tsp", 107217},
+  {"tsplib/datasets/pr76.tsp", 108159},    {"tsplib/datasets/rat195.tsp", 2323},
+  {"tsplib/datasets/rat575.tsp", 6773},    {"tsplib/datasets/rat783.tsp", 8806},
+  {"tsplib/datasets/rat99.tsp", 1211},     {"tsplib/datasets/rd100.tsp", 7910},
+  {"tsplib/datasets/rd400.tsp", 15281},    {"tsplib/datasets/rl1323.tsp", 270199},
+  {"tsplib/datasets/st70.tsp", 675},       {"tsplib/datasets/ts225.tsp", 126643},
+  {"tsplib/datasets/tsp225.tsp", 3916},    {"tsplib/datasets/u1060.tsp", 224094},
+  {"tsplib/datasets/u1432.tsp", 152970},   {"tsplib/datasets/u159.tsp", 42080},
+  {"tsplib/datasets/u574.tsp", 36905},     {"tsplib/datasets/u724.tsp", 41910},
+  {"tsplib/datasets/vm1084.tsp", 239297},
+};
+
+struct Route {
+  std::vector<int> cities;
+  std::vector<float> x_pos;
+  std::vector<float> y_pos;
+};
+
+class Tests_Tsp : public ::testing::TestWithParam<Tsp_Usecase> {
+ public:
+  Tests_Tsp() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  void run_current_test(const Tsp_Usecase& param)
+  {
+    const ::testing::TestInfo* const test_info =
+      ::testing::UnitTest::GetInstance()->current_test_info();
+    std::stringstream ss;
+    std::string test_id = std::string(test_info->test_case_name()) + std::string(".") +
+                          std::string(test_info->name()) + std::string("_") +
+                          cugraph::test::getFileName(param.tsp_file) + std::string("_") +
+                          ss.str().c_str();
+
+    float tol = 1E-1f;
+    HighResClock hr_clock;
+    double time_tmp;
+    Route input;
+
+    std::cout << "File: " << param.tsp_file.c_str() << "\n";
+    int nodes = load_tsp(param.tsp_file.c_str(), &input);
+
+    // Device alloc
+    raft::handle_t handle;
+    rmm::device_uvector<int> vertices(static_cast<size_t>(nodes), nullptr);
+    rmm::device_uvector<int> route(static_cast<size_t>(nodes), nullptr);
+    rmm::device_uvector<float> x_pos(static_cast<size_t>(nodes), nullptr);
+    rmm::device_uvector<float> y_pos(static_cast<size_t>(nodes), nullptr);
+
+    int* vtx_ptr   = vertices.data();
+    int* d_route   = route.data();
+    float* d_x_pos = x_pos.data();
+    float* d_y_pos = y_pos.data();
+
+    CUDA_TRY(cudaMemcpy(vtx_ptr, input.cities.data(), sizeof(int) * nodes, cudaMemcpyHostToDevice));
+    CUDA_TRY(
+      cudaMemcpy(d_x_pos, input.x_pos.data(), sizeof(float) * nodes, cudaMemcpyHostToDevice));
+    CUDA_TRY(
+      cudaMemcpy(d_y_pos, input.y_pos.data(), sizeof(float) * nodes, cudaMemcpyHostToDevice));
+
+    // Default parameters
+    int restarts     = 4096;
+    bool beam_search = true;
+    int k            = 4;
+    int nstart       = 0;
+    bool verbose     = false;
+
+    hr_clock.start();
+    cudaDeviceSynchronize();
+    cudaProfilerStart();
+
+    float final_cost = cugraph::traveling_salesperson(
+      handle, vtx_ptr, d_x_pos, d_y_pos, nodes, restarts, beam_search, k, nstart, verbose, d_route);
+    cudaProfilerStop();
+    cudaDeviceSynchronize();
+    hr_clock.stop(&time_tmp);
+
+    std::vector<int> h_route;
+    h_route.resize(nodes);
+    std::vector<int> h_vertices;
+    h_vertices.resize(nodes);
+    CUDA_TRY(cudaMemcpy(h_route.data(), d_route, sizeof(int) * nodes, cudaMemcpyDeviceToHost));
+    cudaDeviceSynchronize();
+    CUDA_TRY(cudaMemcpy(h_vertices.data(), vtx_ptr, sizeof(int) * nodes, cudaMemcpyDeviceToHost));
+    cudaDeviceSynchronize();
+
+    std::cout << "tsp_time: " << time_tmp << " us" << std::endl;
+    std::cout << "Ref cost is: " << param.ref_cost << "\n";
+    std::cout << "Final cost is: " << final_cost << "\n";
+    float err = fabs(final_cost - param.ref_cost);
+    err /= param.ref_cost;
+    std::cout << "Approximation error is: " << err * 100 << "%\n";
+    EXPECT_LE(err, tol);
+
+    // Check route goes through each vertex once
+    size_t u_nodes = nodes;
+    std::set<int> node_set(h_route.begin(), h_route.end());
+    ASSERT_EQ(node_set.size(), u_nodes);
+
+    // Bound check
+    int max = *std::max_element(h_vertices.begin(), h_vertices.end());
+    int min = *std::min_element(h_vertices.begin(), h_vertices.end());
+    EXPECT_GE(*node_set.begin(), min);
+    EXPECT_LE(*node_set.rbegin(), max);
+  }
+
+ private:
+  std::vector<std::string> split(const std::string& s, char delimiter)
+  {
+    std::vector<std::string> tokens;
+    std::string token;
+    std::istringstream tokenStream(s);
+    while (std::getline(tokenStream, token, delimiter)) {
+      if (token.size() == 0) continue;
+      tokens.push_back(token);
+    }
+    return tokens;
+  }
+
+  // FIXME: At the moment TSP does not accept a graph_t as input and therefore
+  // deviates from the standard testing I/O pattern. Once other input types
+  // are supported we want to reconcile TSP testing with the rest of cugraph.
+  int load_tsp(const char* fname, Route* input)
+  {
+    std::fstream fs;
+    fs.open(fname);
+    std::string line;
+    std::vector<std::string> tokens;
+    int nodes = 0;
+    while (std::getline(fs, line) && line.find(':') != std::string::npos) {
+      tokens           = split(line, ':');
+      auto strip_token = split(tokens[0], ' ')[0];
+      if (strip_token == "DIMENSION") nodes = std::stof(tokens[1]);
+    }
+
+    while (std::getline(fs, line) && line.find(' ') != std::string::npos) {
+      tokens       = split(line, ' ');
+      auto city_id = std::stof(tokens[0]);
+      auto x       = std::stof(tokens[1]);
+      auto y       = std::stof(tokens[2]);
+      input->cities.push_back(city_id);
+      input->x_pos.push_back(x);
+      input->y_pos.push_back(y);
+    }
+    fs.close();
+    assert(nodes == input->cities.size());
+    return nodes;
+  }
+};
+
+TEST_P(Tests_Tsp, CheckFP32_T) { run_current_test(GetParam()); }
+
+INSTANTIATE_TEST_CASE_P(simple_test, Tests_Tsp, ::testing::ValuesIn(euc_2d));
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp
index 535b4b9c79e..3525db73425 100644
--- a/cpp/tests/utilities/base_fixture.hpp
+++ b/cpp/tests/utilities/base_fixture.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,12 @@
 namespace cugraph {
 namespace test {
 
+// FIXME: The BaseFixture class is not used in any tests. This file is only
+// needed for the CUGRAPH_TEST_PROGRAM_MAIN macro and the code that it calls, so
+// consider removing the BaseFixture class and renaming this file, or moving
+// CUGRAPH_TEST_PROGRAM_MAIN to the test_utilities.hpp file and removing this
+// file completely.
+
 /**
  * @brief Base test fixture class from which all libcudf tests should inherit.
  *
diff --git a/cpp/tests/utilities/mg_test_utilities.cu b/cpp/tests/utilities/mg_test_utilities.cu
new file mode 100644
index 00000000000..26f2450b589
--- /dev/null
+++ b/cpp/tests/utilities/mg_test_utilities.cu
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/mg_test_utilities.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <experimental/detail/graph_utils.cuh>
+#include <experimental/graph.hpp>
+#include <experimental/graph_functions.hpp>
+#include <utilities/shuffle_comm.cuh>
+
+namespace cugraph {
+namespace test {
+
+// Given a raft handle and a path to a dataset (must be a .mtx file), returns a
+// tuple containing:
+//  * graph_t instance for the partition accesible from the raft handle
+//  * vector of indices representing the original unrenumberd vertices
+//
+// This function creates a graph_t instance appropriate for MG graph
+// applications from the edgelist graph data file passed in by filtering out the
+// vertices not to be assigned to the GPU in this rank, then renumbering the
+// vertices appropriately. The returned vector of vertices contains the original
+// vertex IDs, ordered by the new sequential renumbered IDs (this is needed for
+// unrenumbering).
+template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+std::tuple<
+  std::unique_ptr<cugraph::experimental::
+                    graph_t<vertex_t, edge_t, weight_t, store_transposed, true>>,  // multi_gpu=true
+  rmm::device_uvector<vertex_t>>
+create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path)
+{
+  const auto& comm = handle.get_comms();
+  auto& row_comm   = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name());
+  auto& col_comm   = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name());
+
+  int my_rank = comm.get_rank();
+
+  auto edgelist_from_mm =
+    ::cugraph::test::read_edgelist_from_matrix_market_file<vertex_t, edge_t, weight_t>(
+      graph_file_path);
+
+  edge_t total_number_edges = static_cast<edge_t>(edgelist_from_mm.h_rows.size());
+
+  //////////
+  // Copy COO to device
+  rmm::device_uvector<vertex_t> d_edgelist_rows(total_number_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_edgelist_cols(total_number_edges, handle.get_stream());
+  rmm::device_uvector<weight_t> d_edgelist_weights(total_number_edges, handle.get_stream());
+
+  raft::update_device(d_edgelist_rows.data(),
+                      edgelist_from_mm.h_rows.data(),
+                      total_number_edges,
+                      handle.get_stream());
+  raft::update_device(d_edgelist_cols.data(),
+                      edgelist_from_mm.h_cols.data(),
+                      total_number_edges,
+                      handle.get_stream());
+  raft::update_device(d_edgelist_weights.data(),
+                      edgelist_from_mm.h_weights.data(),
+                      total_number_edges,
+                      handle.get_stream());
+
+  //////////
+  // Filter out edges that are not to be associated with this rank
+  //
+  // Create a edge_gpu_identifier, which will be used by the individual jobs to
+  // identify if a edge belongs to a particular rank
+  cugraph::experimental::detail::compute_gpu_id_from_edge_t<vertex_t> edge_gpu_identifier{
+    false, comm.get_size(), row_comm.get_size(), col_comm.get_size()};
+
+  auto edgelist_zip_it_begin = thrust::make_zip_iterator(thrust::make_tuple(
+    d_edgelist_rows.begin(), d_edgelist_cols.begin(), d_edgelist_weights.begin()));
+  bool is_transposed{store_transposed};
+
+  // Do the removal - note: remove_if does not delete items, it moves "removed"
+  // items to the back of the vector and returns the iterator (new_end) that
+  // represents the items kept. Actual removal of items can be done by
+  // resizing (see below).
+  auto new_end = thrust::remove_if(
+    rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    edgelist_zip_it_begin,
+    edgelist_zip_it_begin + total_number_edges,
+    [my_rank, is_transposed, edge_gpu_identifier] __device__(auto tup) {
+      if (is_transposed) {
+        return (edge_gpu_identifier(thrust::get<1>(tup), thrust::get<0>(tup)) != my_rank);
+      } else {
+        return (edge_gpu_identifier(thrust::get<0>(tup), thrust::get<1>(tup)) != my_rank);
+      }
+    });
+
+  edge_t local_number_edges = thrust::distance(edgelist_zip_it_begin, new_end);
+  // Free the memory used for the items remove_if "removed". This not only
+  // frees memory, but keeps the actual vector sizes consistent with the data
+  // being used from this point forward.
+  d_edgelist_rows.resize(local_number_edges, handle.get_stream());
+  d_edgelist_rows.shrink_to_fit(handle.get_stream());
+  d_edgelist_cols.resize(local_number_edges, handle.get_stream());
+  d_edgelist_cols.shrink_to_fit(handle.get_stream());
+  d_edgelist_weights.resize(local_number_edges, handle.get_stream());
+  d_edgelist_weights.shrink_to_fit(handle.get_stream());
+
+  //////////
+  // renumber filtered edgelist_from_mm
+  vertex_t* major_vertices{nullptr};
+  vertex_t* minor_vertices{nullptr};
+  if (is_transposed) {
+    major_vertices = d_edgelist_cols.data();
+    minor_vertices = d_edgelist_rows.data();
+  } else {
+    major_vertices = d_edgelist_rows.data();
+    minor_vertices = d_edgelist_cols.data();
+  }
+
+  rmm::device_uvector<vertex_t> renumber_map_labels(0, handle.get_stream());
+  cugraph::experimental::partition_t<vertex_t> partition(
+    std::vector<vertex_t>(comm.get_size() + 1, 0),
+    false,  // is_hypergraph_partitioned()
+    row_comm.get_size(),
+    col_comm.get_size(),
+    row_comm.get_rank(),
+    col_comm.get_rank());
+  vertex_t number_of_vertices{};
+  edge_t number_of_edges{};
+  std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges) =
+    ::cugraph::experimental::renumber_edgelist<vertex_t, edge_t, true>  // multi_gpu=true
+    (handle,
+     major_vertices,  // edgelist_major_vertices, INOUT of vertex_t*
+     minor_vertices,  // edgelist_minor_vertices, INOUT of vertex_t*
+     local_number_edges,
+     false,  // is_hypergraph_partitioned
+     true);  // do_expensive_check
+
+  cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{
+    d_edgelist_rows.data(), d_edgelist_cols.data(), d_edgelist_weights.data(), local_number_edges};
+
+  std::vector<cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t>> edgelist_vect;
+  edgelist_vect.push_back(edgelist);
+  cugraph::experimental::graph_properties_t properties;
+  properties.is_symmetric  = edgelist_from_mm.is_symmetric;
+  properties.is_multigraph = false;
+
+  // Finally, create instance of graph_t using filtered & renumbered edgelist
+  return std::make_tuple(
+    std::make_unique<
+      cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, true>>(
+      handle,
+      edgelist_vect,
+      partition,
+      number_of_vertices,
+      total_number_edges,
+      properties,
+      false,  // sorted_by_global_degree_within_vertex_partition
+      true),  // do_expensive_check
+    std::move(renumber_map_labels));
+}
+
+// explicit instantiation
+template std::tuple<
+  std::unique_ptr<
+    cugraph::experimental::graph_t<int32_t, int32_t, float, true, true>>,  // store_transposed=true
+                                                                           // multi_gpu=true
+  rmm::device_uvector<int32_t>>
+create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path);
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/utilities/mg_test_utilities.hpp b/cpp/tests/utilities/mg_test_utilities.hpp
new file mode 100644
index 00000000000..c23f6c43a6d
--- /dev/null
+++ b/cpp/tests/utilities/mg_test_utilities.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <experimental/graph.hpp>
+
+#include <raft/comms/mpi_comms.hpp>
+
+#include <gtest/gtest.h>
+
+namespace cugraph {
+namespace test {
+
+// Given a raft handle and a path to a dataset (must be a .mtx file), returns a
+// tuple containing:
+//  * graph_t instance for the partition accesible from the raft handle
+//  * 4-tuple containing renumber info resulting from renumbering the
+//    edgelist for the partition
+template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+std::tuple<
+  std::unique_ptr<cugraph::experimental::
+                    graph_t<vertex_t, edge_t, weight_t, store_transposed, true>>,  // multi_gpu=true
+  rmm::device_uvector<vertex_t>>
+create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path);
+
+/**
+ * @brief Base test fixture class, responsible for handling common operations
+ * needed by all MG tests.
+ *
+ * It's expected this class will be built out and refactored often as new MG C++
+ * tests are added and new patterns evolve.
+ *
+ * Example:
+ * ```
+ * class MyTestFixture : public cugraph::test::MG_TestFixture_t<My_Testparams_t> {};
+ * ```
+ **/
+
+// FIXME: consider moving this to a separate file? (eg. mg_test_fixture.cpp)?
+
+class MG_TestFixture_t : public ::testing::Test {
+ public:
+  static void SetUpTestCase()
+  {
+    MPI_TRY(MPI_Init(NULL, NULL));
+
+    int rank, size;
+    MPI_TRY(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
+    MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &size));
+
+    int nGpus;
+    CUDA_CHECK(cudaGetDeviceCount(&nGpus));
+
+    ASSERT(
+      nGpus >= size, "Number of GPUs are lesser than MPI ranks! ngpus=%d, nranks=%d", nGpus, size);
+
+    CUDA_CHECK(cudaSetDevice(rank));
+  }
+
+  static void TearDownTestCase() { MPI_TRY(MPI_Finalize()); }
+};
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/utilities/test_utilities.cpp b/cpp/tests/utilities/test_utilities.cpp
new file mode 100644
index 00000000000..abb416a632d
--- /dev/null
+++ b/cpp/tests/utilities/test_utilities.cpp
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <utilities/test_utilities.hpp>
+
+#include <experimental/graph.hpp>
+#include <functions.hpp>
+#include <utilities/error.hpp>
+
+#include <gtest/gtest.h>
+
+extern "C" {
+#include "mmio.h"
+}
+
+#include <gtest/gtest.h>
+
+#include <cfloat>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+namespace cugraph {
+namespace test {
+
+std::string getFileName(const std::string& s)
+{
+  char sep = '/';
+#ifdef _WIN32
+  sep = '\\';
+#endif
+  size_t i = s.rfind(sep, s.length());
+  if (i != std::string::npos) { return (s.substr(i + 1, s.length() - i)); }
+  return ("");
+}
+
+/// Read matrix properties from Matrix Market file
+/** Matrix Market file is assumed to be a sparse matrix in coordinate
+ *  format.
+ *
+ *  @param f File stream for Matrix Market file.
+ *  @param tg Boolean indicating whether to convert matrix to general
+ *  format (from symmetric, Hermitian, or skew symmetric format).
+ *  @param t (Output) MM_typecode with matrix properties.
+ *  @param m (Output) Number of matrix rows.
+ *  @param n (Output) Number of matrix columns.
+ *  @param nnz (Output) Number of non-zero matrix entries.
+ *  @return Zero if properties were read successfully. Otherwise
+ *  non-zero.
+ */
+template <typename IndexType_>
+int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz)
+{
+  // Read matrix properties from file
+  int mint, nint, nnzint;
+  if (fseek(f, 0, SEEK_SET)) {
+    fprintf(stderr, "Error: could not set position in file\n");
+    return -1;
+  }
+  if (mm_read_banner(f, t)) {
+    fprintf(stderr, "Error: could not read Matrix Market file banner\n");
+    return -1;
+  }
+  if (!mm_is_matrix(*t) || !mm_is_coordinate(*t)) {
+    fprintf(stderr, "Error: file does not contain matrix in coordinate format\n");
+    return -1;
+  }
+  if (mm_read_mtx_crd_size(f, &mint, &nint, &nnzint)) {
+    fprintf(stderr, "Error: could not read matrix dimensions\n");
+    return -1;
+  }
+  if (!mm_is_pattern(*t) && !mm_is_real(*t) && !mm_is_integer(*t) && !mm_is_complex(*t)) {
+    fprintf(stderr, "Error: matrix entries are not valid type\n");
+    return -1;
+  }
+  *m   = mint;
+  *n   = nint;
+  *nnz = nnzint;
+
+  // Find total number of non-zero entries
+  if (tg && !mm_is_general(*t)) {
+    // Non-diagonal entries should be counted twice
+    *nnz *= 2;
+
+    // Diagonal entries should not be double-counted
+    int st;
+    for (int i = 0; i < nnzint; ++i) {
+      // Read matrix entry
+      // MTX only supports int for row and col idx
+      int row, col;
+      double rval, ival;
+      if (mm_is_pattern(*t))
+        st = fscanf(f, "%d %d\n", &row, &col);
+      else if (mm_is_real(*t) || mm_is_integer(*t))
+        st = fscanf(f, "%d %d %lg\n", &row, &col, &rval);
+      else  // Complex matrix
+        st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival);
+      if (ferror(f) || (st == EOF)) {
+        fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1);
+        return -1;
+      }
+
+      // Check if entry is diagonal
+      if (row == col) --(*nnz);
+    }
+  }
+
+  return 0;
+}
+
+/// Read Matrix Market file and convert to COO format matrix
+/** Matrix Market file is assumed to be a sparse matrix in coordinate
+ *  format.
+ *
+ *  @param f File stream for Matrix Market file.
+ *  @param tg Boolean indicating whether to convert matrix to general
+ *  format (from symmetric, Hermitian, or skew symmetric format).
+ *  @param nnz Number of non-zero matrix entries.
+ *  @param cooRowInd (Output) Row indices for COO matrix. Should have
+ *  at least nnz entries.
+ *  @param cooColInd (Output) Column indices for COO matrix. Should
+ *  have at least nnz entries.
+ *  @param cooRVal (Output) Real component of COO matrix
+ *  entries. Should have at least nnz entries. Ignored if null
+ *  pointer.
+ *  @param cooIVal (Output) Imaginary component of COO matrix
+ *  entries. Should have at least nnz entries. Ignored if null
+ *  pointer.
+ *  @return Zero if matrix was read successfully. Otherwise non-zero.
+ */
+template <typename IndexType_, typename ValueType_>
+int mm_to_coo(FILE* f,
+              int tg,
+              IndexType_ nnz,
+              IndexType_* cooRowInd,
+              IndexType_* cooColInd,
+              ValueType_* cooRVal,
+              ValueType_* cooIVal)
+{
+  // Read matrix properties from file
+  MM_typecode t;
+  int m, n, nnzOld;
+  if (fseek(f, 0, SEEK_SET)) {
+    fprintf(stderr, "Error: could not set position in file\n");
+    return -1;
+  }
+  if (mm_read_banner(f, &t)) {
+    fprintf(stderr, "Error: could not read Matrix Market file banner\n");
+    return -1;
+  }
+  if (!mm_is_matrix(t) || !mm_is_coordinate(t)) {
+    fprintf(stderr, "Error: file does not contain matrix in coordinate format\n");
+    return -1;
+  }
+  if (mm_read_mtx_crd_size(f, &m, &n, &nnzOld)) {
+    fprintf(stderr, "Error: could not read matrix dimensions\n");
+    return -1;
+  }
+  if (!mm_is_pattern(t) && !mm_is_real(t) && !mm_is_integer(t) && !mm_is_complex(t)) {
+    fprintf(stderr, "Error: matrix entries are not valid type\n");
+    return -1;
+  }
+
+  // Add each matrix entry in file to COO format matrix
+  int i;      // Entry index in Matrix Market file; can only be int in the MTX format
+  int j = 0;  // Entry index in COO format matrix; can only be int in the MTX format
+  for (i = 0; i < nnzOld; ++i) {
+    // Read entry from file
+    int row, col;
+    double rval, ival;
+    int st;
+    if (mm_is_pattern(t)) {
+      st   = fscanf(f, "%d %d\n", &row, &col);
+      rval = 1.0;
+      ival = 0.0;
+    } else if (mm_is_real(t) || mm_is_integer(t)) {
+      st   = fscanf(f, "%d %d %lg\n", &row, &col, &rval);
+      ival = 0.0;
+    } else  // Complex matrix
+      st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival);
+    if (ferror(f) || (st == EOF)) {
+      fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1);
+      return -1;
+    }
+
+    // Switch to 0-based indexing
+    --row;
+    --col;
+
+    // Record entry
+    cooRowInd[j] = row;
+    cooColInd[j] = col;
+    if (cooRVal != NULL) cooRVal[j] = rval;
+    if (cooIVal != NULL) cooIVal[j] = ival;
+    ++j;
+
+    // Add symmetric complement of non-diagonal entries
+    if (tg && !mm_is_general(t) && (row != col)) {
+      // Modify entry value if matrix is skew symmetric or Hermitian
+      if (mm_is_skew(t)) {
+        rval = -rval;
+        ival = -ival;
+      } else if (mm_is_hermitian(t)) {
+        ival = -ival;
+      }
+
+      // Record entry
+      cooRowInd[j] = col;
+      cooColInd[j] = row;
+      if (cooRVal != NULL) cooRVal[j] = rval;
+      if (cooIVal != NULL) cooIVal[j] = ival;
+      ++j;
+    }
+  }
+  return 0;
+}
+
+int read_binary_vector(FILE* fpin, int n, std::vector<float>& val)
+{
+  size_t is_read1;
+
+  double* t_storage = new double[n];
+  is_read1          = fread(t_storage, sizeof(double), n, fpin);
+  for (int i = 0; i < n; i++) {
+    if (t_storage[i] == DBL_MAX)
+      val[i] = FLT_MAX;
+    else if (t_storage[i] == -DBL_MAX)
+      val[i] = -FLT_MAX;
+    else
+      val[i] = static_cast<float>(t_storage[i]);
+  }
+  delete[] t_storage;
+
+  if (is_read1 != (size_t)n) {
+    printf("%s", "I/O fail\n");
+    return 1;
+  }
+  return 0;
+}
+
+int read_binary_vector(FILE* fpin, int n, std::vector<double>& val)
+{
+  size_t is_read1;
+
+  is_read1 = fread(&val[0], sizeof(double), n, fpin);
+
+  if (is_read1 != (size_t)n) {
+    printf("%s", "I/O fail\n");
+    return 1;
+  }
+  return 0;
+}
+
+// FIXME: A similar function could be useful for CSC format
+//        There are functions above that operate coo -> csr and coo->csc
+/**
+ * @tparam
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::unique_ptr<cugraph::GraphCSR<vertex_t, edge_t, weight_t>> generate_graph_csr_from_mm(
+  bool& directed, std::string mm_file)
+{
+  vertex_t number_of_vertices;
+  edge_t number_of_edges;
+
+  FILE* fpin = fopen(mm_file.c_str(), "r");
+  EXPECT_NE(fpin, nullptr);
+
+  vertex_t number_of_columns = 0;
+  MM_typecode mm_typecode{0};
+  EXPECT_EQ(mm_properties<vertex_t>(
+              fpin, 1, &mm_typecode, &number_of_vertices, &number_of_columns, &number_of_edges),
+            0);
+  EXPECT_TRUE(mm_is_matrix(mm_typecode));
+  EXPECT_TRUE(mm_is_coordinate(mm_typecode));
+  EXPECT_FALSE(mm_is_complex(mm_typecode));
+  EXPECT_FALSE(mm_is_skew(mm_typecode));
+
+  directed = !mm_is_symmetric(mm_typecode);
+
+  // Allocate memory on host
+  std::vector<vertex_t> coo_row_ind(number_of_edges);
+  std::vector<vertex_t> coo_col_ind(number_of_edges);
+  std::vector<weight_t> coo_val(number_of_edges);
+
+  // Read
+  EXPECT_EQ((mm_to_coo<vertex_t, weight_t>(
+              fpin, 1, number_of_edges, &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], NULL)),
+            0);
+  EXPECT_EQ(fclose(fpin), 0);
+
+  cugraph::GraphCOOView<vertex_t, edge_t, weight_t> cooview(
+    &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], number_of_vertices, number_of_edges);
+
+  return cugraph::coo_to_csr(cooview);
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+edgelist_from_market_matrix_file_t<vertex_t, weight_t> read_edgelist_from_matrix_market_file(
+  std::string const& graph_file_full_path)
+{
+  edgelist_from_market_matrix_file_t<vertex_t, weight_t> ret{};
+
+  MM_typecode mc{};
+  vertex_t m{};
+  edge_t nnz{};
+
+  FILE* file = fopen(graph_file_full_path.c_str(), "r");
+  CUGRAPH_EXPECTS(file != nullptr, "fopen failure.");
+
+  edge_t tmp_m{};
+  edge_t tmp_k{};
+  auto mm_ret = cugraph::test::mm_properties<edge_t>(file, 1, &mc, &tmp_m, &tmp_k, &nnz);
+  CUGRAPH_EXPECTS(mm_ret == 0, "could not read Matrix Market file properties.");
+  m = static_cast<vertex_t>(tmp_m);
+  CUGRAPH_EXPECTS(mm_is_matrix(mc) && mm_is_coordinate(mc) && !mm_is_complex(mc) && !mm_is_skew(mc),
+                  "invalid Matrix Market file properties.");
+
+  ret.h_rows.assign(nnz, vertex_t{0});
+  ret.h_cols.assign(nnz, vertex_t{0});
+  ret.h_weights.assign(nnz, weight_t{0.0});
+  ret.number_of_vertices = m;
+  ret.is_symmetric       = mm_is_symmetric(mc);
+
+  mm_ret = cugraph::test::mm_to_coo<vertex_t, weight_t>(
+    file, 1, nnz, ret.h_rows.data(), ret.h_cols.data(), ret.h_weights.data(), nullptr);
+  CUGRAPH_EXPECTS(mm_ret == 0, "could not read matrix data");
+
+  auto file_ret = fclose(file);
+  CUGRAPH_EXPECTS(file_ret == 0, "fclose failure.");
+
+  return std::move(ret);
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
+cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>
+read_graph_from_matrix_market_file(raft::handle_t const& handle,
+                                   std::string const& graph_file_full_path,
+                                   bool test_weighted)
+{
+  auto mm_graph =
+    read_edgelist_from_matrix_market_file<vertex_t, edge_t, weight_t>(graph_file_full_path);
+  edge_t number_of_edges = static_cast<edge_t>(mm_graph.h_rows.size());
+
+  rmm::device_uvector<vertex_t> d_edgelist_rows(number_of_edges, handle.get_stream());
+  rmm::device_uvector<vertex_t> d_edgelist_cols(number_of_edges, handle.get_stream());
+  rmm::device_uvector<weight_t> d_edgelist_weights(test_weighted ? number_of_edges : 0,
+                                                   handle.get_stream());
+
+  raft::update_device(
+    d_edgelist_rows.data(), mm_graph.h_rows.data(), number_of_edges, handle.get_stream());
+  raft::update_device(
+    d_edgelist_cols.data(), mm_graph.h_cols.data(), number_of_edges, handle.get_stream());
+  if (test_weighted) {
+    raft::update_device(
+      d_edgelist_weights.data(), mm_graph.h_weights.data(), number_of_edges, handle.get_stream());
+  }
+
+  cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{
+    d_edgelist_rows.data(),
+    d_edgelist_cols.data(),
+    test_weighted ? d_edgelist_weights.data() : nullptr,
+    number_of_edges};
+
+  return cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>(
+    handle,
+    edgelist,
+    mm_graph.number_of_vertices,
+    cugraph::experimental::graph_properties_t{mm_graph.is_symmetric, false},
+    false,
+    true);
+}
+
+// explicit instantiations
+
+template int mm_to_coo(
+  FILE* f, int tg, int nnz, int* cooRowInd, int* cooColInd, int* cooRVal, int* cooIVal);
+
+template int mm_to_coo(
+  FILE* f, int tg, int nnz, int* cooRowInd, int* cooColInd, double* cooRVal, double* cooIVal);
+
+template int mm_to_coo(
+  FILE* f, int tg, int nnz, int* cooRowInd, int* cooColInd, float* cooRVal, float* cooIVal);
+
+template std::unique_ptr<cugraph::GraphCSR<unsigned int, unsigned int, float>>
+generate_graph_csr_from_mm(bool& directed, std::string mm_file);
+
+template std::unique_ptr<cugraph::GraphCSR<int, int, float>> generate_graph_csr_from_mm(
+  bool& directed, std::string mm_file);
+
+template std::unique_ptr<cugraph::GraphCSR<int, int, double>> generate_graph_csr_from_mm(
+  bool& directed, std::string mm_file);
+
+template std::unique_ptr<cugraph::GraphCSR<long, long, float>> generate_graph_csr_from_mm(
+  bool& directed, std::string mm_file);
+
+template cugraph::experimental::graph_t<int, int, float, false, false>
+read_graph_from_matrix_market_file<int, int, float, false>(raft::handle_t const& handle,
+                                                           std::string const& graph_file_full_path,
+                                                           bool test_weighted);
+
+template cugraph::experimental::graph_t<int, int, float, true, false>
+read_graph_from_matrix_market_file<int, int, float, true>(raft::handle_t const& handle,
+                                                          std::string const& graph_file_full_path,
+                                                          bool test_weighted);
+
+template cugraph::experimental::graph_t<int, long, float, false, false>
+read_graph_from_matrix_market_file<int, long, float, false>(raft::handle_t const& handle,
+                                                            std::string const& graph_file_full_path,
+                                                            bool test_weighted);
+
+template cugraph::experimental::graph_t<int, long, double, false, false>
+read_graph_from_matrix_market_file<int, long, double, false>(
+  raft::handle_t const& handle, std::string const& graph_file_full_path, bool test_weighted);
+
+template cugraph::experimental::graph_t<int, int, double, false, false>
+read_graph_from_matrix_market_file<int, int, double, false>(raft::handle_t const& handle,
+                                                            std::string const& graph_file_full_path,
+                                                            bool test_weighted);
+
+template cugraph::experimental::graph_t<long, long, double, false, false>
+read_graph_from_matrix_market_file<long, long, double, false>(
+  raft::handle_t const& handle, std::string const& graph_file_full_path, bool test_weighted);
+
+template cugraph::experimental::graph_t<long, long, float, false, false>
+read_graph_from_matrix_market_file<long, long, float, false>(
+  raft::handle_t const& handle, std::string const& graph_file_full_path, bool test_weighted);
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp
index 518e7c2860e..406f09048e0 100644
--- a/cpp/tests/utilities/test_utilities.hpp
+++ b/cpp/tests/utilities/test_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,37 +16,20 @@
 #pragma once
 
 #include <experimental/graph.hpp>
-#include <functions.hpp>
-#include <utilities/error.hpp>
+#include <graph.hpp>
 
-#include <gtest/gtest.h>
+#include <cstdio>
+#include <string>
+#include <vector>
 
 extern "C" {
 #include "mmio.h"
 }
 
-#include <gtest/gtest.h>
-
-#include <cfloat>
-#include <cstdio>
-#include <string>
-#include <vector>
-
 namespace cugraph {
 namespace test {
 
-std::string getFileName(const std::string& s)
-{
-  char sep = '/';
-
-#ifdef _WIN32
-  sep = '\\';
-#endif
-
-  size_t i = s.rfind(sep, s.length());
-  if (i != std::string::npos) { return (s.substr(i + 1, s.length() - i)); }
-  return ("");
-}
+std::string getFileName(const std::string& s);
 
 /// Read matrix properties from Matrix Market file
 /** Matrix Market file is assumed to be a sparse matrix in coordinate
@@ -63,64 +46,7 @@ std::string getFileName(const std::string& s)
  *  non-zero.
  */
 template <typename IndexType_>
-int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz)
-{
-  // Read matrix properties from file
-  int mint, nint, nnzint;
-  if (fseek(f, 0, SEEK_SET)) {
-    fprintf(stderr, "Error: could not set position in file\n");
-    return -1;
-  }
-  if (mm_read_banner(f, t)) {
-    fprintf(stderr, "Error: could not read Matrix Market file banner\n");
-    return -1;
-  }
-  if (!mm_is_matrix(*t) || !mm_is_coordinate(*t)) {
-    fprintf(stderr, "Error: file does not contain matrix in coordinate format\n");
-    return -1;
-  }
-  if (mm_read_mtx_crd_size(f, &mint, &nint, &nnzint)) {
-    fprintf(stderr, "Error: could not read matrix dimensions\n");
-    return -1;
-  }
-  if (!mm_is_pattern(*t) && !mm_is_real(*t) && !mm_is_integer(*t) && !mm_is_complex(*t)) {
-    fprintf(stderr, "Error: matrix entries are not valid type\n");
-    return -1;
-  }
-  *m   = mint;
-  *n   = nint;
-  *nnz = nnzint;
-
-  // Find total number of non-zero entries
-  if (tg && !mm_is_general(*t)) {
-    // Non-diagonal entries should be counted twice
-    *nnz *= 2;
-
-    // Diagonal entries should not be double-counted
-    int st;
-    for (int i = 0; i < nnzint; ++i) {
-      // Read matrix entry
-      // MTX only supports int for row and col idx
-      int row, col;
-      double rval, ival;
-      if (mm_is_pattern(*t))
-        st = fscanf(f, "%d %d\n", &row, &col);
-      else if (mm_is_real(*t) || mm_is_integer(*t))
-        st = fscanf(f, "%d %d %lg\n", &row, &col, &rval);
-      else  // Complex matrix
-        st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival);
-      if (ferror(f) || (st == EOF)) {
-        fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1);
-        return -1;
-      }
-
-      // Check if entry is diagonal
-      if (row == col) --(*nnz);
-    }
-  }
-
-  return 0;
-}
+int mm_properties(FILE* f, int tg, MM_typecode* t, IndexType_* m, IndexType_* n, IndexType_* nnz);
 
 /// Read Matrix Market file and convert to COO format matrix
 /** Matrix Market file is assumed to be a sparse matrix in coordinate
@@ -149,169 +75,20 @@ int mm_to_coo(FILE* f,
               IndexType_* cooRowInd,
               IndexType_* cooColInd,
               ValueType_* cooRVal,
-              ValueType_* cooIVal)
-{
-  // Read matrix properties from file
-  MM_typecode t;
-  int m, n, nnzOld;
-  if (fseek(f, 0, SEEK_SET)) {
-    fprintf(stderr, "Error: could not set position in file\n");
-    return -1;
-  }
-  if (mm_read_banner(f, &t)) {
-    fprintf(stderr, "Error: could not read Matrix Market file banner\n");
-    return -1;
-  }
-  if (!mm_is_matrix(t) || !mm_is_coordinate(t)) {
-    fprintf(stderr, "Error: file does not contain matrix in coordinate format\n");
-    return -1;
-  }
-  if (mm_read_mtx_crd_size(f, &m, &n, &nnzOld)) {
-    fprintf(stderr, "Error: could not read matrix dimensions\n");
-    return -1;
-  }
-  if (!mm_is_pattern(t) && !mm_is_real(t) && !mm_is_integer(t) && !mm_is_complex(t)) {
-    fprintf(stderr, "Error: matrix entries are not valid type\n");
-    return -1;
-  }
-
-  // Add each matrix entry in file to COO format matrix
-  int i;      // Entry index in Matrix Market file; can only be int in the MTX format
-  int j = 0;  // Entry index in COO format matrix; can only be int in the MTX format
-  for (i = 0; i < nnzOld; ++i) {
-    // Read entry from file
-    int row, col;
-    double rval, ival;
-    int st;
-    if (mm_is_pattern(t)) {
-      st   = fscanf(f, "%d %d\n", &row, &col);
-      rval = 1.0;
-      ival = 0.0;
-    } else if (mm_is_real(t) || mm_is_integer(t)) {
-      st   = fscanf(f, "%d %d %lg\n", &row, &col, &rval);
-      ival = 0.0;
-    } else  // Complex matrix
-      st = fscanf(f, "%d %d %lg %lg\n", &row, &col, &rval, &ival);
-    if (ferror(f) || (st == EOF)) {
-      fprintf(stderr, "Error: error %d reading Matrix Market file (entry %d)\n", st, i + 1);
-      return -1;
-    }
-
-    // Switch to 0-based indexing
-    --row;
-    --col;
-
-    // Record entry
-    cooRowInd[j] = row;
-    cooColInd[j] = col;
-    if (cooRVal != NULL) cooRVal[j] = rval;
-    if (cooIVal != NULL) cooIVal[j] = ival;
-    ++j;
-
-    // Add symmetric complement of non-diagonal entries
-    if (tg && !mm_is_general(t) && (row != col)) {
-      // Modify entry value if matrix is skew symmetric or Hermitian
-      if (mm_is_skew(t)) {
-        rval = -rval;
-        ival = -ival;
-      } else if (mm_is_hermitian(t)) {
-        ival = -ival;
-      }
-
-      // Record entry
-      cooRowInd[j] = col;
-      cooColInd[j] = row;
-      if (cooRVal != NULL) cooRVal[j] = rval;
-      if (cooIVal != NULL) cooIVal[j] = ival;
-      ++j;
-    }
-  }
-  return 0;
-}
+              ValueType_* cooIVal);
 
-int read_binary_vector(FILE* fpin, int n, std::vector<float>& val)
-{
-  size_t is_read1;
-
-  double* t_storage = new double[n];
-  is_read1          = fread(t_storage, sizeof(double), n, fpin);
-  for (int i = 0; i < n; i++) {
-    if (t_storage[i] == DBL_MAX)
-      val[i] = FLT_MAX;
-    else if (t_storage[i] == -DBL_MAX)
-      val[i] = -FLT_MAX;
-    else
-      val[i] = static_cast<float>(t_storage[i]);
-  }
-  delete[] t_storage;
-
-  if (is_read1 != (size_t)n) {
-    printf("%s", "I/O fail\n");
-    return 1;
-  }
-  return 0;
-}
-
-int read_binary_vector(FILE* fpin, int n, std::vector<double>& val)
-{
-  size_t is_read1;
+int read_binary_vector(FILE* fpin, int n, std::vector<float>& val);
 
-  is_read1 = fread(&val[0], sizeof(double), n, fpin);
-
-  if (is_read1 != (size_t)n) {
-    printf("%s", "I/O fail\n");
-    return 1;
-  }
-  return 0;
-}
+int read_binary_vector(FILE* fpin, int n, std::vector<double>& val);
 
 // FIXME: A similar function could be useful for CSC format
 //        There are functions above that operate coo -> csr and coo->csc
 /**
  * @tparam
  */
-template <typename VT, typename ET, typename WT>
-std::unique_ptr<cugraph::GraphCSR<VT, ET, WT>> generate_graph_csr_from_mm(bool& directed,
-                                                                          std::string mm_file)
-{
-  VT number_of_vertices;
-  ET number_of_edges;
-
-  FILE* fpin = fopen(mm_file.c_str(), "r");
-  EXPECT_NE(fpin, nullptr);
-
-  VT number_of_columns = 0;
-  MM_typecode mm_typecode{0};
-  EXPECT_EQ(mm_properties<VT>(
-              fpin, 1, &mm_typecode, &number_of_vertices, &number_of_columns, &number_of_edges),
-            0);
-  EXPECT_TRUE(mm_is_matrix(mm_typecode));
-  EXPECT_TRUE(mm_is_coordinate(mm_typecode));
-  EXPECT_FALSE(mm_is_complex(mm_typecode));
-  EXPECT_FALSE(mm_is_skew(mm_typecode));
-
-  directed = !mm_is_symmetric(mm_typecode);
-
-  // Allocate memory on host
-  std::vector<VT> coo_row_ind(number_of_edges);
-  std::vector<VT> coo_col_ind(number_of_edges);
-  std::vector<WT> coo_val(number_of_edges);
-
-  // Read
-  EXPECT_EQ((mm_to_coo<VT, WT>(
-              fpin, 1, number_of_edges, &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], NULL)),
-            0);
-  EXPECT_EQ(fclose(fpin), 0);
-
-  cugraph::GraphCOOView<VT, ET, WT> cooview(
-    &coo_row_ind[0], &coo_col_ind[0], &coo_val[0], number_of_vertices, number_of_edges);
-
-  return cugraph::coo_to_csr(cooview);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// FIXME: move this code to rapids-core
-////////////////////////////////////////////////////////////////////////////////
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::unique_ptr<cugraph::GraphCSR<vertex_t, edge_t, weight_t>> generate_graph_csr_from_mm(
+  bool& directed, std::string mm_file);
 
 // Define RAPIDS_DATASET_ROOT_DIR using a preprocessor variable to
 // allow for a build to override the default. This is useful for
@@ -342,79 +119,13 @@ struct edgelist_from_market_matrix_file_t {
 
 template <typename vertex_t, typename edge_t, typename weight_t>
 edgelist_from_market_matrix_file_t<vertex_t, weight_t> read_edgelist_from_matrix_market_file(
-  std::string const& graph_file_full_path)
-{
-  edgelist_from_market_matrix_file_t<vertex_t, weight_t> ret{};
-
-  MM_typecode mc{};
-  vertex_t m{};
-  edge_t nnz{};
-
-  FILE* file = fopen(graph_file_full_path.c_str(), "r");
-  CUGRAPH_EXPECTS(file != nullptr, "fopen failure.");
-
-  edge_t tmp_m{};
-  edge_t tmp_k{};
-  auto mm_ret = cugraph::test::mm_properties<edge_t>(file, 1, &mc, &tmp_m, &tmp_k, &nnz);
-  CUGRAPH_EXPECTS(mm_ret == 0, "could not read Matrix Market file properties.");
-  m = static_cast<vertex_t>(tmp_m);
-  CUGRAPH_EXPECTS(mm_is_matrix(mc) && mm_is_coordinate(mc) && !mm_is_complex(mc) && !mm_is_skew(mc),
-                  "invalid Matrix Market file properties.");
-
-  ret.h_rows.assign(nnz, vertex_t{0});
-  ret.h_cols.assign(nnz, vertex_t{0});
-  ret.h_weights.assign(nnz, weight_t{0.0});
-  ret.number_of_vertices = m;
-  ret.is_symmetric       = mm_is_symmetric(mc);
-
-  mm_ret = cugraph::test::mm_to_coo<vertex_t, weight_t>(
-    file, 1, nnz, ret.h_rows.data(), ret.h_cols.data(), ret.h_weights.data(), nullptr);
-  CUGRAPH_EXPECTS(mm_ret == 0, "could not read matrix data");
-
-  auto file_ret = fclose(file);
-  CUGRAPH_EXPECTS(file_ret == 0, "fclose failure.");
-
-  return std::move(ret);
-}
+  std::string const& graph_file_full_path);
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool store_transposed>
 cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>
 read_graph_from_matrix_market_file(raft::handle_t const& handle,
                                    std::string const& graph_file_full_path,
-                                   bool test_weighted)
-{
-  auto mm_graph =
-    read_edgelist_from_matrix_market_file<vertex_t, edge_t, weight_t>(graph_file_full_path);
-  edge_t number_of_edges = static_cast<edge_t>(mm_graph.h_rows.size());
-
-  rmm::device_uvector<vertex_t> d_edgelist_rows(number_of_edges, handle.get_stream());
-  rmm::device_uvector<vertex_t> d_edgelist_cols(number_of_edges, handle.get_stream());
-  rmm::device_uvector<weight_t> d_edgelist_weights(test_weighted ? number_of_edges : 0,
-                                                   handle.get_stream());
-
-  raft::update_device(
-    d_edgelist_rows.data(), mm_graph.h_rows.data(), number_of_edges, handle.get_stream());
-  raft::update_device(
-    d_edgelist_cols.data(), mm_graph.h_cols.data(), number_of_edges, handle.get_stream());
-  if (test_weighted) {
-    raft::update_device(
-      d_edgelist_weights.data(), mm_graph.h_weights.data(), number_of_edges, handle.get_stream());
-  }
-
-  cugraph::experimental::edgelist_t<vertex_t, edge_t, weight_t> edgelist{
-    d_edgelist_rows.data(),
-    d_edgelist_cols.data(),
-    test_weighted ? d_edgelist_weights.data() : nullptr,
-    number_of_edges};
-
-  return cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, store_transposed, false>(
-    handle,
-    edgelist,
-    mm_graph.number_of_vertices,
-    cugraph::experimental::graph_properties_t{mm_graph.is_symmetric, false},
-    false,
-    true);
-}
+                                   bool test_weighted);
 
 }  // namespace test
 }  // namespace cugraph
diff --git a/datasets/eil51.tsp b/datasets/eil51.tsp
new file mode 100644
index 00000000000..543d1013c14
--- /dev/null
+++ b/datasets/eil51.tsp
@@ -0,0 +1,58 @@
+NAME : eil51
+COMMENT : 51-city problem (Christofides/Eilon)
+TYPE : TSP
+DIMENSION : 51
+EDGE_WEIGHT_TYPE : EUC_2D
+NODE_COORD_SECTION
+1 37 52
+2 49 49
+3 52 64
+4 20 26
+5 40 30
+6 21 47
+7 17 63
+8 31 62
+9 52 33
+10 51 21
+11 42 41
+12 31 32
+13 5 25
+14 12 42
+15 36 16
+16 52 41
+17 27 23
+18 17 33
+19 13 13
+20 57 58
+21 62 42
+22 42 57
+23 16 57
+24 8 52
+25 7 38
+26 27 68
+27 30 48
+28 43 67
+29 58 48
+30 58 27
+31 37 69
+32 38 46
+33 46 10
+34 61 33
+35 62 63
+36 63 69
+37 32 22
+38 45 35
+39 59 15
+40 5 6
+41 10 17
+42 21 10
+43 5 64
+44 30 15
+45 39 10
+46 32 39
+47 25 32
+48 25 55
+49 48 28
+50 56 37
+51 30 40
+EOF
diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh
index 071a4b8dea3..3e0b6c55c37 100755
--- a/datasets/get_test_data.sh
+++ b/datasets/get_test_data.sh
@@ -1,3 +1,16 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #!/bin/bash
 set -e
 set -o pipefail
@@ -31,6 +44,10 @@ benchmark
 # ~1s download
 https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_small.tgz
 benchmark
+
+# ~0.6s download
+https://rapidsai-data.s3.us-east-2.amazonaws.com/cugraph/test/tsplib/datasets.tar.gz
+tsplib
 "
 
 EXTENDED_DATASET_DATA="
diff --git a/datasets/gil262.tsp b/datasets/gil262.tsp
new file mode 100755
index 00000000000..cfcb15c3b78
--- /dev/null
+++ b/datasets/gil262.tsp
@@ -0,0 +1,269 @@
+NAME : gil262
+COMMENT : 262-city problem (Gillet/Johnson)
+TYPE : TSP
+DIMENSION : 262
+EDGE_WEIGHT_TYPE : EUC_2D
+NODE_COORD_SECTION
+1 -99 -97
+2 -59 50
+3 0 14
+4 -17 -66
+5 -69 -19
+6 31 12
+7 5 -41
+8 -12 10
+9 -64 70
+10 -12 85
+11 -18 64
+12 -77 -16
+13 -53 88
+14 83 -24
+15 24 41
+16 17 21
+17 42 96
+18 -65 0
+19 -47 -26
+20 85 36
+21 -35 -54
+22 54 -21
+23 64 -17
+24 55 89
+25 17 -25
+26 -61 66
+27 -61 26
+28 17 -72
+29 79 38
+30 -62 -2
+31 -90 -68
+32 52 66
+33 -54 -50
+34 8 -84
+35 37 -90
+36 -83 49
+37 35 -1
+38 7 59
+39 12 48
+40 57 95
+41 92 28
+42 -3 97
+43 -7 52
+44 42 -15
+45 77 -43
+46 59 -49
+47 25 91
+48 69 -19
+49 -82 -14
+50 74 -70
+51 69 59
+52 29 33
+53 -97 9
+54 -58 9
+55 28 93
+56 7 73
+57 -28 73
+58 -76 55
+59 41 42
+60 92 40
+61 -84 -29
+62 -12 42
+63 51 -45
+64 -37 46
+65 -97 35
+66 14 89
+67 60 58
+68 -63 -75
+69 -18 34
+70 -46 -82
+71 -86 -79
+72 -43 -30
+73 -44 7
+74 -3 -20
+75 36 41
+76 -30 -94
+77 79 -62
+78 51 70
+79 -61 -26
+80 6 94
+81 -19 -62
+82 -20 51
+83 -81 37
+84 7 31
+85 52 12
+86 83 -91
+87 -7 -92
+88 82 -74
+89 -70 85
+90 -83 -30
+91 71 -61
+92 85 11
+93 66 -48
+94 78 -87
+95 9 -79
+96 -36 4
+97 66 39
+98 92 -17
+99 -46 -79
+100 -30 -63
+101 -42 63
+102 20 42
+103 15 98
+104 1 -17
+105 64 20
+106 -96 85
+107 93 -29
+108 -40 -84
+109 86 35
+110 91 36
+111 62 -8
+112 -24 4
+113 11 96
+114 -53 62
+115 -28 -71
+116 7 -4
+117 95 -9
+118 -3 17
+119 53 -90
+120 58 -19
+121 -83 84
+122 -1 49
+123 -4 17
+124 -82 -3
+125 -43 47
+126 6 -6
+127 70 99
+128 68 -29
+129 -94 -30
+130 -94 -20
+131 -21 77
+132 64 37
+133 -70 -19
+134 88 65
+135 2 29
+136 33 57
+137 -70 6
+138 -38 -56
+139 -80 -95
+140 -5 -39
+141 8 -22
+142 -61 -76
+143 76 -22
+144 49 -71
+145 -30 -68
+146 1 34
+147 77 79
+148 -58 64
+149 82 -97
+150 -80 55
+151 81 -86
+152 39 -49
+153 -67 72
+154 -25 -89
+155 -44 -95
+156 32 -68
+157 -17 49
+158 93 49
+159 99 81
+160 10 -49
+161 63 -41
+162 38 39
+163 -28 39
+164 -2 -47
+165 38 8
+166 -42 -6
+167 -67 88
+168 19 93
+169 40 27
+170 -61 56
+171 43 33
+172 -18 -39
+173 -69 19
+174 75 -18
+175 31 85
+176 25 58
+177 -16 36
+178 91 15
+179 60 -39
+180 49 -47
+181 42 33
+182 16 -81
+183 -78 53
+184 53 -80
+185 -46 -26
+186 -25 -54
+187 69 -46
+188 0 -78
+189 -84 74
+190 -16 16
+191 -63 -14
+192 51 -77
+193 -39 61
+194 5 97
+195 -55 39
+196 70 -14
+197 0 95
+198 -45 7
+199 38 -24
+200 50 -37
+201 59 71
+202 -73 -96
+203 -29 72
+204 -47 12
+205 -88 -61
+206 -88 36
+207 -46 -3
+208 26 -37
+209 -39 -67
+210 92 27
+211 -80 -31
+212 93 -50
+213 -20 -5
+214 -22 73
+215 -4 -7
+216 54 -48
+217 -70 39
+218 54 -82
+219 29 41
+220 -87 51
+221 -96 -36
+222 49 8
+223 -5 54
+224 -26 43
+225 -11 60
+226 40 61
+227 82 35
+228 -92 12
+229 -93 -86
+230 -66 63
+231 -72 -87
+232 -57 -84
+233 23 52
+234 -56 -62
+235 -19 59
+236 63 -14
+237 -13 38
+238 -19 87
+239 44 -84
+240 98 -17
+241 -16 62
+242 3 66
+243 26 22
+244 -38 -81
+245 70 80
+246 17 -35
+247 96 -83
+248 -77 80
+249 -14 44
+250 -33 33
+251 33 -33
+252 70 0
+253 -50 60
+254 -50 -60
+255 75 0
+256 0 75
+257 -75 0
+258 0 -75
+259 40 80
+260 40 -80
+261 -60 20
+262 -60 -20
+EOF
diff --git a/datasets/kroA100.tsp b/datasets/kroA100.tsp
new file mode 100644
index 00000000000..05ebae994ac
--- /dev/null
+++ b/datasets/kroA100.tsp
@@ -0,0 +1,107 @@
+NAME: kroA100
+TYPE: TSP
+COMMENT: 100-city problem A (Krolak/Felts/Nelson)
+DIMENSION: 100
+EDGE_WEIGHT_TYPE : EUC_2D
+NODE_COORD_SECTION
+1 1380 939
+2 2848 96
+3 3510 1671
+4 457 334
+5 3888 666
+6 984 965
+7 2721 1482
+8 1286 525
+9 2716 1432
+10 738 1325
+11 1251 1832
+12 2728 1698
+13 3815 169
+14 3683 1533
+15 1247 1945
+16 123 862
+17 1234 1946
+18 252 1240
+19 611 673
+20 2576 1676
+21 928 1700
+22 53 857
+23 1807 1711
+24 274 1420
+25 2574 946
+26 178 24
+27 2678 1825
+28 1795 962
+29 3384 1498
+30 3520 1079
+31 1256 61
+32 1424 1728
+33 3913 192
+34 3085 1528
+35 2573 1969
+36 463 1670
+37 3875 598
+38 298 1513
+39 3479 821
+40 2542 236
+41 3955 1743
+42 1323 280
+43 3447 1830
+44 2936 337
+45 1621 1830
+46 3373 1646
+47 1393 1368
+48 3874 1318
+49 938 955
+50 3022 474
+51 2482 1183
+52 3854 923
+53 376 825
+54 2519 135
+55 2945 1622
+56 953 268
+57 2628 1479
+58 2097 981
+59 890 1846
+60 2139 1806
+61 2421 1007
+62 2290 1810
+63 1115 1052
+64 2588 302
+65 327 265
+66 241 341
+67 1917 687
+68 2991 792
+69 2573 599
+70 19 674
+71 3911 1673
+72 872 1559
+73 2863 558
+74 929 1766
+75 839 620
+76 3893 102
+77 2178 1619
+78 3822 899
+79 378 1048
+80 1178 100
+81 2599 901
+82 3416 143
+83 2961 1605
+84 611 1384
+85 3113 885
+86 2597 1830
+87 2586 1286
+88 161 906
+89 1429 134
+90 742 1025
+91 1625 1651
+92 1187 706
+93 1787 1009
+94 22 987
+95 3640 43
+96 3756 882
+97 776 392
+98 1724 1642
+99 198 1810
+100 3950 1558
+EOF
diff --git a/datasets/tsp225.tsp b/datasets/tsp225.tsp
new file mode 100644
index 00000000000..ac9e06cecc1
--- /dev/null
+++ b/datasets/tsp225.tsp
@@ -0,0 +1,232 @@
+NAME : tsp225
+COMMENT : A TSP problem (Reinelt)
+TYPE : TSP
+DIMENSION : 225
+EDGE_WEIGHT_TYPE : EUC_2D
+NODE_COORD_SECTION
+1 155.42 150.65
+2 375.92 164.65
+3 183.92 150.65
+4 205.42 150.65
+5 205.42 171.65
+6 226.42 171.65
+7 226.42 186.15
+8 226.42 207.15
+9 226.42 235.65
+10 226.42 264.15
+11 226.42 292.65
+12 226.42 314.15
+13 226.42 335.65
+14 205.42 335.65
+15 190.92 335.65
+16 190.92 328.15
+17 176.92 328.15
+18 176.92 299.65
+19 155.42 299.65
+20 155.42 328.15
+21 155.42 356.65
+22 183.92 356.65
+23 219.42 356.65
+24 240.92 356.65
+25 269.42 356.65
+26 290.42 356.65
+27 387.42 136.15
+28 318.92 356.65
+29 318.92 335.65
+30 318.92 328.15
+31 318.92 299.65
+32 297.92 299.65
+33 290.42 328.15
+34 290.42 335.65
+35 297.92 328.15
+36 254.92 335.65
+37 254.92 314.15
+38 254.92 292.65
+39 254.92 271.65
+40 254.92 243.15
+41 254.92 221.65
+42 254.92 193.15
+43 254.92 171.65
+44 276.42 171.65
+45 296.42 150.65
+46 276.42 150.65
+47 375.92 150.65
+48 308.92 150.65
+49 354.92 164.65
+50 338.42 174.65
+51 354.92 174.65
+52 338.42 200.15
+53 338.42 221.65
+54 354.92 221.65
+55 354.92 200.15
+56 361.92 200.15
+57 361.92 186.15
+58 383.42 186.15
+59 383.42 179.15
+60 404.42 179.15
+61 404.42 186.15
+62 418.92 186.15
+63 418.92 200.15
+64 432.92 200.15
+65 432.92 221.65
+66 418.92 221.65
+67 418.92 235.65
+68 397.42 235.65
+69 397.42 243.15
+70 375.92 243.15
+71 375.92 257.15
+72 368.92 257.15
+73 368.92 264.15
+74 347.42 264.15
+75 347.42 278.65
+76 336.42 278.65
+77 336.42 328.15
+78 347.42 328.15
+79 347.42 342.65
+80 368.92 342.65
+81 368.92 353.65
+82 418.92 353.65
+83 418.92 342.65
+84 432.92 342.65
+85 432.92 356.65
+86 447.42 356.65
+87 447.42 321.15
+88 447.42 292.65
+89 432.92 292.65
+90 432.92 314.15
+91 418.92 314.15
+92 418.92 321.15
+93 397.42 321.15
+94 397.42 333.65
+95 375.92 333.65
+96 375.92 321.15
+97 361.92 321.15
+98 361.92 299.65
+99 375.92 299.65
+100 375.92 285.65
+101 397.42 285.65
+102 397.42 271.65
+103 418.92 271.65
+104 418.92 264.15
+105 439.92 264.15
+106 439.92 250.15
+107 454.42 250.15
+108 454.42 243.15
+109 461.42 243.15
+110 461.42 214.65
+111 461.42 193.15
+112 447.42 193.15
+113 447.42 179.15
+114 439.92 179.15
+115 439.92 167.65
+116 419.92 167.65
+117 419.92 150.65
+118 439.92 150.65
+119 454.42 150.65
+120 475.92 150.65
+121 475.92 171.65
+122 496.92 171.65
+123 496.92 193.15
+124 496.92 214.65
+125 496.92 243.15
+126 496.92 271.65
+127 496.92 292.65
+128 496.92 317.15
+129 496.92 335.65
+130 470.42 335.65
+131 470.42 356.65
+132 496.92 356.65
+133 347.42 150.65
+134 539.92 356.65
+135 560.92 356.65
+136 589.42 356.65
+137 589.42 342.65
+138 603.92 342.65
+139 610.92 342.65
+140 610.92 335.65
+141 610.92 321.15
+142 624.92 321.15
+143 624.92 278.65
+144 610.92 278.65
+145 610.92 257.15
+146 589.42 257.15
+147 589.42 250.15
+148 575.42 250.15
+149 560.92 250.15
+150 542.92 250.15
+151 542.92 264.15
+152 560.92 264.15
+153 575.42 264.15
+154 575.42 271.65
+155 582.42 271.65
+156 582.42 285.65
+157 596.42 285.65
+158 560.92 335.65
+159 596.42 314.15
+160 582.42 314.15
+161 582.42 321.15
+162 575.42 321.15
+163 575.42 335.65
+164 525.42 335.65
+165 525.42 314.15
+166 525.42 299.65
+167 525.42 281.65
+168 525.42 233.15
+169 525.42 214.65
+170 525.42 193.15
+171 525.42 171.65
+172 546.92 171.65
+173 546.92 150.65
+174 568.42 150.65
+175 475.92 160.65
+176 603.92 150.65
+177 624.92 150.65
+178 624.92 136.15
+179 596.42 136.15
+180 575.42 136.15
+181 553.92 136.15
+182 532.42 136.15
+183 575.42 356.65
+184 489.92 136.15
+185 468.42 136.15
+186 447.42 136.15
+187 425.92 136.15
+188 404.42 136.15
+189 370.42 136.15
+190 361.92 150.65
+191 340.42 136.15
+192 326.42 136.15
+193 301.92 136.15
+194 276.42 136.15
+195 254.92 136.15
+196 315.92 136.15
+197 212.42 136.15
+198 190.92 136.15
+199 338.92 150.65
+200 155.42 136.15
+201 624.92 299.65
+202 318.92 321.65
+203 155.42 314.15
+204 311.92 356.65
+205 355.42 136.15
+206 318.92 314.15
+207 362.92 164.65
+208 254.92 356.65
+209 383.42 333.65
+210 447.42 335.65
+211 470.42 345.65
+212 525.42 250.15
+213 546.92 335.65
+214 525.42 261.15
+215 525.42 356.65
+216 336.42 298.65
+217 336.42 313.15
+218 293.42 136.15
+219 336.42 306.15
+220 425.92 264.15
+221 391.42 353.65
+222 482.92 335.65
+223 429.92 167.65
+224 330.92 150.65
+225 368.42 150.65
+EOF
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 459e5fbf4f1..dcdf3e6ff33 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -40,13 +40,6 @@ Betweenness Centrality
     :members:
     :undoc-members:
 
-Edge Betweenness Centrality
----------------------------
-
-.. automodule:: cugraph.centrality.edge_betweenness_centrality
-    :members:
-    :undoc-members:
-
 Katz Centrality
 ---------------
 
@@ -58,6 +51,13 @@ Katz Centrality
 Community
 =========
 
+EgoNet
+------------------------------------
+
+.. automodule:: cugraph.community.egonet
+	:members:
+	:undoc-members:
+
 Ensemble clustering for graphs (ECG)
 ------------------------------------
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6b484a5f57b..eb4745a61f0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,9 +80,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.18'
+version = '0.19'
 # The full version, including alpha/beta/rc tags.
-release = '0.18.0'
+release = '0.19.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/source/cugraph_blogs.rst b/docs/source/cugraph_blogs.rst
index 84e31d40a19..cbbc93a1b14 100644
--- a/docs/source/cugraph_blogs.rst
+++ b/docs/source/cugraph_blogs.rst
@@ -23,6 +23,12 @@ BLOGS
   * `Status of RAPIDS cuGraph — Refactoring Code And Rethinking Graphs <https://medium.com/rapids-ai/status-of-rapids-cugraph-refactoring-code-and-rethinking-graphs-efe9956d5528>`_
   * `Tackling Large Graphs with RAPIDS cuGraph and CUDA Unified Memory on GPUs <https://medium.com/rapids-ai/tackling-large-graphs-with-rapids-cugraph-and-unified-virtual-memory-b5b69a065d4>`_
   * `RAPIDS cuGraph adds NetworkX and DiGraph Compatibility <https://t.co/6DEhyarVGa>`_
+  * `Large Graph Visualization with RAPIDS cuGraph <https://medium.com/rapids-ai/large-graph-visualization-with-rapids-cugraph-590d07edce33>`_
+
+2021
+------
+   * <soon>
+
 
 
 Media
diff --git a/docs/source/cugraph_intro.md b/docs/source/cugraph_intro.md
index 5bf2b715462..142395fb719 100644
--- a/docs/source/cugraph_intro.md
+++ b/docs/source/cugraph_intro.md
@@ -1,8 +1,55 @@
 
 # cuGraph Introduction
+The Data Scientist has a collection of techniques within their 
+proverbial toolbox. Data engineering, statistical analysis, and 
+machine learning are among the most commonly known. However, there 
+are numerous cases where the focus of the analysis is on the 
+relationship between data elements. In those cases, the data is best 
+represented as a graph. Graph analysis, also called network analysis, 
+is a collection of algorithms for answering questions posed against 
+graph data. Graph analysis is not new.
 
+The first graph problem was posed by Euler in 1736, the [Seven Bridges of 
+Konigsberg](https://en.wikipedia.org/wiki/Seven_Bridges_of_K%C3%B6nigsberg), 
+and laid the foundation for the mathematical field of graph theory. 
+The application of graph analysis covers a wide variety of fields, including 
+marketing, biology, physics, computer science, sociology, and cyber to name a few.
 
-## Terminology
+RAPIDS cuGraph is a library of graph algorithms that seamlessly integrates 
+into the RAPIDS data science ecosystem and allows the data scientist to easily 
+call graph algorithms using data stored in a GPU DataFrame, NetworkX Graphs, or even 
+CuPy or SciPy sparse Matrix.  
+
+
+# Vision
+The vision of RAPIDS cuGraph is to ___make graph analysis ubiquitous to the 
+point that users just think in terms of analysis and not technologies or 
+frameworks___. This is a goal that many of us on the cuGraph team have been 
+working on for almost twenty years. Many of the early attempts focused on 
+solving one problem or using one technique. Those early attempts worked for 
+the initial goal but tended to break as the scope changed (e.g., shifting 
+to solving a dynamic graph problem with a static graph solution). The limiting 
+factors usually came down to compute power, ease-of-use, or choosing a data 
+structure that was not suited for all problems. NVIDIA GPUs, CUDA, and RAPIDS 
+have totally changed the paradigm and the goal of an accelerated unified graph 
+analytic library is now possible.
+
+The compute power of the latest NVIDIA GPUs (RAPIDS supports Pascal and later 
+GPU architectures) make graph analytics 1000x faster on average over NetworkX. 
+Moreover, the internal memory speed within a GPU allows cuGraph to rapidly 
+switch the data structure to best suit the needs of the analytic rather than 
+being restricted to a single data structure. cuGraph is working with several 
+frameworks for both static and dynamic graph data structures so that we always 
+have a solution to any graph problem. Since Python has emerged as the de facto 
+language for data science, allowing interactivity and the ability to run graph 
+analytics in Python makes cuGraph familiar and approachable. RAPIDS wraps all 
+the graph analytic goodness mentioned above with the ability to perform 
+high-speed ETL, statistics, and machine learning. To make things even better, 
+RAPIDS and DASK allows cuGraph to scale to multiple GPUs to support 
+multi-billion edge graphs.
+
+
+# Terminology
 
 cuGraph is a collection of GPU accelerated graph algorithms and graph utility
 functions. The application of graph analysis covers a lot of areas.
diff --git a/notebooks/community/ECG.ipynb b/notebooks/community/ECG.ipynb
index d7595dadb26..4a9eedd3c3a 100644
--- a/notebooks/community/ECG.ipynb
+++ b/notebooks/community/ECG.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Ensemble Clustering for Graphs (ECG)\n",
+    "# Does not run on Pascal\n",
     "In this notebook, we will use cuGraph to identify the cluster in a test graph using the Ensemble Clustering for Graph approach.  \n",
     "\n",
     "\n",
diff --git a/notebooks/community/Louvain.ipynb b/notebooks/community/Louvain.ipynb
index e5e5e6a04ed..bfb8e299f49 100755
--- a/notebooks/community/Louvain.ipynb
+++ b/notebooks/community/Louvain.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Louvain Community Detection\n",
+    "# Does not run on Pascal\n",
     "\n",
     "\n",
     "In this notebook, we will use cuGraph to identify the cluster in a test graph using the Louvain algorithm  \n",
diff --git a/notebooks/community/Subgraph-Extraction.ipynb b/notebooks/community/Subgraph-Extraction.ipynb
index e068ef53aa5..cac52262d4d 100755
--- a/notebooks/community/Subgraph-Extraction.ipynb
+++ b/notebooks/community/Subgraph-Extraction.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Subgraph Extraction\n",
+    "# Does not run on Pascal\n",
     "\n",
     "In this notebook, we will use cuGraph to extract a subgraph from the test graph.  \n",
     "\n",
diff --git a/notebooks/community/Triangle-Counting.ipynb b/notebooks/community/Triangle-Counting.ipynb
index 09d7906a526..19d3f838fc6 100755
--- a/notebooks/community/Triangle-Counting.ipynb
+++ b/notebooks/community/Triangle-Counting.ipynb
@@ -21,7 +21,7 @@
     "\n",
     "\n",
     "## Introduction\n",
-    "Triancle Counting, as the name implies, finds the number of triangles in a graph.  Triangles are important in computing the clustering Coefficient and can be used for clustering.  \n",
+    "Triangle Counting, as the name implies, finds the number of triangles in a graph.  Triangles are important in computing the clustering Coefficient and can be used for clustering.  \n",
     "\n",
     "\n",
     "To compute the Pagerank scores for a graph in cuGraph we use:<br>\n",
diff --git a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
index 58eb94bf0ee..6ae695e206e 100644
--- a/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/bfs_benchmark.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# BFS Performance Benchmarking\n",
+    "# Skip notebook test\n",
     "\n",
     "This notebook benchmarks performance of running BFS within cuGraph against NetworkX. \n",
     "\n",
diff --git a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
index a12b7c4bcc2..00e99a28617 100644
--- a/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/louvain_benchmark.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Louvain Performance Benchmarking\n",
+    "# Skip notebook test\n",
     "\n",
     "This notebook benchmarks performance improvement of running the Louvain clustering algorithm within cuGraph against NetworkX. The test is run over eight test networks (graphs) and then results plotted.  \n",
     "<p><p>\n",
diff --git a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb
index 6f76868f9a4..403c317ac0a 100644
--- a/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb
+++ b/notebooks/cugraph_benchmarks/nx_cugraph_bc_benchmarking.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Benchmarking NetworkX compatibility\n",
+    "# Skip notebook test\n",
     "This notebook benchmark the use of a NetworkX Graph object as input into algorithms.  <p>\n",
     "The intention of the feature is to be able to drop cuGraph into existing NetworkX code in spot where performance is not optimal.\n",
     "\n",
diff --git a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
index c2933a10c7d..d0416efdd87 100644
--- a/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/pagerank_benchmark.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# PageRank Performance Benchmarking\n",
+    "# Skip notebook test\n",
     "\n",
     "This notebook benchmarks performance of running PageRank within cuGraph against NetworkX. NetworkX contains several implementations of PageRank.  This benchmark will compare cuGraph versus the defaukt Nx implementation as well as the SciPy version\n",
     "\n",
diff --git a/notebooks/cugraph_benchmarks/release.ipynb b/notebooks/cugraph_benchmarks/release.ipynb
index d3110da3621..3c6da55abc0 100644
--- a/notebooks/cugraph_benchmarks/release.ipynb
+++ b/notebooks/cugraph_benchmarks/release.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# Release Benchmarking\n",
+    "# Skip notebook test\n",
     "\n",
     "With every release, RAPIDS publishes a release slide deck that includes the current performance state of cuGraph. \n",
     "This notebook, starting with release 0.15, runs all the various algorithms to computes the performance gain.  \n",
diff --git a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
index 2d040e0acaf..32b562e7a1e 100644
--- a/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
+++ b/notebooks/cugraph_benchmarks/sssp_benchmark.ipynb
@@ -5,6 +5,7 @@
    "metadata": {},
    "source": [
     "# SSSP Performance Benchmarking\n",
+    "# Skip notebook test\n",
     "\n",
     "This notebook benchmarks performance of running SSSP within cuGraph against NetworkX. \n",
     "\n",
diff --git a/notebooks/demo/uvm.ipynb b/notebooks/demo/uvm.ipynb
index d279be8ed54..8fa2b08b6d1 100644
--- a/notebooks/demo/uvm.ipynb
+++ b/notebooks/demo/uvm.ipynb
@@ -6,6 +6,7 @@
    "source": [
     "# Oversubscribing GPU memory in cuGraph\n",
     "#### Author : Alex Fender\n",
+    "# Skip notebook test\n",
     "\n",
     "In this notebook, we will show how to **scale to 4x larger graphs than before** without incurring a performance drop using managed memory features in cuGraph. We will compute the PageRank of each user in Twitter's dataset on a single GPU as an example. This technique applies to all features.\n",
     "\n",
diff --git a/notebooks/link_analysis/Pagerank.ipynb b/notebooks/link_analysis/Pagerank.ipynb
index c43561ff48c..a81e1ccf6c3 100755
--- a/notebooks/link_analysis/Pagerank.ipynb
+++ b/notebooks/link_analysis/Pagerank.ipynb
@@ -11,7 +11,7 @@
     "Notebook Credits\n",
     "* Original Authors: Bradley Rees and James Wyles\n",
     "* Created:   08/13/2019\n",
-    "* Updated:   08/16/2020\n",
+    "* Updated:   01/17/2021\n",
     "\n",
     "RAPIDS Versions: 0.14    \n",
     "\n",
@@ -190,7 +190,7 @@
    "metadata": {},
    "source": [
     "### Read in the data - GPU\n",
-    "cuGraph depends on cuDF for data loading and the initial Dataframe creation\n",
+    "cuGraph graphs can be created from cuDF, dask_cuDF and Pandas dataframes\n",
     "\n",
     "The data file contains an edge list, which represents the connection of a vertex to another.  The `source` to `destination` pairs is in what is known as Coordinate Format (COO).  In this test case, the data is just two columns.  However a third, `weight`, column is also possible"
    ]
@@ -219,8 +219,7 @@
    "outputs": [],
    "source": [
     "# create a Graph using the source (src) and destination (dst) vertex pairs from the Dataframe \n",
-    "G = cugraph.Graph()\n",
-    "G.from_cudf_edgelist(gdf, source='src', destination='dst')"
+    "G = cugraph.from_edgelist(gdf, source='src', destination='dst')"
    ]
   },
   {
diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py
index d752c868237..8a847d1f1d4 100644
--- a/python/cugraph/__init__.py
+++ b/python/cugraph/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -24,11 +24,15 @@
     analyzeClustering_ratio_cut,
     subgraph,
     triangles,
+    ego_graph,
+    batched_ego_graphs,
 )
 
 from cugraph.structure import (
     Graph,
     DiGraph,
+    MultiGraph,
+    MultiDiGraph,
     from_edgelist,
     from_cudf_edgelist,
     from_pandas_edgelist,
@@ -77,7 +81,8 @@
     sssp,
     shortest_path,
     filter_unreachable,
-    shortest_path_length
+    shortest_path_length,
+    traveling_salesperson
 )
 
 from cugraph.tree import minimum_spanning_tree, maximum_spanning_tree
@@ -89,7 +94,7 @@
 from cugraph.proto.components import strong_connected_component
 from cugraph.proto.structure import find_bicliques
 
-from cugraph.linear_assignment import hungarian
+from cugraph.linear_assignment import hungarian, dense_hungarian
 from cugraph.layout import force_atlas2
 from cugraph.raft import raft_include_test
 from cugraph.comms import comms
diff --git a/python/cugraph/centrality/__init__.py b/python/cugraph/centrality/__init__.py
index da882a61850..f33df2fe61a 100644
--- a/python/cugraph/centrality/__init__.py
+++ b/python/cugraph/centrality/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 from cugraph.centrality.katz_centrality import katz_centrality
-from cugraph.centrality.betweenness_centrality import betweenness_centrality
 from cugraph.centrality.betweenness_centrality import (
+    betweenness_centrality,
     edge_betweenness_centrality,
 )
diff --git a/python/cugraph/centrality/betweenness_centrality.py b/python/cugraph/centrality/betweenness_centrality.py
index 93bdce7c515..3b7cfe6b68f 100644
--- a/python/cugraph/centrality/betweenness_centrality.py
+++ b/python/cugraph/centrality/betweenness_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -36,8 +36,10 @@ def betweenness_centrality(
     Betweenness centrality is a measure of the number of shortest paths that
     pass through a vertex.  A vertex with a high betweenness centrality score
     has more paths passing through it and is therefore believed to be more
-    important. Rather than doing an all-pair shortest path, a sample of k
-    starting vertices can be used.
+    important.
+
+    To improve performance. rather than doing an all-pair shortest path,
+    a sample of k starting vertices can be used.
 
     CuGraph does not currently support the 'endpoints' and 'weight' parameters
     as seen in the corresponding networkX call.
@@ -52,19 +54,18 @@ def betweenness_centrality(
 
     k : int or list or None, optional, default=None
         If k is not None, use k node samples to estimate betweenness.  Higher
-        values give better approximation
-        If k is a list, use the content of the list for estimation: the list
-        should contain vertices identifiers.
-        If k is None (the default), all the vertices are used to estimate
-        betweenness.
-        Vertices obtained through sampling or defined as a list will be used as
-        sources for traversals inside the algorithm.
+        values give better approximation.  If k is a list, use the content
+        of the list for estimation: the list should contain vertex
+        identifiers. If k is None (the default), all the vertices are used
+        to estimate betweenness.  Vertices obtained through sampling or
+        defined as a list will be used assources for traversals inside the
+        algorithm.
 
     normalized : bool, optional
         Default is True.
         If true, the betweenness values are normalized by
-        2 / ((n - 1) * (n - 2)) for Graphs (undirected), and
-        1 / ((n - 1) * (n - 2)) for DiGraphs (directed graphs)
+        __2 / ((n - 1) * (n - 2))__ for Graphs (undirected), and
+        __1 / ((n - 1) * (n - 2))__ for DiGraphs (directed graphs)
         where n is the number of nodes in G.
         Normalization will ensure that values are in [0, 1],
         this normalization scales for the highest possible value where one
@@ -145,15 +146,22 @@ def betweenness_centrality(
 
 
 def edge_betweenness_centrality(
-    G, k=None, normalized=True, weight=None, seed=None, result_dtype=np.float64
+    G,
+    k=None,
+    normalized=True,
+    weight=None,
+    seed=None,
+    result_dtype=np.float64
 ):
     """
     Compute the edge betweenness centrality for all edges of the graph G.
     Betweenness centrality is a measure of the number of shortest paths
     that pass over an edge.  An edge with a high betweenness centrality
     score has more paths passing over it and is therefore believed to be
-    more important. Rather than doing an all-pair shortest path, a sample
-    of k starting vertices can be used.
+    more important.
+
+    To improve performance, rather than doing an all-pair shortest path,
+    a sample of k starting vertices can be used.
 
     CuGraph does not currently support the 'weight' parameter
     as seen in the corresponding networkX call.
@@ -168,7 +176,7 @@ def edge_betweenness_centrality(
 
     k : int or list or None, optional, default=None
         If k is not None, use k node samples to estimate betweenness.  Higher
-        values give better approximation
+        values give better approximation.
         If k is a list, use the content of the list for estimation: the list
         should contain vertices identifiers.
         Vertices obtained through sampling or defined as a list will be used as
diff --git a/python/cugraph/comms/comms.py b/python/cugraph/comms/comms.py
index 925f4a1a060..85fc426f373 100644
--- a/python/cugraph/comms/comms.py
+++ b/python/cugraph/comms/comms.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 from cugraph.raft.dask.common.comms import Comms as raftComms
-from cugraph.raft.dask.common.comms import worker_state
+from cugraph.raft.dask.common.comms import get_raft_comm_state
 from cugraph.raft.common.handle import Handle
 from cugraph.comms.comms_wrapper import init_subcomms as c_init_subcomms
 from dask.distributed import default_client
@@ -196,12 +196,12 @@ def get_default_handle():
 # Functions to be called from within workers
 
 def get_handle(sID):
-    sessionstate = worker_state(sID)
+    sessionstate = get_raft_comm_state(sID)
     return sessionstate['handle']
 
 
 def get_worker_id(sID):
-    sessionstate = worker_state(sID)
+    sessionstate = get_raft_comm_state(sID)
     return sessionstate['wid']
 
 
@@ -216,5 +216,5 @@ def get_n_workers(sID=None):
     if sID is None:
         return read_utils.get_n_workers()
     else:
-        sessionstate = worker_state(sID)
+        sessionstate = get_raft_comm_state(sID)
         return sessionstate['nworkers']
diff --git a/python/cugraph/community/__init__.py b/python/cugraph/community/__init__.py
index d3bb6472894..9cc92637e20 100644
--- a/python/cugraph/community/__init__.py
+++ b/python/cugraph/community/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -25,3 +25,5 @@
 from cugraph.community.triangle_count import triangles
 from cugraph.community.ktruss_subgraph import ktruss_subgraph
 from cugraph.community.ktruss_subgraph import k_truss
+from cugraph.community.egonet import ego_graph
+from cugraph.community.egonet import batched_ego_graphs
diff --git a/python/cugraph/community/egonet.pxd b/python/cugraph/community/egonet.pxd
new file mode 100644
index 00000000000..3ddf929674f
--- /dev/null
+++ b/python/cugraph/community/egonet.pxd
@@ -0,0 +1,23 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from cugraph.structure.graph_primtypes cimport *
+
+cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+    cdef unique_ptr[cy_multi_edgelists_t] call_egonet[vertex_t, weight_t](
+        const handle_t &handle,
+        const graph_container_t &g,
+        vertex_t* source_vertex,
+        vertex_t n_subgraphs,
+        vertex_t radius) except +
diff --git a/python/cugraph/community/egonet.py b/python/cugraph/community/egonet.py
new file mode 100644
index 00000000000..9ff12158b13
--- /dev/null
+++ b/python/cugraph/community/egonet.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.community import egonet_wrapper
+import cudf
+from cugraph.utilities import (
+    ensure_cugraph_obj,
+    import_optional,
+)
+from cugraph.utilities import cugraph_to_nx
+
+# optional dependencies used for handling different input types
+nx = import_optional("networkx")
+
+
+def _convert_graph_to_output_type(G, input_type):
+    """
+    Given a cugraph.Graph, convert it to a new type appropriate for the
+    graph algos in this module, based on input_type.
+    """
+    if (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
+        return cugraph_to_nx(G)
+
+    else:
+        return G
+
+
+def _convert_df_series_to_output_type(df, offsets, input_type):
+    """
+    Given a cudf.DataFrame df, convert it to a new type appropriate for the
+    graph algos in this module, based on input_type.
+    """
+    if (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
+        return df.to_pandas(), offsets.values_host.tolist()
+
+    else:
+        return df, offsets
+
+
+def ego_graph(G, n, radius=1, center=True, undirected=False, distance=None):
+    """
+    Compute the  induced subgraph of neighbors centered at node n,
+    within a given radius.
+
+    Parameters
+    ----------
+    G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
+        Graph or matrix object, which should contain the connectivity
+        information. Edge weights, if present, should be single or double
+        precision floating point values.
+    n : integer
+        A single node
+    radius: integer, optional
+        Include all neighbors of distance<=radius from n.
+    center: bool, optional
+        Defaults to True. False is not supported
+    undirected: bool, optional
+        Defaults to False. True is not supported
+    distance: key, optional
+        Distances are counted in hops from n. Other cases are not supported.
+
+    Returns
+    -------
+    G_ego : cuGraph.Graph or networkx.Graph
+        A graph descriptor with a minimum spanning tree or forest.
+        The networkx graph will not have all attributes copied over
+    """
+
+    (G, input_type) = ensure_cugraph_obj(G, nx_weight_attr="weight")
+    result_graph = type(G)()
+
+    if G.renumbered is True:
+        n = G.lookup_internal_vertex_id(cudf.Series([n]))
+
+    df, offsets = egonet_wrapper.egonet(G, n, radius)
+
+    if G.renumbered:
+        df = G.unrenumber(df, "src")
+        df = G.unrenumber(df, "dst")
+
+    if G.edgelist.weights:
+        result_graph.from_cudf_edgelist(
+            df, source="src", destination="dst", edge_attr="weight"
+        )
+    else:
+        result_graph.from_cudf_edgelist(df, source="src", destination="dst")
+    return _convert_graph_to_output_type(result_graph, input_type)
+
+
+def batched_ego_graphs(
+    G, seeds, radius=1, center=True, undirected=False, distance=None
+):
+    """
+    Compute the  induced subgraph of neighbors for each node in seeds
+    within a given radius.
+
+    Parameters
+    ----------
+    G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
+        Graph or matrix object, which should contain the connectivity
+        information. Edge weights, if present, should be single or double
+        precision floating point values.
+    seeds : cudf.Series or list
+        Specifies the seeds of the induced egonet subgraphs
+    radius: integer, optional
+        Include all neighbors of distance<=radius from n.
+    center: bool, optional
+        Defaults to True. False is not supported
+    undirected: bool, optional
+        Defaults to False. True is not supported
+    distance: key, optional
+        Distances are counted in hops from n. Other cases are not supported.
+
+    Returns
+    -------
+    ego_edge_lists : cudf.DataFrame or pandas.DataFrame
+        GPU data frame containing all induced sources identifiers,
+        destination identifiers, edge weights
+    seeds_offsets: cudf.Series
+        Series containing the starting offset in the returned edge list
+        for each seed.
+    """
+
+    (G, input_type) = ensure_cugraph_obj(G, nx_weight_attr="weight")
+
+    if G.renumbered is True:
+        seeds = G.lookup_internal_vertex_id(cudf.Series(seeds))
+
+    df, offsets = egonet_wrapper.egonet(G, seeds, radius)
+
+    if G.renumbered:
+        df = G.unrenumber(df, "src", preserve_order=True)
+        df = G.unrenumber(df, "dst", preserve_order=True)
+
+    return _convert_df_series_to_output_type(df, offsets, input_type)
diff --git a/python/cugraph/community/egonet_wrapper.pyx b/python/cugraph/community/egonet_wrapper.pyx
new file mode 100644
index 00000000000..122dedbfabd
--- /dev/null
+++ b/python/cugraph/community/egonet_wrapper.pyx
@@ -0,0 +1,116 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.community.egonet cimport call_egonet
+from cugraph.structure.graph_primtypes cimport *
+from libcpp cimport bool
+from libc.stdint cimport uintptr_t
+from cugraph.structure import graph_primtypes_wrapper
+import cudf
+import rmm
+import numpy as np
+import numpy.ctypeslib as ctypeslib
+from rmm._lib.device_buffer cimport DeviceBuffer
+from cudf.core.buffer import Buffer
+
+
+def egonet(input_graph, vertices, radius=1):
+    """
+    Call egonet
+    """
+    # FIXME: Offsets and indices are currently hardcoded to int, but this may
+    #        not be acceptable in the future.
+    numberTypeMap = {np.dtype("int32") : <int>numberTypeEnum.int32Type,
+                     np.dtype("int64") : <int>numberTypeEnum.int64Type,
+                     np.dtype("float32") : <int>numberTypeEnum.floatType,
+                     np.dtype("double") : <int>numberTypeEnum.doubleType}
+
+    [src, dst] = [input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']]
+    vertex_t = src.dtype
+    edge_t = np.dtype("int32")
+    weights = None
+    if input_graph.edgelist.weights:
+        weights = input_graph.edgelist.edgelist_df['weights']
+
+    num_verts = input_graph.number_of_vertices()
+    num_edges = input_graph.number_of_edges(directed_edges=True)
+    num_partition_edges = num_edges
+
+    cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_edge_weights = <uintptr_t>NULL
+    if weights is not None:
+        c_edge_weights = weights.__cuda_array_interface__['data'][0]
+        weight_t = weights.dtype
+    else:
+        weight_t = np.dtype("float32")
+
+    # Pointers for egonet
+    cdef uintptr_t c_source_vertex_ptr = vertices.__cuda_array_interface__['data'][0]
+    n_subgraphs = vertices.size
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get();
+
+    cdef graph_container_t graph_container
+    populate_graph_container(graph_container,
+                             handle_[0],
+                             <void*>c_src_vertices, <void*>c_dst_vertices, <void*>c_edge_weights,
+                             <void*>NULL,
+                             <numberTypeEnum>(<int>(numberTypeMap[vertex_t])),
+                             <numberTypeEnum>(<int>(numberTypeMap[edge_t])),
+                             <numberTypeEnum>(<int>(numberTypeMap[weight_t])),
+                             num_partition_edges,
+                             num_verts,
+                             num_edges,
+                             False,
+                             False, False) 
+
+    if(weight_t==np.dtype("float32")):
+        el_struct_ptr = move(call_egonet[int, float](handle_[0],
+                               graph_container,
+                               <int*> c_source_vertex_ptr,
+                               <int> n_subgraphs,
+                               <int> radius))
+    else:
+        el_struct_ptr = move(call_egonet[int, double](handle_[0],
+                               graph_container,
+                               <int*> c_source_vertex_ptr,
+                               <int> n_subgraphs,
+                               <int> radius))
+        
+    el_struct = move(el_struct_ptr.get()[0])
+    src = DeviceBuffer.c_from_unique_ptr(move(el_struct.src_indices))
+    dst = DeviceBuffer.c_from_unique_ptr(move(el_struct.dst_indices))
+    wgt = DeviceBuffer.c_from_unique_ptr(move(el_struct.edge_data))
+    src = Buffer(src)
+    dst = Buffer(dst)
+    wgt = Buffer(wgt)
+
+    src = cudf.Series(data=src, dtype=vertex_t)
+    dst = cudf.Series(data=dst, dtype=vertex_t)
+
+    df = cudf.DataFrame()
+    df['src'] = src
+    df['dst'] = dst
+    if wgt.nbytes != 0:
+        wgt = cudf.Series(data=wgt, dtype=weight_t)
+        df['weight'] = wgt
+
+    offsets = DeviceBuffer.c_from_unique_ptr(move(el_struct.subgraph_offsets))
+    offsets = Buffer(offsets)
+    offsets = cudf.Series(data=offsets, dtype="int")
+
+    return df, offsets
+
diff --git a/python/cugraph/community/spectral_clustering.py b/python/cugraph/community/spectral_clustering.py
index b5f175e8237..443e2169711 100644
--- a/python/cugraph/community/spectral_clustering.py
+++ b/python/cugraph/community/spectral_clustering.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -32,23 +32,23 @@ def spectralBalancedCutClustering(
     Parameters
     ----------
     G : cugraph.Graph or networkx.Graph
-        cuGraph graph descriptor
+         graph descriptor
     num_clusters : integer
-         Specifies the number of clusters to find
+         Specifies the number of clusters to find, must be greater than 1
     num_eigen_vects : integer
          Specifies the number of eigenvectors to use. Must be lower or equal to
-         num_clusters.
+         num_clusters.   Default is 2
     evs_tolerance: float
-         Specifies the tolerance to use in the eigensolver
+         Specifies the tolerance to use in the eigensolver.
          Default is 0.00001
     evs_max_iter: integer
-         Specifies the maximum number of iterations for the eigensolver
+         Specifies the maximum number of iterations for the eigensolver.
          Default is 100
     kmean_tolerance: float
-         Specifies the tolerance to use in the k-means solver
+         Specifies the tolerance to use in the k-means solver.
          Default is 0.00001
     kmean_max_iter: integer
-         Specifies the maximum number of iterations for the k-means solver
+         Specifies the maximum number of iterations for the k-means solver.
          Default is 100
 
     Returns
@@ -73,6 +73,8 @@ def spectralBalancedCutClustering(
     >>> df = cugraph.spectralBalancedCutClustering(G, 5)
     """
 
+    # Error checking in C++ code
+
     G, isNx = check_nx_graph(G)
 
     df = spectral_clustering_wrapper.spectralBalancedCutClustering(
@@ -109,24 +111,24 @@ def spectralModularityMaximizationClustering(
 
     Parameters
     ----------
-    G : cugraph.Graph
+    G : cugraph.Graph or networkx.Graph
         cuGraph graph descriptor. This graph should have edge weights.
     num_clusters : integer
          Specifies the number of clusters to find
     num_eigen_vects : integer
          Specifies the number of eigenvectors to use. Must be lower or equal to
-         num_clusters
+         num_clusters.  Default is 2
     evs_tolerance: float
-         Specifies the tolerance to use in the eigensolver
+         Specifies the tolerance to use in the eigensolver.
          Default is 0.00001
     evs_max_iter: integer
-         Specifies the maximum number of iterations for the eigensolver
+         Specifies the maximum number of iterations for the eigensolver.
          Default is 100
     kmean_tolerance: float
-         Specifies the tolerance to use in the k-means solver
+         Specifies the tolerance to use in the k-means solver.
          Default is 0.00001
     kmean_max_iter: integer
-         Specifies the maximum number of iterations for the k-means solver
+         Specifies the maximum number of iterations for the k-means solver.
          Default is 100
 
     Returns
@@ -148,6 +150,8 @@ def spectralModularityMaximizationClustering(
     >>> df = cugraph.spectralModularityMaximizationClustering(G, 5)
     """
 
+    # Error checking in C++ code
+
     G, isNx = check_nx_graph(G)
 
     df = spectral_clustering_wrapper.spectralModularityMaximizationClustering(
@@ -173,12 +177,15 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
                                  vertex_col_name='vertex',
                                  cluster_col_name='cluster'):
     """
-    Compute the modularity score for a partitioning/clustering
+    Compute the modularity score for a given partitioning/clustering.
+    The assumption is that “clustering” is the results from a call
+    from a special clustering algorithm and contains columns named
+    “vertex” and “cluster”.
 
     Parameters
     ----------
-    G : cugraph.Graph
-        cuGraph graph descriptor. This graph should have edge weights.
+    G : cugraph.Graph or networkx.Graph
+        graph descriptor. This graph should have edge weights.
     n_clusters : integer
         Specifies the number of clusters in the given clustering
     clustering : cudf.DataFrame
@@ -204,10 +211,17 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
     >>> df = cugraph.spectralBalancedCutClustering(G, 5)
-    >>> score = cugraph.analyzeClustering_modularity(G, 5, df,
-    >>>   'vertex', 'cluster')
+    >>> score = cugraph.analyzeClustering_modularity(G, 5, df)
     """
 
+    if type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")
+
+    G, isNx = check_nx_graph(G)
+
     if G.renumbered:
         clustering = G.add_internal_vertex_id(clustering,
                                               vertex_col_name,
@@ -228,6 +242,9 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,
                                cluster_col_name='cluster'):
     """
     Compute the edge cut score for a partitioning/clustering
+    The assumption is that “clustering” is the results from a call
+    from a special clustering algorithm and contains columns named
+    “vertex” and “cluster”.
 
     Parameters
     ----------
@@ -258,10 +275,15 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
     >>> df = cugraph.spectralBalancedCutClustering(G, 5)
-    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df,
-    >>>   'vertex', 'cluster')
+    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df)
     """
 
+    if type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")
+
     G, isNx = check_nx_graph(G)
 
     if G.renumbered:
@@ -318,6 +340,12 @@ def analyzeClustering_ratio_cut(G, n_clusters, clustering,
     >>>   'vertex', 'cluster')
     """
 
+    if type(vertex_col_name) is not str:
+        raise Exception("vertex_col_name must be a string")
+
+    if type(cluster_col_name) is not str:
+        raise Exception("cluster_col_name must be a string")
+
     if G.renumbered:
         clustering = G.add_internal_vertex_id(clustering,
                                               vertex_col_name,
diff --git a/python/cugraph/community/subgraph_extraction_wrapper.pyx b/python/cugraph/community/subgraph_extraction_wrapper.pyx
index 5dbb6ce1e27..35b3c743987 100644
--- a/python/cugraph/community/subgraph_extraction_wrapper.pyx
+++ b/python/cugraph/community/subgraph_extraction_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -45,6 +45,10 @@ def subgraph(input_graph, vertices):
         if weights.dtype == np.float64:
             use_float = False
 
+    num_verts = input_graph.number_of_vertices()
+    num_edges = len(src)
+    num_input_vertices = len(vertices)
+
     cdef GraphCOOView[int,int,float]  in_graph_float
     cdef GraphCOOView[int,int,double] in_graph_double
     cdef unique_ptr[GraphCOO[int,int,float]]  out_graph_float
@@ -59,10 +63,6 @@ def subgraph(input_graph, vertices):
 
     cdef uintptr_t c_vertices = vertices.__cuda_array_interface__['data'][0]
 
-    num_verts = input_graph.number_of_vertices()
-    num_edges = len(src)
-    num_input_vertices = len(vertices)
-
     if use_float:
         in_graph_float = GraphCOOView[int,int,float](<int*>c_src, <int*>c_dst, <float*>c_weights, num_verts, num_edges);
         df = coo_to_df(move(c_extract_subgraph_vertex(in_graph_float, <int*>c_vertices, <int>num_input_vertices)));
diff --git a/python/cugraph/dask/structure/renumber.py b/python/cugraph/dask/structure/renumber.py
new file mode 100644
index 00000000000..606a6bc4dc1
--- /dev/null
+++ b/python/cugraph/dask/structure/renumber.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dask.distributed import wait, default_client
+from cugraph.dask.common.input_utils import get_distributed_data
+from cugraph.dask.structure import renumber_wrapper as renumber_w
+import cugraph.comms.comms as Comms
+import dask_cudf
+
+
+def call_renumber(sID,
+                  data,
+                  num_verts,
+                  num_edges,
+                  is_mnmg):
+    wid = Comms.get_worker_id(sID)
+    handle = Comms.get_handle(sID)
+    return renumber_w.mg_renumber(data[0],
+                                  num_verts,
+                                  num_edges,
+                                  wid,
+                                  handle,
+                                  is_mnmg)
+
+
+def renumber(input_graph):
+
+    client = default_client()
+
+    ddf = input_graph.edgelist.edgelist_df
+
+    num_edges = len(ddf)
+
+    if isinstance(ddf, dask_cudf.DataFrame):
+        is_mnmg = True
+    else:
+        is_mnmg = False
+
+    num_verts = input_graph.number_of_vertices()
+
+    if is_mnmg:
+        data = get_distributed_data(ddf)
+        result = [client.submit(call_renumber,
+                                Comms.get_session_id(),
+                                wf[1],
+                                num_verts,
+                                num_edges,
+                                is_mnmg,
+                                workers=[wf[0]])
+                  for idx, wf in enumerate(data.worker_to_parts.items())]
+        wait(result)
+        ddf = dask_cudf.from_delayed(result)
+    else:
+        call_renumber(Comms.get_session_id(),
+                      ddf,
+                      num_verts,
+                      num_edges,
+                      is_mnmg)
+    return ddf
diff --git a/python/cugraph/dask/structure/renumber_wrapper.pyx b/python/cugraph/dask/structure/renumber_wrapper.pyx
new file mode 100644
index 00000000000..40dd80aeb67
--- /dev/null
+++ b/python/cugraph/dask/structure/renumber_wrapper.pyx
@@ -0,0 +1,460 @@
+#
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from cugraph.structure.utils_wrapper import *
+import cudf
+from cugraph.structure.graph_primtypes cimport *
+import cugraph.structure.graph_primtypes_wrapper as graph_primtypes_wrapper
+from libc.stdint cimport uintptr_t
+from cython.operator cimport dereference as deref
+import numpy as np
+
+from libcpp.utility cimport move
+from rmm._lib.device_buffer cimport device_buffer, DeviceBuffer
+
+cdef renumber_helper(shuffled_vertices_t* ptr_maj_min_w):
+    # extract shuffled result:
+    #
+    cdef pair[unique_ptr[device_buffer], size_t] pair_s_major   = deref(ptr_maj_min_w).get_major_wrap()
+    cdef pair[unique_ptr[device_buffer], size_t] pair_s_minor   = deref(ptr_maj_min_w).get_minor_wrap()
+    cdef pair[unique_ptr[device_buffer], size_t] pair_s_weights = deref(ptr_maj_min_w).get_weights_wrap()
+    
+    shufled_major_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_major.first))
+    shufled_major_buffer = Buffer(shufled_major_buffer)
+    
+    shufled_major_series = cudf.Series(data=shufled_major_buffer, dtype=vertex_t)
+    
+    shufled_minor_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_minor.first))
+    shufled_minor_buffer = Buffer(shufled_minor_buffer)
+    
+    shufled_minor_series = cudf.Series(data=shufled_minor_buffer, dtype=vertex_t)
+    
+    shufled_weights_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_s_weights.first))
+    shufled_weights_buffer = Buffer(shufled_weights_buffer)
+    
+    shufled_weights_series = cudf.Series(data=shufled_weights_buffer, dtype=weight_t)
+    
+    shuffled_df = cudf.DataFrame()
+    shuffled_df['src']=shuffled_major_series
+    shuffled_df['dst']=shuffled_minor_series
+    shuffled_df['weights']= shuffled_weights_series
+    
+    return shuffled_df
+
+def mg_renumber(input_df,           # maybe use cpdef ?
+                num_global_verts,
+                num_global_edges,    
+                rank,
+                handle,
+                is_multi_gpu):
+    """
+    Call MNMG renumber
+    """
+    cdef size_t handle_size_t = <size_t>handle.getHandle()
+    # TODO: get handle_t out of handle...
+    handle_ptr = <handle_t*>handle_size_t
+
+    src = input_df['src']
+    dst = input_df['dst']
+    cdef uintptr_t c_edge_weights = <uintptr_t>NULL # set below...
+    
+    vertex_t = src.dtype
+    if num_global_edges > (2**31 - 1):
+        edge_t = np.dtype("int64")
+    else:
+        edge_t = np.dtype("int32")
+    if "value" in input_df.columns:
+        weights = input_df['value']
+        weight_t = weights.dtype
+        c_edge_weights = weights.__cuda_array_interface__['data'][0]
+    else:
+        weight_t = np.dtype("float32")
+        
+    if (vertex_t != np.dtype("int32") and vertex_t != np.dtype("int64")):
+        raise Exception("Incorrect vertex_t type.")
+    if (edge_t != np.dtype("int32") and edge_t != np.dtype("int64")):
+        raise Exception("Incorrect edge_t type.")
+    if (weight_t != np.dtype("float32") and weight_t != np.dtype("float64")):
+        raise Exception("Incorrect weight_t type.")
+    if (vertex_t != np.dtype("int32") and edge_t != np.dtype("int64")):
+        raise Exception("Incompatible vertex_t and edge_t types.")
+
+    # FIXME: needs to be edge_t type not int
+    cdef int num_partition_edges = len(src)
+
+    cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
+
+    cdef bool is_hyper_partitioned = False # for now
+
+    cdef uintptr_t shuffled_major = <uintptr_t>NULL
+    cdef uintptr_t shuffled_minor = <uintptr_t>NULL
+    
+    cdef bool do_check = False # ? for now...
+    cdef bool mg_flag = is_multi_gpu # run Single-GPU or MNMG
+
+    cdef pair[unique_ptr[device_buffer], size_t] pair_original
+    cdef pair[unique_ptr[device_buffer], size_t] pair_partition
+
+    # tparams: vertex_t, weight_t:
+    #
+    cdef unique_ptr[major_minor_weights_t[int, float]] ptr_shuffled_32_32
+    cdef unique_ptr[major_minor_weights_t[int, double]] ptr_shuffled_32_64
+    cdef unique_ptr[major_minor_weights_t[long, float]] ptr_shuffled_64_32
+    cdef unique_ptr[major_minor_weights_t[long, double]] ptr_shuffled_64_64
+
+    # tparams: vertex_t, edge_t:
+    #
+    cdef unique_ptr[renum_quad_t[int, int]] ptr_renum_quad_32_32
+    cdef unique_ptr[renum_quad_t[int, long]] ptr_renum_quad_32_64
+    cdef unique_ptr[renum_quad_t[long, long]] ptr_renum_quad_64_64
+
+    # tparam: vertex_t:
+    #
+    cdef unique_ptr[vector[int]] uniq_partition_vector_32
+    cdef unique_ptr[vector[long]] uniq_partition_vector_64
+
+    cdef size_t rank_indx = <size_t>rank
+    
+    if (vertex_t == np.dtype("int32")):
+        if ( edge_t == np.dtype("int32")):
+            if( weight_t == np.dtype("float32")):
+                ptr_shuffled_32_32.reset(call_shuffle[int, int, float](deref(handle_ptr),
+                                                                       <int*>c_src_vertices,
+                                                                       <int*>c_dst_vertices,
+                                                                       <float*>c_edge_weights,
+                                                                       num_partition_edges,
+                                                                       is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_32_32.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr),
+                                                                   <int*>shuffled_major,
+                                                                   <int*>shuffled_minor,
+                                                                   num_partition_edges,
+                                                                   is_hyper_partitioned,
+                                                                   do_check,
+                                                                   mg_flag).release())
+                
+                pair_original = ptr_renum_quad_32_32.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_32 = move(ptr_renum_quad_32_32.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_32.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
+            elif( weight_t == np.dtype("float64")):
+                ptr_shuffled_32_64.reset(call_shuffle[int, int, double](deref(handle_ptr),
+                                                                        <int*>c_src_vertices,
+                                                                        <int*>c_dst_vertices,
+                                                                        <double*>c_edge_weights,
+                                                                        num_partition_edges,
+                                                                        is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_32_64.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_32_32.reset(call_renumber[int, int](deref(handle_ptr),
+                                                                   <int*>shuffled_major,
+                                                                   <int*>shuffled_minor,
+                                                                   num_partition_edges,
+                                                                   is_hyper_partitioned,
+                                                                   do_check,
+                                                                   mg_flag).release())
+                
+                pair_original = ptr_renum_quad_32_32.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_32 = move(ptr_renum_quad_32_32.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_32.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
+        elif ( edge_t == np.dtype("int64")):
+            if( weight_t == np.dtype("float32")):
+                ptr_shuffled_32_32.reset(call_shuffle[int, long, float](deref(handle_ptr),
+                                                                        <int*>c_src_vertices,
+                                                                        <int*>c_dst_vertices,
+                                                                        <float*>c_edge_weights,
+                                                                        num_partition_edges,
+                                                                        is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_32_32.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr),
+                                                                    <int*>shuffled_major,
+                                                                    <int*>shuffled_minor,
+                                                                    num_partition_edges,
+                                                                    is_hyper_partitioned,
+                                                                    do_check,
+                                                                    mg_flag).release())
+                
+                pair_original = ptr_renum_quad_32_64.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_32 = move(ptr_renum_quad_32_64.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_32.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
+            elif( weight_t == np.dtype("float64")):
+                ptr_shuffled_32_64.reset(call_shuffle[int, long, double](deref(handle_ptr),
+                                                                         <int*>c_src_vertices,
+                                                                         <int*>c_dst_vertices,
+                                                                         <double*>c_edge_weights,
+                                                                         num_partition_edges,
+                                                                         is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_32_64.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_32_64.reset(call_renumber[int, long](deref(handle_ptr),
+                                                                    <int*>shuffled_major,
+                                                                    <int*>shuffled_minor,
+                                                                    num_partition_edges,
+                                                                    is_hyper_partitioned,
+                                                                    do_check,
+                                                                    mg_flag).release())
+                
+                pair_original = ptr_renum_quad_32_64.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_32 = move(ptr_renum_quad_32_64.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_32.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_32.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
+    elif (vertex_t == np.dtype("int64")):
+        if ( edge_t == np.dtype("int64")):
+            if( weight_t == np.dtype("float32")):
+                ptr_shuffled_64_32.reset(call_shuffle[long, long, float](deref(handle_ptr),
+                                                                        <long*>c_src_vertices,
+                                                                        <long*>c_dst_vertices,
+                                                                        <float*>c_edge_weights,
+                                                                        num_partition_edges,
+                                                                        is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_64_32.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr),
+                                                                     <long*>shuffled_major,
+                                                                     <long*>shuffled_minor,
+                                                                     num_partition_edges,
+                                                                     is_hyper_partitioned,
+                                                                     do_check,
+                                                                     mg_flag).release())
+                
+                pair_original = ptr_renum_quad_64_64.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_64 = move(ptr_renum_quad_64_64.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_64.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
+            elif( weight_t == np.dtype("float64")):
+                ptr_shuffled_64_64.reset(call_shuffle[long, long, double](deref(handle_ptr),
+                                                                          <long*>c_src_vertices,
+                                                                          <long*>c_dst_vertices,
+                                                                          <double*>c_edge_weights,
+                                                                          num_partition_edges,
+                                                                          is_hyper_partitioned).release())
+                
+                shuffled_df = renumber_helper(ptr_shuffled_64_64.get())
+                
+                shuffled_src = shufled_df['src']
+                shuffled_dst = shufled_df['dst']
+                        
+                shuffled_major = shuffled_src.__cuda_array_interface__['data'][0]
+                shuffled_minor = shuffled_dst.__cuda_array_interface__['data'][0]
+                
+                ptr_renum_quad_64_64.reset(call_renumber[long, long](deref(handle_ptr),
+                                                                     <long*>shuffled_major,
+                                                                     <long*>shuffled_minor,
+                                                                     num_partition_edges,
+                                                                     is_hyper_partitioned,
+                                                                     do_check,
+                                                                     mg_flag).release())
+                
+                pair_original = ptr_renum_quad_64_64.get().get_dv_wrap() # original vertices: see helper
+                
+
+                original_buffer = DeviceBuffer.c_from_unique_ptr(move(pair_original.first))
+                original_buffer = Buffer(original_buffer)
+
+                original_series = cudf.Series(data=original_buffer, dtype=vertex_t)
+                
+                # extract unique_ptr[partition_offsets]:
+                #
+                uniq_partition_vector_64 = move(ptr_renum_quad_64_64.get().get_partition_offsets())
+
+                # create series out of a partition range from rank to rank+1:
+                #
+                if is_multi_gpu:
+                    new_series = cudf.Series(np.arange(uniq_partition_vector_64.get()[0].at(rank_indx),
+                                                       uniq_partition_vector_64.get()[0].at(rank_indx+1)),
+                                             dtype=vertex_t)
+                else:
+                    new_series = cudf.Series(np.arange(0, num_global_verts), dtype=vertex_t)
+                
+                # create new cudf df
+                #
+                # and add the previous series to it:
+                #
+                renumbered_map = cudf.DataFrame()
+                renumbered_map['original_ids'] = original_series
+                renumbered_map['new_ids'] = new_series
+
+                return renumbered_map, shuffled_df
diff --git a/python/cugraph/linear_assignment/__init__.py b/python/cugraph/linear_assignment/__init__.py
index 9bf09b67ed9..557bbbdf170 100644
--- a/python/cugraph/linear_assignment/__init__.py
+++ b/python/cugraph/linear_assignment/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,4 +11,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph.linear_assignment.lap import hungarian
+from cugraph.linear_assignment.lap import hungarian, dense_hungarian
diff --git a/python/cugraph/linear_assignment/lap.pxd b/python/cugraph/linear_assignment/lap.pxd
index f7991405b7f..782d5cfef60 100644
--- a/python/cugraph/linear_assignment/lap.pxd
+++ b/python/cugraph/linear_assignment/lap.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,9 +20,19 @@ from cugraph.structure.graph_primtypes cimport *
 
 cdef extern from "algorithms.hpp" namespace "cugraph":
 
-    cdef void hungarian[VT,ET,WT](
+    cdef weight_t hungarian[vertex_t,edge_t,weight_t](
         const handle_t &handle,
-        const GraphCOOView[VT,ET,WT] &graph,
-        VT num_workers,
-        const VT *workers,
-        VT *assignment) except +
+        const GraphCOOView[vertex_t,edge_t,weight_t] &graph,
+        vertex_t num_workers,
+        const vertex_t *workers,
+        vertex_t *assignment) except +
+
+
+cdef extern from "algorithms.hpp":
+
+    cdef weight_t dense_hungarian "cugraph::dense::hungarian" [vertex_t,weight_t](
+        const handle_t &handle,
+        const weight_t *costs,
+        vertex_t num_rows,
+        vertex_t num_columns,
+        vertex_t *assignment) except +
diff --git a/python/cugraph/linear_assignment/lap.py b/python/cugraph/linear_assignment/lap.py
index 5c501d17935..c634d9aceb4 100644
--- a/python/cugraph/linear_assignment/lap.py
+++ b/python/cugraph/linear_assignment/lap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -46,6 +46,8 @@ def hungarian(G, workers):
 
     Returns
     -------
+    cost : matches costs.dtype
+        The cost of the overall assignment
     df : cudf.DataFrame
       df['vertex'][i] gives the vertex id of the i'th vertex.  Only vertices
                       in the workers list are defined in this column.
@@ -60,7 +62,7 @@ def hungarian(G, workers):
     >>>                   dtype=['int32', 'int32', 'float32'], header=None)
     >>> G = cugraph.Graph()
     >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
-    >>> df = cugraph.hungarian(G, workers)
+    >>> cost, df = cugraph.hungarian(G, workers)
 
     """
 
@@ -69,9 +71,48 @@ def hungarian(G, workers):
     else:
         local_workers = workers
 
-    df = lap_wrapper.hungarian(G, local_workers)
+    df = lap_wrapper.sparse_hungarian(G, local_workers)
 
     if G.renumbered:
         df = G.unrenumber(df, 'vertex')
 
     return df
+
+
+def dense_hungarian(costs, num_rows, num_columns):
+    """
+    Execute the Hungarian algorithm against a dense bipartite
+    graph representation.
+
+    *NOTE*: This API is unstable and subject to change
+
+    The Hungarian algorithm identifies the lowest cost matching of vertices
+    such that all workers that can be assigned work are assigned exactly
+    on job.
+
+    Parameters
+    ----------
+    costs : cudf.Series
+        A dense representation (row major order) of the bipartite
+        graph.  Each row represents a worker, each column represents
+        a task, cost[i][j] represents the cost of worker i performing
+        task j.
+    num_rows : int
+        Number of rows in the matrix
+    num_columns : int
+        Number of columns in the matrix
+
+
+    Returns
+    -------
+    cost : matches costs.dtype
+        The cost of the overall assignment
+    assignment : cudf.Series
+      assignment[i] gives the vertex id of the task assigned to the
+                    worker i
+
+    FIXME: Update this with a real example...
+
+    """
+
+    return lap_wrapper.dense_hungarian(costs, num_rows, num_columns)
diff --git a/python/cugraph/linear_assignment/lap_wrapper.pyx b/python/cugraph/linear_assignment/lap_wrapper.pyx
index caaa837e859..0769ef42f0f 100644
--- a/python/cugraph/linear_assignment/lap_wrapper.pyx
+++ b/python/cugraph/linear_assignment/lap_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -17,6 +17,7 @@
 # cython: language_level = 3
 
 from cugraph.linear_assignment.lap cimport hungarian as c_hungarian
+from cugraph.linear_assignment.lap cimport dense_hungarian as c_dense_hungarian
 from cugraph.structure.graph_primtypes cimport *
 from cugraph.structure import graph_primtypes_wrapper
 from libc.stdint cimport uintptr_t
@@ -25,7 +26,7 @@ from cugraph.structure.graph import Graph as type_Graph
 import cudf
 import numpy as np
 
-def hungarian(input_graph, workers):
+def sparse_hungarian(input_graph, workers):
     """
     Call the hungarian algorithm
     """
@@ -76,10 +77,37 @@ def hungarian(input_graph, workers):
     if weights.dtype == np.float32:
         g_float = GraphCOOView[int,int,float](<int*>c_src, <int*>c_dst, <float*>c_weights, num_verts, num_edges)
 
-        c_hungarian[int,int,float](handle_[0], g_float, len(workers), <int*>c_workers, <int*>c_assignment)
+        cost = c_hungarian[int,int,float](handle_[0], g_float, len(workers), <int*>c_workers, <int*>c_assignment)
     else:
         g_double = GraphCOOView[int,int,double](<int*>c_src, <int*>c_dst, <double*>c_weights, num_verts, num_edges)
 
-        c_hungarian[int,int,double](handle_[0], g_double, len(workers), <int*>c_workers, <int*>c_assignment)
+        cost = c_hungarian[int,int,double](handle_[0], g_double, len(workers), <int*>c_workers, <int*>c_assignment)
 
-    return df
+    return cost, df
+
+
+def dense_hungarian(costs, num_rows, num_columns):
+    """
+    Call the dense hungarian algorithm
+    """
+    if type(costs) is not cudf.Series:
+        raise("costs must be a cudf.Series")
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get();
+
+    assignment = cudf.Series(np.zeros(num_rows, dtype=np.int32))
+
+    cdef uintptr_t c_costs = costs.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_assignment = assignment.__cuda_array_interface__['data'][0]
+
+
+    if costs.dtype == np.float32:
+        cost = c_dense_hungarian[int,float](handle_[0], <float*> c_costs, num_rows, num_columns, <int*> c_assignment)
+    elif costs.dtype == np.float64:
+        cost = c_dense_hungarian[int,double](handle_[0], <double*> c_costs, num_rows, num_columns, <int*> c_assignment)
+    else:
+        raise("unsported type: ", costs.dtype)
+
+    return cost, assignment
diff --git a/python/cugraph/link_analysis/pagerank.py b/python/cugraph/link_analysis/pagerank.py
index 69133d62af7..0bb89195e01 100644
--- a/python/cugraph/link_analysis/pagerank.py
+++ b/python/cugraph/link_analysis/pagerank.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -32,7 +32,7 @@ def pagerank(
     ----------
     graph : cugraph.Graph or networkx.Graph
         cuGraph graph descriptor, should contain the connectivity information
-        as an edge list (edge weights are not used for this algorithm).
+        as an edge list.
         The transposed adjacency list will be computed if not already present.
     alpha : float
         The damping factor alpha represents the probability to follow an
@@ -68,11 +68,6 @@ def pagerank(
             Subset of vertices of graph for initial guess for pagerank values
         nstart['values'] : cudf.Series
             Pagerank values for vertices
-
-    weight : str
-        Edge data column to use.  Default is None
-        This version of PageRank current does not use edge weight.
-        This parameter is here for NetworkX compatibility
     dangling : dict
         This parameter is here for NetworkX compatibility and ignored
 
diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx
index a8c1c9faee8..fea1939db6a 100644
--- a/python/cugraph/link_analysis/pagerank_wrapper.pyx
+++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -33,21 +33,22 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.
     Call pagerank
     """
 
-    if not input_graph.transposedadjlist:
-        input_graph.view_transposed_adj_list()
-
     cdef unique_ptr[handle_t] handle_ptr
     handle_ptr.reset(new handle_t())
     handle_ = handle_ptr.get();
 
-    [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, input_graph.transposedadjlist.indices], [np.int32])
-    [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.weights], [np.float32, np.float64])
+    [src, dst] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32])
+    weights = None
+    if input_graph.edgelist.weights:
+        [weights] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64])
 
     num_verts = input_graph.number_of_vertices()
     num_edges = input_graph.number_of_edges(directed_edges=True)
+    # FIXME: needs to be edge_t type not int
+    cdef int num_partition_edges = len(src)
 
     df = cudf.DataFrame()
-    df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    df['vertex'] = cudf.Series(np.arange(num_verts, dtype=np.int32))
     df['pagerank'] = cudf.Series(np.zeros(num_verts, dtype=np.float32))
 
     cdef bool has_guess = <bool> 0
@@ -62,19 +63,16 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.
 
     cdef uintptr_t c_pers_vtx = <uintptr_t>NULL
     cdef uintptr_t c_pers_val = <uintptr_t>NULL
-    cdef sz = 0
-
-    cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0]
-    cdef uintptr_t c_weights = <uintptr_t>NULL
-    cdef uintptr_t c_local_verts = <uintptr_t> NULL;
-    cdef uintptr_t c_local_edges = <uintptr_t> NULL;
-    cdef uintptr_t c_local_offsets = <uintptr_t> NULL;
+    cdef int sz = 0
 
+    cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0]
+    cdef uintptr_t c_edge_weights = <uintptr_t>NULL
+    
     personalization_id_series = None
 
     if weights is not None:
-        c_weights = weights.__cuda_array_interface__['data'][0]
+        c_edge_weights = weights.__cuda_array_interface__['data'][0]
         weight_t = weights.dtype
     else:
         weight_t = np.dtype("float32")
@@ -94,15 +92,19 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1.
         c_pers_val = personalization['values'].__cuda_array_interface__['data'][0]
 
     cdef graph_container_t graph_container
-    populate_graph_container_legacy(graph_container,
-                                    <graphTypeEnum>(<int>(graphTypeEnum.LegacyCSC)),
-                                    handle_[0],
-                                    <void*>c_offsets, <void*>c_indices, <void*>c_weights,
-                                    <numberTypeEnum>(<int>(numberTypeEnum.int32Type)),
-                                    <numberTypeEnum>(<int>(numberTypeEnum.int32Type)),
-                                    <numberTypeEnum>(<int>(numberTypeMap[weight_t])),
-                                    num_verts, num_edges,
-                                    <int*>c_local_verts, <int*>c_local_edges, <int*>c_local_offsets)
+    populate_graph_container(graph_container,
+                             handle_[0],
+                             <void*>c_src_vertices, <void*>c_dst_vertices, <void*>c_edge_weights,
+                             <void*>NULL,
+                             <numberTypeEnum>(<int>(numberTypeEnum.int32Type)),
+                             <numberTypeEnum>(<int>(numberTypeEnum.int32Type)),
+                             <numberTypeEnum>(<int>(numberTypeMap[weight_t])),
+                             #num_verts, num_edges,
+                             num_partition_edges,
+                             num_verts, num_edges,
+                             False,
+                             True,
+                             False)
 
     if (df['pagerank'].dtype == np.float32):
         call_pagerank[int, float](handle_[0], graph_container,
diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py
index 34447e80ee9..ad67fe91876 100644
--- a/python/cugraph/structure/__init__.py
+++ b/python/cugraph/structure/__init__.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph.structure.graph import Graph, DiGraph
+from cugraph.structure.graph import Graph, DiGraph, MultiGraph, MultiDiGraph
 from cugraph.structure.number_map import NumberMap
 from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf
 from cugraph.structure.convert_matrix import (from_edgelist,
diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py
index acc0ad8f066..a3024f9d081 100644
--- a/python/cugraph/structure/graph.py
+++ b/python/cugraph/structure/graph.py
@@ -72,7 +72,6 @@ def __init__(self, offsets, indices, value=None):
     def __init__(
         self,
         m_graph=None,
-        edge_attr=None,
         symmetrized=False,
         bipartite=False,
         multi=False,
@@ -112,24 +111,22 @@ def __init__(
         self.batch_transposed_adjlists = None
 
         if m_graph is not None:
-            if (type(self) is Graph and type(m_graph) is MultiGraph) or (
-                type(self) is DiGraph and type(m_graph) is MultiDiGraph
-            ):
-                self.from_cudf_edgelist(
-                    m_graph.edgelist.edgelist_df,
-                    source="src",
-                    destination="dst",
-                    edge_attr=edge_attr,
-                )
-                self.renumbered = m_graph.renumbered
-                self.renumber_map = m_graph.renumber_map
+            if type(m_graph) is MultiGraph or type(m_graph) is MultiDiGraph:
+                elist = m_graph.view_edge_list()
+                if m_graph.edgelist.weights:
+                    weights = "weights"
+                else:
+                    weights = None
+                self.from_cudf_edgelist(elist,
+                                        source="src",
+                                        destination="dst",
+                                        edge_attr=weights)
             else:
                 msg = (
-                    "Graph can be initialized using MultiGraph "
-                    "and DiGraph can be initialized using MultiDiGraph"
+                    "Graph can only be initialized using MultiGraph "
+                    "or MultiDiGraph"
                 )
                 raise Exception(msg)
-        # self.number_of_vertices = None
 
     def enable_batch(self):
         client = mg_utils.get_client()
@@ -277,6 +274,12 @@ def is_multipartite(self):
         # TO DO: Call coloring algorithm
         return self.multipartite or self.bipartite
 
+    def is_multigraph(self):
+        """
+        Returns True if the graph is a multigraph. Else returns False.
+        """
+        return self.multi
+
     def sets(self):
         """
         Returns the bipartite set of nodes. This solely relies on the user's
@@ -408,24 +411,19 @@ def from_cudf_edgelist(
         source_col = elist[source]
         dest_col = elist[destination]
 
-        if self.multi:
-            if type(edge_attr) is not list:
-                raise Exception("edge_attr should be a list of column names")
-            value_col = {}
-            for col_name in edge_attr:
-                value_col[col_name] = elist[col_name]
-        elif edge_attr is not None:
+        if edge_attr is not None:
             value_col = elist[edge_attr]
         else:
             value_col = None
 
-        if not self.symmetrized and not self.multi:
-            if value_col is not None:
-                source_col, dest_col, value_col = symmetrize(
-                    source_col, dest_col, value_col
-                )
-            else:
-                source_col, dest_col = symmetrize(source_col, dest_col)
+        if value_col is not None:
+            source_col, dest_col, value_col = symmetrize(
+                source_col, dest_col, value_col, multi=self.multi,
+                symmetrize=not self.symmetrized)
+        else:
+            source_col, dest_col = symmetrize(
+                source_col, dest_col, multi=self.multi,
+                symmetrize=not self.symmetrized)
 
         self.edgelist = Graph.EdgeList(source_col, dest_col, value_col)
 
@@ -700,7 +698,7 @@ def view_edge_list(self):
             edgelist_df = self.unrenumber(edgelist_df, "src")
             edgelist_df = self.unrenumber(edgelist_df, "dst")
 
-        if type(self) is Graph:
+        if type(self) is Graph or type(self) is MultiGraph:
             edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]]
             edgelist_df = edgelist_df.reset_index(drop=True)
             self.edge_count = len(edgelist_df)
@@ -992,7 +990,7 @@ def number_of_edges(self, directed_edges=False):
             return len(self.edgelist.edgelist_df)
         if self.edge_count is None:
             if self.edgelist is not None:
-                if type(self) is Graph:
+                if type(self) is Graph or type(self) is MultiGraph:
                     self.edge_count = len(
                         self.edgelist.edgelist_df[
                             self.edgelist.edgelist_df["src"]
@@ -1486,17 +1484,26 @@ def add_internal_vertex_id(
 
 
 class DiGraph(Graph):
-    def __init__(self, m_graph=None, edge_attr=None):
+    """
+    cuGraph directed graph class. Drops parallel edges.
+    """
+    def __init__(self, m_graph=None):
         super().__init__(
-            m_graph=m_graph, edge_attr=edge_attr, symmetrized=True
+            m_graph=m_graph, symmetrized=True
         )
 
 
 class MultiGraph(Graph):
+    """
+    cuGraph class to create and store undirected graphs with parallel edges.
+    """
     def __init__(self, renumbered=True):
         super().__init__(multi=True)
 
 
 class MultiDiGraph(Graph):
+    """
+    cuGraph class to create and store directed graphs with parallel edges.
+    """
     def __init__(self, renumbered=True):
         super().__init__(symmetrized=True, multi=True)
diff --git a/python/cugraph/structure/graph_primtypes.pxd b/python/cugraph/structure/graph_primtypes.pxd
index e46f4092dd4..07132df2598 100644
--- a/python/cugraph/structure/graph_primtypes.pxd
+++ b/python/cugraph/structure/graph_primtypes.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,6 +18,8 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport pair
+from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
@@ -142,6 +144,89 @@ cdef extern from "functions.hpp" namespace "cugraph":
         ET *map_size) except +
 
 
+# renumber_edgelist() interface:
+#
+#
+# 1. `cdef extern partition_t`:
+#
+cdef extern from "experimental/graph_view.hpp" namespace "cugraph::experimental":
+
+    cdef cppclass partition_t[vertex_t]:
+        pass
+
+
+# 2. return type for shuffle:
+#
+cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+
+    cdef cppclass major_minor_weights_t[vertex_t, weight_t]:
+        major_minor_weights_t(const handle_t &handle)
+        pair[unique_ptr[device_buffer], size_t] get_major_wrap()
+        pair[unique_ptr[device_buffer], size_t] get_minor_wrap()
+        pair[unique_ptr[device_buffer], size_t] get_weights_wrap()
+
+
+ctypedef fused shuffled_vertices_t:
+    major_minor_weights_t[int, float]
+    major_minor_weights_t[int, double]
+    major_minor_weights_t[long, float]
+    major_minor_weights_t[long, double]
+    
+# 3. return type for renumber:
+#
+cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+
+    cdef cppclass renum_quad_t[vertex_t, edge_t]:
+        renum_quad_t(const handle_t &handle)
+        pair[unique_ptr[device_buffer], size_t] get_dv_wrap()
+        vertex_t& get_num_vertices()
+        edge_t& get_num_edges()
+        int get_part_row_size()
+        int get_part_col_size()
+        int get_part_comm_rank()
+        unique_ptr[vector[vertex_t]] get_partition_offsets()
+        pair[vertex_t, vertex_t] get_part_local_vertex_range()
+        vertex_t get_part_local_vertex_first()
+        vertex_t get_part_local_vertex_last()
+        pair[vertex_t, vertex_t] get_part_vertex_partition_range(size_t vertex_partition_idx)
+        vertex_t get_part_vertex_partition_first(size_t vertex_partition_idx)
+        vertex_t get_part_vertex_partition_last(size_t vertex_partition_idx)
+        vertex_t get_part_vertex_partition_size(size_t vertex_partition_idx)
+        size_t get_part_number_of_matrix_partitions()
+        vertex_t get_part_matrix_partition_major_first(size_t partition_idx)
+        vertex_t get_part_matrix_partition_major_last(size_t partition_idx)
+        vertex_t get_part_matrix_partition_major_value_start_offset(size_t partition_idx)
+        pair[vertex_t, vertex_t] get_part_matrix_partition_minor_range()
+        vertex_t get_part_matrix_partition_minor_first()
+        vertex_t get_part_matrix_partition_minor_last()        
+
+# 4. `groupby_gpuid_and_shuffle_values()` wrapper:
+#
+cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+
+    cdef unique_ptr[major_minor_weights_t[vertex_t, weight_t]] call_shuffle[vertex_t, edge_t, weight_t](
+        const handle_t &handle,
+        vertex_t *edgelist_major_vertices,
+        vertex_t *edgelist_minor_vertices,
+        weight_t* edgelist_weights,
+        edge_t num_edges,
+        bool is_hyper_partitioned) except +
+
+
+# 5. `renumber_edgelist()` wrapper
+#
+cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
+
+    cdef unique_ptr[renum_quad_t[vertex_t, edge_t]] call_renumber[vertex_t, edge_t](
+        const handle_t &handle,
+        vertex_t *edgelist_major_vertices,
+        vertex_t *edgelist_minor_vertices,
+        edge_t num_edges,
+        bool is_hyper_partitioned,
+        bool do_check,
+        bool multi_gpu) except +
+
+
 cdef extern from "<utility>" namespace "std" nogil:
     cdef unique_ptr[GraphCOO[int,int,float]] move(unique_ptr[GraphCOO[int,int,float]])
     cdef unique_ptr[GraphCOO[int,int,double]] move(unique_ptr[GraphCOO[int,int,double]])
@@ -241,3 +326,16 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython":
         int *local_vertices,
         int *local_edges,
         int *local_offsets) except +
+
+    cdef cppclass cy_multi_edgelists_t:
+        size_t number_of_vertices
+        size_t number_of_edges
+        size_t number_of_subgraph
+        unique_ptr[device_buffer] src_indices
+        unique_ptr[device_buffer] dst_indices
+        unique_ptr[device_buffer] edge_data
+        unique_ptr[device_buffer] subgraph_offsets
+
+cdef extern from "<utility>" namespace "std" nogil:
+    cdef cy_multi_edgelists_t move(cy_multi_edgelists_t)
+    cdef unique_ptr[cy_multi_edgelists_t] move(unique_ptr[cy_multi_edgelists_t])
diff --git a/python/cugraph/structure/number_map.py b/python/cugraph/structure/number_map.py
index f1b8949eb5d..deb2b9f4114 100644
--- a/python/cugraph/structure/number_map.py
+++ b/python/cugraph/structure/number_map.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -895,7 +895,7 @@ def unrenumber(self, df, column_name, preserve_order=False):
         if preserve_order:
             df = df.sort_values(
                 index_name
-            ).drop(index_name).reset_index(drop=True)
+            ).drop(columns=index_name).reset_index(drop=True)
 
         if type(df) is dask_cudf.DataFrame:
             return df.map_partitions(
diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py
index e7fd15144aa..0f4ca90a97c 100644
--- a/python/cugraph/structure/symmetrize.py
+++ b/python/cugraph/structure/symmetrize.py
@@ -16,7 +16,7 @@
 import dask_cudf
 
 
-def symmetrize_df(df, src_name, dst_name):
+def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True):
     """
     Take a COO stored in a DataFrame, along with the column names of
     the source and destination columns and create a new data frame
@@ -42,6 +42,13 @@ def symmetrize_df(df, src_name, dst_name):
         Name of the column in the data frame containing the source ids
     dst_name : string
         Name of the column in the data frame containing the destination ids
+    multi : bool
+        Set to True if graph is a Multi(Di)Graph. This allows multiple
+        edges instead of dropping them.
+    symmetrize : bool
+        Default is True to perform symmetrization. If False only duplicate
+        edges are dropped.
+
     Examples
     --------
     >>> import cugraph.dask as dcg
@@ -54,26 +61,30 @@ def symmetrize_df(df, src_name, dst_name):
     >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight")
     >>> Comms.destroy()
     """
-    gdf = cudf.DataFrame()
-
     #
     #  Now append the columns.  We add sources to the end of destinations,
     #  and destinations to the end of sources.  Otherwise we append a
     #  column onto itself.
     #
-    for idx, name in enumerate(df.columns):
-        if name == src_name:
-            gdf[src_name] = df[src_name].append(
-                df[dst_name], ignore_index=True
-            )
-        elif name == dst_name:
-            gdf[dst_name] = df[dst_name].append(
-                df[src_name], ignore_index=True
-            )
-        else:
-            gdf[name] = df[name].append(df[name], ignore_index=True)
-
-    return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
+    if symmetrize:
+        gdf = cudf.DataFrame()
+        for idx, name in enumerate(df.columns):
+            if name == src_name:
+                gdf[src_name] = df[src_name].append(
+                    df[dst_name], ignore_index=True
+                )
+            elif name == dst_name:
+                gdf[dst_name] = df[dst_name].append(
+                    df[src_name], ignore_index=True
+                )
+            else:
+                gdf[name] = df[name].append(df[name], ignore_index=True)
+    else:
+        gdf = df
+    if multi:
+        return gdf
+    else:
+        return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
 
 
 def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
@@ -105,6 +116,12 @@ def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
         Name of the column in the data frame containing the source ids
     dst_name : string
         Name of the column in the data frame containing the destination ids
+    multi : bool
+        Set to True if graph is a Multi(Di)Graph. This allows multiple
+        edges instead of dropping them.
+    symmetrize : bool
+        Default is True to perform symmetrization. If False only duplicate
+        edges are dropped.
 
     Examples
     --------
@@ -129,7 +146,8 @@ def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
     return result
 
 
-def symmetrize(source_col, dest_col, value_col=None):
+def symmetrize(source_col, dest_col, value_col=None, multi=False,
+               symmetrize=True):
     """
     Take a COO set of source destination pairs along with associated values
     stored in a single GPU or distributed
@@ -190,7 +208,8 @@ def symmetrize(source_col, dest_col, value_col=None):
             input_df, "source", "destination", weight_name
         ).persist()
     else:
-        output_df = symmetrize_df(input_df, "source", "destination")
+        output_df = symmetrize_df(input_df, "source", "destination", multi,
+                                  symmetrize)
 
     if value_col is not None:
         return (
diff --git a/python/cugraph/tests/test_betweenness_centrality.py b/python/cugraph/tests/test_betweenness_centrality.py
old mode 100644
new mode 100755
index 33b2842645d..f338e5aa633
--- a/python/cugraph/tests/test_betweenness_centrality.py
+++ b/python/cugraph/tests/test_betweenness_centrality.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.:
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.:
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -55,7 +55,7 @@
 # Comparison functions
 # =============================================================================
 def calc_betweenness_centrality(
-    graph_file,
+    graph_obj_tuple,
     directed=True,
     k=None,
     normalized=False,
@@ -120,8 +120,7 @@ def calc_betweenness_centrality(
     G = None
     Gnx = None
 
-    G, Gnx = utils.build_cu_and_nx_graphs(graph_file, directed=directed,
-                                          edgevals=edgevals)
+    G, Gnx = graph_obj_tuple
 
     assert G is not None and Gnx is not None
     if multi_gpu_batch:
@@ -299,46 +298,67 @@ def prepare_test():
     gc.collect()
 
 
+# =============================================================================
+# Pytest Fixtures
+# =============================================================================
+DIRECTED = [pytest.param(d) for d in DIRECTED_GRAPH_OPTIONS]
+DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL]
+DATASETS_UNRENUMBERED = [pytest.param(d) for d in utils.DATASETS_UNRENUMBERED]
+WEIGHTED_GRAPH_OPTIONS = [pytest.param(w) for w in WEIGHTED_GRAPH_OPTIONS]
+
+
+small_graph_fixture_params = utils.genFixtureParamsProduct(
+    (DATASETS_SMALL, "grph"),
+    (DIRECTED, "dirctd"),
+    (WEIGHTED_GRAPH_OPTIONS, "wgtd_gph_opts"))
+
+unrenumbered_graph_fixture_params = utils.genFixtureParamsProduct(
+    (DATASETS_UNRENUMBERED, "grph"),
+    (DIRECTED, "dirctd"),
+    (WEIGHTED_GRAPH_OPTIONS, "wgtd_gph_opts"))
+
+
+@pytest.fixture(scope="module", params=small_graph_fixture_params)
+def get_cu_nx_graph_datasets_small(request):
+    return utils.build_cu_and_nx_graphs(*request.param)
+
+
+@pytest.fixture(scope="module", params=unrenumbered_graph_fixture_params)
+def get_cu_nx_graph_datasets_unrenumbered(request):
+    return utils.build_cu_and_nx_graphs(*request.param)
+
+
 # =============================================================================
 # Tests
 # =============================================================================
-@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
-@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS)
 @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS)
 @pytest.mark.parametrize("weight", [None])
 @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS)
 @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS)
 @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS)
-@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS)
 def test_betweenness_centrality(
-    graph_file,
-    directed,
+    get_cu_nx_graph_datasets_small,
     subset_size,
     normalized,
     weight,
     endpoints,
     subset_seed,
     result_dtype,
-    edgevals
 ):
     prepare_test()
     sorted_df = calc_betweenness_centrality(
-        graph_file,
-        directed=directed,
+        get_cu_nx_graph_datasets_small,
         normalized=normalized,
         k=subset_size,
         weight=weight,
         endpoints=endpoints,
         seed=subset_seed,
         result_dtype=result_dtype,
-        edgevals=edgevals,
     )
     compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
 
 
-@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
-@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("subset_size", [None])
 @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS)
 @pytest.mark.parametrize("weight", [None])
@@ -346,10 +366,8 @@ def test_betweenness_centrality(
 @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS)
 @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS)
 @pytest.mark.parametrize("use_k_full", [True])
-@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS)
 def test_betweenness_centrality_k_full(
-    graph_file,
-    directed,
+    get_cu_nx_graph_datasets_small,
     subset_size,
     normalized,
     weight,
@@ -357,14 +375,12 @@ def test_betweenness_centrality_k_full(
     subset_seed,
     result_dtype,
     use_k_full,
-    edgevals
 ):
     """Tests full betweenness centrality by using k = G.number_of_vertices()
     instead of k=None, checks that k scales properly"""
     prepare_test()
     sorted_df = calc_betweenness_centrality(
-        graph_file,
-        directed=directed,
+        get_cu_nx_graph_datasets_small,
         normalized=normalized,
         k=subset_size,
         weight=weight,
@@ -372,7 +388,6 @@ def test_betweenness_centrality_k_full(
         seed=subset_seed,
         result_dtype=result_dtype,
         use_k_full=use_k_full,
-        edgevals=edgevals
     )
     compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
 
@@ -381,25 +396,20 @@ def test_betweenness_centrality_k_full(
 #       the function operating the comparison inside is first proceeding
 #       to a random sampling over the number of vertices (thus direct offsets)
 #       in the graph structure instead of actual vertices identifiers
-@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED)
-@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS)
 @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS)
 @pytest.mark.parametrize("weight", [None])
 @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS)
 @pytest.mark.parametrize("subset_seed", [None])
 @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS)
-@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS)
 def test_betweenness_centrality_fixed_sample(
-    graph_file,
-    directed,
+    get_cu_nx_graph_datasets_unrenumbered,
     subset_size,
     normalized,
     weight,
     endpoints,
     subset_seed,
     result_dtype,
-    edgevals
 ):
     """Test Betweenness Centrality using a subset
 
@@ -407,38 +417,31 @@ def test_betweenness_centrality_fixed_sample(
     """
     prepare_test()
     sorted_df = calc_betweenness_centrality(
-        graph_file,
-        directed=directed,
+        get_cu_nx_graph_datasets_unrenumbered,
         k=subset_size,
         normalized=normalized,
         weight=weight,
         endpoints=endpoints,
         seed=subset_seed,
         result_dtype=result_dtype,
-        edgevals=edgevals
     )
     compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
 
 
-@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
-@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS)
 @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS)
 @pytest.mark.parametrize("weight", [[]])
 @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS)
 @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS)
 @pytest.mark.parametrize("result_dtype", RESULT_DTYPE_OPTIONS)
-@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS)
 def test_betweenness_centrality_weight_except(
-    graph_file,
-    directed,
+    get_cu_nx_graph_datasets_small,
     subset_size,
     normalized,
     weight,
     endpoints,
     subset_seed,
     result_dtype,
-    edgevals
 ):
     """Calls betwenness_centrality with weight
 
@@ -448,53 +451,44 @@ def test_betweenness_centrality_weight_except(
     prepare_test()
     with pytest.raises(NotImplementedError):
         sorted_df = calc_betweenness_centrality(
-            graph_file,
-            directed=directed,
+            get_cu_nx_graph_datasets_small,
             k=subset_size,
             normalized=normalized,
             weight=weight,
             endpoints=endpoints,
             seed=subset_seed,
             result_dtype=result_dtype,
-            edgevals=edgevals
         )
         compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
 
 
-@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL)
-@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS)
 @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS)
 @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS)
 @pytest.mark.parametrize("weight", [None])
 @pytest.mark.parametrize("endpoints", ENDPOINTS_OPTIONS)
 @pytest.mark.parametrize("subset_seed", SUBSET_SEED_OPTIONS)
 @pytest.mark.parametrize("result_dtype", [str])
-@pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS)
 def test_betweenness_invalid_dtype(
-    graph_file,
-    directed,
+    get_cu_nx_graph_datasets_small,
     subset_size,
     normalized,
     weight,
     endpoints,
     subset_seed,
     result_dtype,
-    edgevals
 ):
     """Test calls edge_betwenness_centrality an invalid type"""
 
     prepare_test()
     with pytest.raises(TypeError):
         sorted_df = calc_betweenness_centrality(
-            graph_file,
-            directed=directed,
+            get_cu_nx_graph_datasets_small,
             k=subset_size,
             normalized=normalized,
             weight=weight,
             endpoints=endpoints,
             seed=subset_seed,
             result_dtype=result_dtype,
-            edgevals=edgevals
         )
         compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc")
 
diff --git a/python/cugraph/tests/test_ecg.py b/python/cugraph/tests/test_ecg.py
index 4dc01c389cc..ba705a787ee 100644
--- a/python/cugraph/tests/test_ecg.py
+++ b/python/cugraph/tests/test_ecg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,34 +14,40 @@
 import gc
 
 import pytest
-
 import networkx as nx
 import cugraph
+
 from cugraph.tests import utils
+from cugraph.utilities.utils import is_device_version_less_than
+
+from pathlib import PurePath
 
 
 def cugraph_call(G, min_weight, ensemble_size):
     df = cugraph.ecg(G, min_weight, ensemble_size)
     num_parts = df["partition"].max() + 1
-    score = cugraph.analyzeClustering_modularity(G, num_parts, df,
-                                                 'vertex', 'partition')
+    score = cugraph.analyzeClustering_modularity(
+        G, num_parts, df, "vertex", "partition"
+    )
 
     return score, num_parts
 
 
 def golden_call(graph_file):
-    if graph_file == "../datasets/dolphins.csv":
+    if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "dolphins.csv":
         return 0.4962422251701355
-    if graph_file == "../datasets/karate.csv":
+    if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv":
         return 0.38428664207458496
-    if graph_file == "../datasets/netscience.csv":
+    if (
+        graph_file
+        == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "netscience.csv"
+    ):
         return 0.9279554486274719
 
 
 DATASETS = [
-    "../datasets/karate.csv",
-    "../datasets/dolphins.csv",
-    "../datasets/netscience.csv",
+    PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / f
+    for f in ["karate.csv", "dolphins.csv", "netscience.csv"]
 ]
 
 MIN_WEIGHTS = [0.05, 0.10, 0.15]
@@ -55,20 +61,32 @@ def golden_call(graph_file):
 def test_ecg_clustering(graph_file, min_weight, ensemble_size):
     gc.collect()
 
-    # Read in the graph and get a cugraph object
-    cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
-    G = cugraph.Graph()
-    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
+    if is_device_version_less_than((7, 0)):
+        cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
+        G = cugraph.Graph()
+        G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
 
-    # Get the modularity score for partitioning versus random assignment
-    cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size)
-    golden_score = golden_call(graph_file)
+        # Get the modularity score for partitioning versus random assignment
+        with pytest.raises(RuntimeError):
+            cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size)
+    else:
+        # Read in the graph and get a cugraph object
+        cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
+        G = cugraph.Graph()
+        G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
+
+        # Get the modularity score for partitioning versus random assignment
+        cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size)
+        golden_score = golden_call(graph_file)
 
-    # Assert that the partitioning has better modularity than the random
-    # assignment
-    assert cu_score > (0.95 * golden_score)
+        # Assert that the partitioning has better modularity than the random
+        # assignment
+        assert cu_score > (0.95 * golden_score)
 
 
+@pytest.mark.skipif(
+    is_device_version_less_than((7, 0)), reason="Not supported on Pascal"
+)
 @pytest.mark.parametrize("graph_file", DATASETS)
 @pytest.mark.parametrize("min_weight", MIN_WEIGHTS)
 @pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES)
@@ -78,9 +96,10 @@ def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size):
     # Read in the graph and get a NetworkX graph
     M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
     G = nx.from_pandas_edgelist(
-        M, source="0", target="1", edge_attr="weight",
-        create_using=nx.Graph()
+        M, source="0", target="1", edge_attr="weight", create_using=nx.Graph()
     )
 
     # Get the modularity score for partitioning versus random assignment
-    _ = cugraph.ecg(G, min_weight, ensemble_size, "weight")
+    df_dict = cugraph.ecg(G, min_weight, ensemble_size, "weight")
+
+    assert isinstance(df_dict, dict)
diff --git a/python/cugraph/tests/test_egonet.py b/python/cugraph/tests/test_egonet.py
new file mode 100644
index 00000000000..009fd1252f1
--- /dev/null
+++ b/python/cugraph/tests/test_egonet.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+
+import pytest
+
+import cugraph
+from cugraph.tests import utils
+
+# Temporarily suppress warnings till networkX fixes deprecation warnings
+# (Using or importing the ABCs from 'collections' instead of from
+# 'collections.abc' is deprecated, and in 3.8 it will stop working) for
+# python 3.7.  Also, this import networkx needs to be relocated in the
+# third-party group once this gets fixed.
+import warnings
+
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=DeprecationWarning)
+    import networkx as nx
+
+print("Networkx version : {} ".format(nx.__version__))
+
+SEEDS = [0, 5, 13]
+RADIUS = [1, 2, 3]
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("radius", RADIUS)
+def test_ego_graph_nx(graph_file, seed, radius):
+    gc.collect()
+
+    # Nx
+    df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight"
+    )
+    ego_nx = nx.ego_graph(Gnx, seed, radius=radius)
+
+    # cugraph
+    ego_cugraph = cugraph.ego_graph(Gnx, seed, radius=radius)
+
+    assert nx.is_isomorphic(ego_nx, ego_cugraph)
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+@pytest.mark.parametrize("seeds", [[0, 5, 13]])
+@pytest.mark.parametrize("radius", [1, 2, 3])
+def test_batched_ego_graphs(graph_file, seeds, radius):
+    """
+    Compute the  induced subgraph of neighbors for each node in seeds
+    within a given radius.
+    Parameters
+    ----------
+    G : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix
+        Graph or matrix object, which should contain the connectivity
+        information. Edge weights, if present, should be single or double
+        precision floating point values.
+    seeds : cudf.Series
+        Specifies the seeds of the induced egonet subgraphs
+    radius: integer, optional
+        Include all neighbors of distance<=radius from n.
+
+    Returns
+    -------
+    ego_edge_lists : cudf.DataFrame
+        GPU data frame containing all induced sources identifiers,
+        destination identifiers, edge weights
+    seeds_offsets: cudf.Series
+        Series containing the starting offset in the returned edge list
+        for each seed.
+    """
+    gc.collect()
+
+    # Nx
+    df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight"
+    )
+
+    # cugraph
+    df, offsets = cugraph.batched_ego_graphs(Gnx, seeds, radius=radius)
+    for i in range(len(seeds)):
+        ego_nx = nx.ego_graph(Gnx, seeds[i], radius=radius)
+        ego_df = df[offsets[i]:offsets[i+1]]
+        ego_cugraph = nx.from_pandas_edgelist(ego_df,
+                                              source="src",
+                                              target="dst",
+                                              edge_attr="weight")
+    assert nx.is_isomorphic(ego_nx, ego_cugraph)
diff --git a/python/cugraph/tests/test_force_atlas2.py b/python/cugraph/tests/test_force_atlas2.py
index 4de49cb4088..f399460e5e6 100644
--- a/python/cugraph/tests/test_force_atlas2.py
+++ b/python/cugraph/tests/test_force_atlas2.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 import time
-
 import pytest
 
 import cugraph
@@ -20,6 +19,7 @@
 from cugraph.tests import utils
 from sklearn.manifold import trustworthiness
 import scipy.io
+from pathlib import PurePath
 
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
@@ -61,11 +61,14 @@ def cugraph_call(cu_M, max_iter, pos_list, outbound_attraction_distribution,
 
 
 DATASETS = [
-    ("../datasets/karate.csv", 0.70),
-    ("../datasets/polbooks.csv", 0.75),
-    ("../datasets/dolphins.csv", 0.66),
-    ("../datasets/netscience.csv", 0.66),
+    (PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/f,)+(d,) for (f, d) in [
+        ("karate.csv", 0.70),
+        ("polbooks.csv", 0.75),
+        ("dolphins.csv", 0.66),
+        ("netscience.csv", 0.66)]
 ]
+
+
 MAX_ITERATIONS = [500]
 BARNES_HUT_OPTIMIZE = [False, True]
 
@@ -120,7 +123,7 @@ def test_force_atlas2(graph_file, score, max_iter,
         iterations on a given graph.
     """
 
-    matrix_file = graph_file[:-4] + ".mtx"
+    matrix_file = graph_file.with_suffix(".mtx")
     M = scipy.io.mmread(matrix_file)
     M = M.todense()
     cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
diff --git a/python/cugraph/tests/test_hungarian.py b/python/cugraph/tests/test_hungarian.py
index 280903bc303..4183bcc2c89 100644
--- a/python/cugraph/tests/test_hungarian.py
+++ b/python/cugraph/tests/test_hungarian.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,7 +12,6 @@
 # limitations under the License.
 
 import gc
-from itertools import product
 from timeit import default_timer as timer
 
 import numpy as np
@@ -21,7 +20,6 @@
 import cudf
 import cugraph
 from scipy.optimize import linear_sum_assignment
-import rmm
 
 
 def create_random_bipartite(v1, v2, size, dtype):
@@ -54,33 +52,23 @@ def create_random_bipartite(v1, v2, size, dtype):
     return df1['src'], g, a
 
 
-SPARSE_SIZES = [[5, 5, 100], [500, 500, 10000], [5000, 5000, 100000]]
+SPARSE_SIZES = [[5, 5, 100], [500, 500, 10000]]
+DENSE_SIZES = [[5, 100], [500, 10000]]
 
 
 def setup_function():
     gc.collect()
 
 
-# Test all combinations of default/managed and pooled/non-pooled allocation
-@pytest.mark.parametrize('managed, pool',
-                         list(product([False, True], [False, True])))
 @pytest.mark.parametrize('v1_size, v2_size, weight_limit', SPARSE_SIZES)
-def test_hungarian(managed, pool, v1_size, v2_size, weight_limit):
-    rmm.reinitialize(
-        managed_memory=managed,
-        pool_allocator=pool,
-        initial_pool_size=2 << 27
-    )
-
-    assert(rmm.is_initialized())
-
+def test_hungarian(v1_size, v2_size, weight_limit):
     v1, g, m = create_random_bipartite(v1_size,
                                        v2_size,
                                        weight_limit,
                                        np.float)
 
     start = timer()
-    matching = cugraph.hungarian(g, v1)
+    cugraph_cost, matching = cugraph.hungarian(g, v1)
     end = timer()
 
     print('cugraph time: ', (end - start))
@@ -93,14 +81,29 @@ def test_hungarian(managed, pool, v1_size, v2_size, weight_limit):
 
     scipy_cost = m[np_matching[0], np_matching[1]].sum()
 
-    cugraph_df = matching.merge(g.edgelist.edgelist_df,
-                                left_on=['vertex', 'assignment'],
-                                right_on=['src', 'dst'],
-                                how='left')
+    assert(scipy_cost == cugraph_cost)
+
 
-    cugraph_cost = cugraph_df['weights'].sum()
+@pytest.mark.parametrize('n, weight_limit', DENSE_SIZES)
+def test_dense_hungarian(n, weight_limit):
+    C = np.random.uniform(
+        0, weight_limit, size=(n, n)
+    ).round().astype(np.float32)
+
+    C_series = cudf.Series(C.flatten())
+
+    start = timer()
+    cugraph_cost, matching = cugraph.dense_hungarian(C_series, n, n)
+    end = timer()
+
+    print('cugraph time: ', (end - start))
+
+    start = timer()
+    np_matching = linear_sum_assignment(C)
+    end = timer()
+
+    print('scipy time: ', (end - start))
 
-    print('scipy_cost = ', scipy_cost)
-    print('cugraph_cost = ', cugraph_cost)
+    scipy_cost = C[np_matching[0], np_matching[1]].sum()
 
     assert(scipy_cost == cugraph_cost)
diff --git a/python/cugraph/tests/test_jaccard.py b/python/cugraph/tests/test_jaccard.py
index 3c3f6224d83..b61101ef1d0 100644
--- a/python/cugraph/tests/test_jaccard.py
+++ b/python/cugraph/tests/test_jaccard.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,11 +13,11 @@
 
 import gc
 import time
-
 import pytest
 
 import cugraph
 from cugraph.tests import utils
+from pathlib import PurePath
 
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
@@ -113,7 +113,9 @@ def test_jaccard(graph_file):
     assert err == 0
 
 
-@pytest.mark.parametrize("graph_file", ["../datasets/netscience.csv"])
+@pytest.mark.parametrize("graph_file", [PurePath(
+    utils.RAPIDS_DATASET_ROOT_DIR)/"netscience.csv"]
+)
 def test_jaccard_edgevals(graph_file):
     gc.collect()
 
diff --git a/python/cugraph/tests/test_k_truss_subgraph.py b/python/cugraph/tests/test_k_truss_subgraph.py
index e9ccac81cf6..a86490fb561 100644
--- a/python/cugraph/tests/test_k_truss_subgraph.py
+++ b/python/cugraph/tests/test_k_truss_subgraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -41,7 +41,11 @@
 # currently in networkx master and will hopefully will make it to a release
 # soon.
 def ktruss_ground_truth(graph_file):
-    G = nx.read_edgelist(graph_file, nodetype=int, data=(("weights", float),))
+    G = nx.read_edgelist(
+        str(graph_file),
+        nodetype=int,
+        data=(("weights", float),)
+    )
     df = nx.to_pandas_edgelist(G)
     return df
 
@@ -93,9 +97,6 @@ def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth):
         create_using=nx.Graph()
     )
     k_subgraph = cugraph.k_truss(G, k)
-    df = nx.to_pandas_edgelist(k_subgraph)
-
     k_truss_nx = nx.k_truss(G, k)
-    nx_df = nx.to_pandas_edgelist(k_truss_nx)
 
-    assert len(df) == len(nx_df)
+    assert nx.is_isomorphic(k_subgraph, k_truss_nx)
diff --git a/python/cugraph/tests/test_leiden.py b/python/cugraph/tests/test_leiden.py
index d6a7f86b5c5..89203d5014c 100644
--- a/python/cugraph/tests/test_leiden.py
+++ b/python/cugraph/tests/test_leiden.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,6 +20,8 @@
 import cugraph
 from cugraph.tests import utils
 
+from cugraph.utilities.utils import is_device_version_less_than
+
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
 # 'collections.abc' is deprecated, and in 3.8 it will stop working) for
@@ -53,6 +55,9 @@ def cugraph_louvain(G, edgevals=False):
     return parts, mod
 
 
+@pytest.mark.skipif(
+    is_device_version_less_than((7, 0)), reason="Not supported on Pascal"
+)
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 def test_leiden(graph_file):
     gc.collect()
@@ -73,6 +78,9 @@ def test_leiden(graph_file):
     assert leiden_mod >= (0.99 * louvain_mod)
 
 
+@pytest.mark.skipif(
+    is_device_version_less_than((7, 0)), reason="Not supported on Pascal"
+)
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 def test_leiden_nx(graph_file):
     gc.collect()
@@ -81,18 +89,13 @@ def test_leiden_nx(graph_file):
     NM = utils.read_csv_for_nx(graph_file)
 
     if edgevals:
-        G = nx.from_pandas_edgelist(NM,
-                                    create_using=nx.Graph(),
-                                    source="0",
-                                    target="1"
-                                    )
+        G = nx.from_pandas_edgelist(
+            NM, create_using=nx.Graph(), source="0", target="1"
+        )
     else:
-        G = nx.from_pandas_edgelist(NM,
-                                    create_using=nx.Graph(),
-                                    source="0",
-                                    target="1",
-                                    edge_attr="2"
-                                    )
+        G = nx.from_pandas_edgelist(
+            NM, create_using=nx.Graph(), source="0", target="1", edge_attr="2"
+        )
 
     leiden_parts, leiden_mod = cugraph_leiden(G, edgevals=True)
     louvain_parts, louvain_mod = cugraph_louvain(G, edgevals=True)
diff --git a/python/cugraph/tests/test_louvain.py b/python/cugraph/tests/test_louvain.py
index d6b0030eb73..50e9ccaa4c5 100644
--- a/python/cugraph/tests/test_louvain.py
+++ b/python/cugraph/tests/test_louvain.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,6 +18,7 @@
 
 import cugraph
 from cugraph.tests import utils
+from cugraph.utilities.utils import is_device_version_less_than
 
 # Temporarily suppress warnings till networkX fixes deprecation warnings
 # (Using or importing the ABCs from 'collections' instead of from
@@ -71,51 +72,63 @@ def networkx_call(M):
 def test_louvain_with_edgevals(graph_file):
     gc.collect()
 
-    M = utils.read_csv_for_nx(graph_file)
-    cu_M = utils.read_csv_file(graph_file)
-    cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True)
+    if is_device_version_less_than((7, 0)):
+        cu_M = utils.read_csv_file(graph_file)
+        with pytest.raises(RuntimeError):
+            cu_parts, cu_mod = cugraph_call(cu_M)
+    else:
+        M = utils.read_csv_for_nx(graph_file)
+        cu_M = utils.read_csv_file(graph_file)
+        cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True)
 
-    nx_parts = networkx_call(M)
-    # Calculating modularity scores for comparison
-    Gnx = nx.from_pandas_edgelist(
-        M, source="0", target="1", edge_attr="weight", create_using=nx.Graph()
-    )
+        nx_parts = networkx_call(M)
+        # Calculating modularity scores for comparison
+        Gnx = nx.from_pandas_edgelist(
+            M, source="0", target="1",
+            edge_attr="weight", create_using=nx.Graph()
+        )
 
-    cu_parts = cu_parts.to_pandas()
-    cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition']))
+        cu_parts = cu_parts.to_pandas()
+        cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"]))
 
-    assert set(nx_parts.keys()) == set(cu_map.keys())
+        assert set(nx_parts.keys()) == set(cu_map.keys())
 
-    cu_mod_nx = community.modularity(cu_map, Gnx)
-    nx_mod = community.modularity(nx_parts, Gnx)
+        cu_mod_nx = community.modularity(cu_map, Gnx)
+        nx_mod = community.modularity(nx_parts, Gnx)
 
-    assert len(cu_parts) == len(nx_parts)
-    assert cu_mod > (0.82 * nx_mod)
-    assert abs(cu_mod - cu_mod_nx) < 0.0001
+        assert len(cu_parts) == len(nx_parts)
+        assert cu_mod > (0.82 * nx_mod)
+        assert abs(cu_mod - cu_mod_nx) < 0.0001
 
 
 @pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED)
 def test_louvain(graph_file):
     gc.collect()
 
-    M = utils.read_csv_for_nx(graph_file)
-    cu_M = utils.read_csv_file(graph_file)
-    cu_parts, cu_mod = cugraph_call(cu_M)
-    nx_parts = networkx_call(M)
+    if is_device_version_less_than((7, 0)):
+        cu_M = utils.read_csv_file(graph_file)
+        with pytest.raises(RuntimeError):
+            cu_parts, cu_mod = cugraph_call(cu_M)
+    else:
+        M = utils.read_csv_for_nx(graph_file)
+        cu_M = utils.read_csv_file(graph_file)
+        cu_parts, cu_mod = cugraph_call(cu_M)
+        nx_parts = networkx_call(M)
 
-    # Calculating modularity scores for comparison
-    Gnx = nx.from_pandas_edgelist(
-        M, source="0", target="1", edge_attr="weight", create_using=nx.Graph()
-    )
+        # Calculating modularity scores for comparison
+        Gnx = nx.from_pandas_edgelist(
+            M, source="0", target="1",
+            edge_attr="weight", create_using=nx.Graph()
+        )
 
-    cu_parts = cu_parts.to_pandas()
-    cu_map = dict(zip(cu_parts['vertex'], cu_parts['partition']))
+        cu_parts = cu_parts.to_pandas()
+        cu_map = dict(zip(cu_parts["vertex"], cu_parts["partition"]))
 
-    assert set(nx_parts.keys()) == set(cu_map.keys())
+        assert set(nx_parts.keys()) == set(cu_map.keys())
 
-    cu_mod_nx = community.modularity(cu_map, Gnx)
-    nx_mod = community.modularity(nx_parts, Gnx)
+        cu_mod_nx = community.modularity(cu_map, Gnx)
+        nx_mod = community.modularity(nx_parts, Gnx)
 
-    assert len(cu_parts) == len(nx_parts)
-    assert cu_mod > (0.82 * nx_mod)
-    assert abs(cu_mod - cu_mod_nx) < 0.0001
+        assert len(cu_parts) == len(nx_parts)
+        assert cu_mod > (0.82 * nx_mod)
+        assert abs(cu_mod - cu_mod_nx) < 0.0001
diff --git a/python/cugraph/tests/test_multigraph.py b/python/cugraph/tests/test_multigraph.py
new file mode 100644
index 00000000000..cb659bc7e24
--- /dev/null
+++ b/python/cugraph/tests/test_multigraph.py
@@ -0,0 +1,104 @@
+import cugraph
+import networkx as nx
+from cugraph.tests import utils
+import pytest
+import gc
+import numpy as np
+
+
+# =============================================================================
+# Pytest Setup / Teardown - called for each test function
+# =============================================================================
+def setup_function():
+    gc.collect()
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_multigraph(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    G = cugraph.MultiDiGraph()
+    G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiDiGraph(),
+    )
+
+    assert G.number_of_edges() == Gnx.number_of_edges()
+    assert G.number_of_nodes() == Gnx.number_of_nodes()
+    cuedges = cugraph.to_pandas_edgelist(G)
+    cuedges.rename(columns={"src": "source", "dst": "target",
+                   "weights": "weight"}, inplace=True)
+    cuedges["weight"] = cuedges["weight"].round(decimals=3)
+    nxedges = nx.to_pandas_edgelist(Gnx).astype(dtype={"source": "int32",
+                                                       "target": "int32",
+                                                       "weight": "float32"})
+    cuedges = cuedges.sort_values(by=["source", "target"]).\
+        reset_index(drop=True)
+    nxedges = nxedges.sort_values(by=["source", "target"]).\
+        reset_index(drop=True)
+    nxedges["weight"] = nxedges["weight"].round(decimals=3)
+    assert nxedges.equals(cuedges[["source", "target", "weight"]])
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_Graph_from_MultiGraph(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    GM = cugraph.MultiGraph()
+    GM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    GnxM = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiGraph(),
+    )
+
+    G = cugraph.Graph(GM)
+    Gnx = nx.Graph(GnxM)
+    assert Gnx.number_of_edges() == G.number_of_edges()
+
+    GdM = cugraph.MultiDiGraph()
+    GdM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    GnxdM = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiGraph(),
+    )
+    Gd = cugraph.DiGraph(GdM)
+    Gnxd = nx.DiGraph(GnxdM)
+    assert Gnxd.number_of_edges() == Gd.number_of_edges()
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_multigraph_sssp(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    G = cugraph.MultiDiGraph()
+    G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    cu_paths = cugraph.sssp(G, 0)
+    max_val = np.finfo(cu_paths["distance"].dtype).max
+    cu_paths = cu_paths[cu_paths["distance"] != max_val]
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiDiGraph(),
+    )
+    nx_paths = nx.single_source_dijkstra_path_length(Gnx, 0)
+
+    cu_dist = cu_paths.sort_values(by='vertex')['distance'].to_array()
+    nx_dist = [i[1] for i in sorted(nx_paths.items())]
+
+    assert (cu_dist == nx_dist).all()
diff --git a/python/cugraph/tests/test_pagerank.py b/python/cugraph/tests/test_pagerank.py
index 1ab370041b5..3ce8dd4ffe9 100644
--- a/python/cugraph/tests/test_pagerank.py
+++ b/python/cugraph/tests/test_pagerank.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -143,7 +143,7 @@ def networkx_call(Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx):
 #
 # https://github.com/rapidsai/cugraph/issues/533
 #
-# @pytest.mark.parametrize("graph_file", utils.DATASETS)
+
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 @pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
 @pytest.mark.parametrize("tol", TOLERANCE)
@@ -159,7 +159,8 @@ def test_pagerank(
     M = utils.read_csv_for_nx(graph_file)
     nnz_vtx = np.unique(M[['0', '1']])
     Gnx = nx.from_pandas_edgelist(
-        M, source="0", target="1", create_using=nx.DiGraph()
+        M, source="0", target="1", edge_attr="weight",
+        create_using=nx.DiGraph()
     )
 
     networkx_pr, networkx_prsn = networkx_call(
@@ -169,13 +170,13 @@ def test_pagerank(
     cu_nstart = None
     if has_guess == 1:
         cu_nstart = cudify(networkx_pr)
-        max_iter = 5
+        max_iter = 20
     cu_prsn = cudify(networkx_prsn)
 
     # cuGraph PageRank
     cu_M = utils.read_csv_file(graph_file)
     G = cugraph.DiGraph()
-    G.from_cudf_edgelist(cu_M, source="0", destination="1")
+    G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2")
 
     cugraph_pr = cugraph_call(G, max_iter, tol, alpha, cu_prsn, cu_nstart)
 
@@ -218,7 +219,7 @@ def test_pagerank_nx(
     cu_nstart = None
     if has_guess == 1:
         cu_nstart = cudify(networkx_pr)
-        max_iter = 5
+        max_iter = 20
     cu_prsn = cudify(networkx_prsn)
 
     # cuGraph PageRank with Nx Graph
diff --git a/python/cugraph/tests/test_subgraph_extraction.py b/python/cugraph/tests/test_subgraph_extraction.py
index a4f36af994a..9e9eccc4347 100644
--- a/python/cugraph/tests/test_subgraph_extraction.py
+++ b/python/cugraph/tests/test_subgraph_extraction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -123,10 +123,9 @@ def test_subgraph_extraction_Graph_nx(graph_file):
         )
 
     nx_sub = nx.subgraph(G, verts)
-    nx_df = nx.to_pandas_edgelist(nx_sub).to_dict()
 
     cu_verts = cudf.Series(verts)
     cu_sub = cugraph.subgraph(G, cu_verts)
-    cu_df = nx.to_pandas_edgelist(cu_sub).to_dict()
 
-    assert nx_df == cu_df
+    for (u, v) in cu_sub.edges():
+        assert nx_sub.has_edge(u, v)
diff --git a/python/cugraph/tests/test_traveling_salesperson.py b/python/cugraph/tests/test_traveling_salesperson.py
new file mode 100644
index 00000000000..d43b55c43d0
--- /dev/null
+++ b/python/cugraph/tests/test_traveling_salesperson.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.tests import utils
+import cudf
+import cugraph
+import gc
+import numpy as np
+import pytest
+
+# Temporarily suppress warnings till networkX fixes deprecation warnings
+# (Using or importing the ABCs from 'collections' instead of from
+# 'collections.abc' is deprecated, and in 3.8 it will stop working) for
+# python 3.7.  Also, this import networkx needs to be relocated in the
+# third-party group once this gets fixed.
+import warnings
+
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=DeprecationWarning)
+    import networkx as nx
+
+print("Networkx version : {} ".format(nx.__version__))
+
+
+# =============================================================================
+# Pytest Setup / Teardown - called for each test function
+# =============================================================================
+def setup_function():
+    gc.collect()
+
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+def load_tsp(filename=None):
+    gdf = cudf.read_csv(filename,
+                        delim_whitespace=True,
+                        skiprows=6,
+                        names=["vertex", "x", "y"],
+                        dtypes={"vertex": "int32",
+                                "x": "float32",
+                                "y": "float32"}
+                        )
+    gdf = gdf.dropna()
+    gdf['vertex'] = gdf['vertex'].str.strip()
+    gdf['vertex'] = gdf['vertex'].astype("int32")
+    return gdf
+
+
+# =============================================================================
+# Tests
+# =============================================================================
+@pytest.mark.parametrize("tsplib_file, ref_cost", utils.DATASETS_TSPLIB)
+def test_traveling_salesperson(gpubenchmark, tsplib_file, ref_cost):
+    pos_list = load_tsp(tsplib_file)
+
+    cu_route, cu_cost = gpubenchmark(cugraph.traveling_salesperson,
+                                     pos_list,
+                                     restarts=4096)
+
+    print("Cugraph cost: ", cu_cost)
+    print("Ref cost: ", ref_cost)
+    error = np.abs(cu_cost - ref_cost) / ref_cost
+    print("Approximation error is: {:.2f}%".format(error * 100))
+    # Check we are within 5% of TSPLIB
+    assert(error * 100 < 5.)
+    assert(cu_route.nunique() == pos_list.shape[0])
+    assert(cu_route.shape[0] == pos_list.shape[0])
+    min_val = pos_list["vertex"].min()
+    max_val = pos_list["vertex"].max()
+    assert(cu_route.clip(min_val, max_val).shape[0] == cu_route.shape[0])
diff --git a/python/cugraph/tests/test_utils.py b/python/cugraph/tests/test_utils.py
index 22af649ea2e..55410817f90 100644
--- a/python/cugraph/tests/test_utils.py
+++ b/python/cugraph/tests/test_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -13,16 +13,16 @@
 
 import gc
 import pytest
-
 import cugraph
 from cugraph.tests import utils
+from pathlib import PurePath
 
 
 def test_bfs_paths():
     with pytest.raises(ValueError) as ErrorMsg:
         gc.collect()
 
-        graph_file = '../datasets/karate.csv'
+        graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv"
 
         cu_M = utils.read_csv_file(graph_file)
 
@@ -47,7 +47,7 @@ def test_bfs_paths_array():
     with pytest.raises(ValueError) as ErrorMsg:
         gc.collect()
 
-        graph_file = '../datasets/karate.csv'
+        graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv"
 
         cu_M = utils.read_csv_file(graph_file)
 
diff --git a/python/cugraph/tests/utils.py b/python/cugraph/tests/utils.py
old mode 100644
new mode 100755
index 164c6efb084..c2c14e0c02d
--- a/python/cugraph/tests/utils.py
+++ b/python/cugraph/tests/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -26,7 +26,7 @@
 from scipy.sparse.coo import coo_matrix as sp_coo_matrix
 from scipy.sparse.csr import csr_matrix as sp_csr_matrix
 from scipy.sparse.csc import csc_matrix as sp_csc_matrix
-
+from pathlib import PurePath
 import cudf
 import dask_cudf
 
@@ -40,37 +40,59 @@
 #
 # Datasets
 #
-DATASETS_UNDIRECTED = ["../datasets/karate.csv", "../datasets/dolphins.csv"]
+
+
+RAPIDS_DATASET_ROOT_DIR = os.getenv("RAPIDS_DATASET_ROOT_DIR", "../datasets")
+
+DATASETS_UNDIRECTED = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for
+                       f in ["karate.csv", "dolphins.csv"]]
 
 DATASETS_UNDIRECTED_WEIGHTS = [
-    "../datasets/netscience.csv",
+    PurePath(RAPIDS_DATASET_ROOT_DIR)/"netscience.csv"
 ]
 
-DATASETS_UNRENUMBERED = ["../datasets/karate-disjoint.csv"]
+DATASETS_UNRENUMBERED = [PurePath(
+    RAPIDS_DATASET_ROOT_DIR)/"karate-disjoint.csv"
+]
 
-DATASETS = [
-    "../datasets/karate-disjoint.csv",
-    "../datasets/dolphins.csv",
-    "../datasets/netscience.csv",
+DATASETS = [PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [
+    "karate-disjoint.csv",
+    "dolphins.csv",
+    "netscience.csv"]
 ]
+
+
 #            '../datasets/email-Eu-core.csv']
 
 STRONGDATASETS = [
-    "../datasets/dolphins.csv",
-    "../datasets/netscience.csv",
-    "../datasets/email-Eu-core.csv",
+    PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [
+        "dolphins.csv",
+        "netscience.csv",
+        "email-Eu-core.csv"]
 ]
 
-DATASETS_KTRUSS = [
-    ("../datasets/polbooks.csv", "../datasets/ref/ktruss/polbooks.csv")
+
+DATASETS_KTRUSS = [(
+    PurePath(RAPIDS_DATASET_ROOT_DIR)/"polbooks.csv",
+    PurePath(RAPIDS_DATASET_ROOT_DIR)/"ref/ktruss/polbooks.csv")
+]
+
+DATASETS_TSPLIB = [
+        (PurePath(RAPIDS_DATASET_ROOT_DIR)/f,) + (d,) for (f, d) in [
+            ("gil262.tsp", 2378),
+            ("eil51.tsp", 426),
+            ("kroA100.tsp", 21282),
+            ("tsp225.tsp", 3916)]
 ]
 
 DATASETS_SMALL = [
-    "../datasets/karate.csv",
-    "../datasets/dolphins.csv",
-    "../datasets/polbooks.csv",
+    PurePath(RAPIDS_DATASET_ROOT_DIR)/f for f in [
+        "karate.csv",
+        "dolphins.csv",
+        "polbooks.csv"]
 ]
 
+
 MATRIX_INPUT_TYPES = [
     pytest.param(
         cp_coo_matrix, marks=pytest.mark.matrix_types, id="CuPy.coo_matrix"
diff --git a/python/cugraph/traversal/__init__.py b/python/cugraph/traversal/__init__.py
index 58e37a7add0..5944ebe0865 100644
--- a/python/cugraph/traversal/__init__.py
+++ b/python/cugraph/traversal/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,4 +18,5 @@
     shortest_path,
     filter_unreachable,
     shortest_path_length
-)
\ No newline at end of file
+)
+from cugraph.traversal.traveling_salesperson import traveling_salesperson
diff --git a/python/cugraph/traversal/sssp.py b/python/cugraph/traversal/sssp.py
index 4ba754ad4ed..8d77e6e9312 100644
--- a/python/cugraph/traversal/sssp.py
+++ b/python/cugraph/traversal/sssp.py
@@ -14,7 +14,7 @@
 import numpy as np
 
 import cudf
-from cugraph.structure import Graph, DiGraph
+from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph
 from cugraph.traversal import sssp_wrapper
 from cugraph.utilities import (ensure_cugraph_obj,
                                is_matrix_type,
@@ -104,7 +104,7 @@ def _convert_df_to_output_type(df, input_type, return_predecessors):
     return_predecessors is only used for return values from cupy/scipy input
     types.
     """
-    if input_type in [Graph, DiGraph]:
+    if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]:
         return df
 
     elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
diff --git a/python/cugraph/traversal/traveling_salesperson.pxd b/python/cugraph/traversal/traveling_salesperson.pxd
new file mode 100644
index 00000000000..956c7da0978
--- /dev/null
+++ b/python/cugraph/traversal/traveling_salesperson.pxd
@@ -0,0 +1,34 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.structure.graph_primtypes cimport *
+
+cdef extern from "algorithms.hpp" namespace "cugraph":
+
+    cdef float traveling_salesperson(const handle_t &handle,
+            int *vtx_ptr,
+            float *x_pos,
+            float *y_pos,
+            int nodes,
+            int restarts,
+            bool beam_search,
+            int k,
+            int nstart,
+            bool verbose,
+            int *route) except +
+
diff --git a/python/cugraph/traversal/traveling_salesperson.py b/python/cugraph/traversal/traveling_salesperson.py
new file mode 100644
index 00000000000..ae17555e4ea
--- /dev/null
+++ b/python/cugraph/traversal/traveling_salesperson.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.traversal import traveling_salesperson_wrapper
+from cugraph.structure.graph import null_check
+import cudf
+
+
+def traveling_salesperson(pos_list,
+                          restarts=100000,
+                          beam_search=True,
+                          k=4,
+                          nstart=None,
+                          verbose=False,
+                          ):
+    """
+    Finds an approximate solution to the traveling salesperson problem (TSP).
+    cuGraph computes an approximation of the TSP problem using hill climbing
+    optimization.
+
+    The current implementation does not support a weighted graph.
+    Parameters
+    ----------
+    pos_list: cudf.DataFrame
+        Data frame with initial vertex positions containing three columns:
+        'vertex' ids and 'x', 'y' positions.
+    restarts: int
+        Number of starts to try. The more restarts, the better the solution
+        will be approximated. The number of restarts depends on the problem
+        size and should be kept low for instances above 2k cities.
+    beam_search: bool
+        Specify if the initial solution should use KNN for an approximation
+        solution.
+    k: int
+        Beam width to use in the search.
+    nstart: int
+        Vertex id to use as starting position.
+    verbose: bool
+        Logs configuration and iterative improvement.
+
+    Returns
+    -------
+    route : cudf.Series
+        cudf.Series of size V containing the ordered list of vertices
+        than needs to be visited.
+    """
+
+    if not isinstance(pos_list, cudf.DataFrame):
+        raise TypeError("Instance should be cudf.DataFrame")
+
+    null_check(pos_list['vertex'])
+    null_check(pos_list['x'])
+    null_check(pos_list['y'])
+
+    if nstart is not None and not pos_list[pos_list['vertex'] == nstart].index:
+        raise ValueError("nstart should be in vertex ids")
+
+    route, cost = traveling_salesperson_wrapper.traveling_salesperson(
+            pos_list,
+            restarts,
+            beam_search,
+            k,
+            nstart,
+            verbose)
+    return route, cost
diff --git a/python/cugraph/traversal/traveling_salesperson_wrapper.pyx b/python/cugraph/traversal/traveling_salesperson_wrapper.pyx
new file mode 100644
index 00000000000..5f87c42a638
--- /dev/null
+++ b/python/cugraph/traversal/traveling_salesperson_wrapper.pyx
@@ -0,0 +1,83 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cugraph.traversal.traveling_salesperson cimport traveling_salesperson as c_traveling_salesperson
+from cugraph.structure import graph_primtypes_wrapper
+from cugraph.structure.graph_primtypes cimport *
+from libcpp cimport bool
+from libc.stdint cimport uintptr_t
+from numba import cuda
+
+import cudf
+import numpy as np
+
+
+def traveling_salesperson(pos_list,
+                          restarts=100000,
+                          beam_search=True,
+                          k=4,
+                          nstart=None,
+                          verbose=False,
+                          renumber=True,
+):
+    """
+    Call traveling_salesperson
+    """
+
+    nodes = pos_list.shape[0]
+    cdef uintptr_t x_pos = <uintptr_t>NULL
+    cdef uintptr_t y_pos = <uintptr_t>NULL
+
+    pos_list['vertex'] = pos_list['vertex'].astype(np.int32)
+    pos_list['x'] = pos_list['x'].astype(np.float32)
+    pos_list['y'] = pos_list['y'].astype(np.float32)
+    x_pos = pos_list['x'].__cuda_array_interface__['data'][0]
+    y_pos = pos_list['y'].__cuda_array_interface__['data'][0]
+
+    cdef unique_ptr[handle_t] handle_ptr
+    handle_ptr.reset(new handle_t())
+    handle_ = handle_ptr.get();
+
+    cdef float final_cost = 0.0
+
+    cdef uintptr_t route_ptr = <uintptr_t>NULL
+    route_arr = cuda.device_array(nodes, dtype=np.int32)
+    route_ptr = route_arr.device_ctypes_pointer.value
+
+    cdef uintptr_t vtx_ptr = <uintptr_t>NULL
+    vtx_ptr = pos_list['vertex'].__cuda_array_interface__['data'][0]
+
+    if nstart is None:
+      renumbered_nstart = 0
+    else:
+      renumbered_nstart = pos_list[pos_list['vertex'] == nstart].index[0]
+
+    final_cost = c_traveling_salesperson(handle_[0],
+            <int*> vtx_ptr,
+            <float*> x_pos,
+            <float*> y_pos,
+            <int> nodes,
+            <int> restarts,
+            <bool> beam_search,
+            <int> k,
+            <int> renumbered_nstart,
+            <bool> verbose,
+            <int*> route_ptr)
+
+    route = cudf.Series(route_arr)
+    return route, final_cost
diff --git a/python/cugraph/tree/minimum_spanning_tree.pxd b/python/cugraph/tree/minimum_spanning_tree.pxd
index 8cea2bee0cc..a38aee96605 100644
--- a/python/cugraph/tree/minimum_spanning_tree.pxd
+++ b/python/cugraph/tree/minimum_spanning_tree.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/python/cugraph/utilities/utils.py b/python/cugraph/utilities/utils.py
index b77f6789abe..39b789d7f79 100644
--- a/python/cugraph/utilities/utils.py
+++ b/python/cugraph/utilities/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -219,11 +219,11 @@ def ensure_cugraph_obj(obj, nx_weight_attr=None, matrix_graph_type=None):
     cugraph Graph-type obj to create when converting from a matrix type.
     """
     # FIXME: importing here to avoid circular import
-    from cugraph.structure import Graph, DiGraph
+    from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph
     from cugraph.utilities.nx_factory import convert_from_nx
 
     input_type = type(obj)
-    if input_type in [Graph, DiGraph]:
+    if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]:
         return (obj, input_type)
 
     elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):
diff --git a/python/pytest.ini b/python/pytest.ini
index fb8c6ea0948..a1933ea34aa 100644
--- a/python/pytest.ini
+++ b/python/pytest.ini
@@ -5,6 +5,8 @@ addopts =
            --benchmark-min-rounds=1
            --benchmark-columns="mean, rounds"
            --benchmark-gpu-disable
+           --cov=cugraph
+           --cov-report term-missing:skip-covered
 
 markers =
           managedmem_on: RMM managed memory enabled
diff --git a/python/setuputils.py b/python/setuputils.py
index 360526c2b56..47eaf74d4b6 100644
--- a/python/setuputils.py
+++ b/python/setuputils.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -79,8 +79,8 @@ def use_raft_package(raft_path, cpp_build_path,
         if you want to change RAFT location.
     - Uses RAFT located in $RAFT_PATH if $RAFT_PATH exists.
     - Otherwise it will look for RAFT in the libcugraph build folder,
-        located either in the default location ../cpp/build or in
-        $CUGRAPH_BUILD_PATH.
+        located either in the default locations ../cpp/build/raft,
+        ../cpp/build/_deps/raft-src, or in $CUGRAPH_BUILD_PATH.
     -Otherwise it will clone RAFT into _external_repositories.
         - Branch/git tag cloned is located in git_info_file in this case.
 
@@ -88,17 +88,25 @@ def use_raft_package(raft_path, cpp_build_path,
      -------
      raft_include_path: Str
          Path to the C++ include folder of RAFT
+
     """
     if os.path.isdir('cugraph/raft'):
         raft_path = os.path.realpath('cugraph/raft')
         # walk up two dirs from `python/raft`
         raft_path = os.path.join(raft_path, '..', '..')
         print("-- Using existing RAFT folder")
+    elif cpp_build_path and os.path.isdir(os.path.join(cpp_build_path,
+                                                       '_deps/raft-src')):
+        raft_path = os.path.join(cpp_build_path, '_deps/raft-src')
+        raft_path = os.path.realpath(raft_path)
+        print("-- Using existing RAFT folder in CPP build dir from cmake "
+              "FetchContent")
     elif cpp_build_path and os.path.isdir(os.path.join(cpp_build_path,
                                                        'raft/src/raft')):
         raft_path = os.path.join(cpp_build_path, 'raft/src/raft')
         raft_path = os.path.realpath(raft_path)
-        print("-- Using existing RAFT folder in CPP build dir")
+        print("-- Using existing RAFT folder in CPP build dir from cmake "
+              "ExternalProject")
     elif isinstance(raft_path, (str, os.PathLike)):
         print('-- Using RAFT_PATH argument')
     elif os.environ.get('RAFT_PATH', False) is not False:
@@ -254,8 +262,8 @@ def get_repo_cmake_info(names, file_path):
         `ExternalProject_Add(name`
     file_path : String
         Relative path of the location of the CMakeLists.txt (or the cmake
-        module which contains ExternalProject_Add definitions) to extract
-        the information.
+        module which contains FetchContent_Declare or ExternalProject_Add
+        definitions) to extract the information.
 
     Returns
     -------
@@ -264,22 +272,31 @@ def get_repo_cmake_info(names, file_path):
         where results[name][0] is the url of the repo and
         repo_info[repo][1] is the tag/commit hash to be cloned as
         specified by cmake.
-
     """
     with open(file_path, encoding='utf-8') as f:
         s = f.read()
 
     results = {}
 
+    cmake_ext_proj_decls = ["FetchContent_Declare", "ExternalProject_Add"]
+
     for name in names:
-        res = re.findall(r'ExternalProject_Add\(' + re.escape(name)
-                         + '\s.*GIT_REPOSITORY.*\s.*GIT_TAG.*',  # noqa: W605
-                         s)
-
-        res = re.sub(' +', ' ', res[0])
-        res = res.split(' ')
-        res = [res[2][:-1], res[4]]
-        results[name] = res
+        res = None
+        for decl in cmake_ext_proj_decls:
+            res = re.search(f'{decl}\(\s*'                        # noqa: W605
+                            + '(' + re.escape(name) + ')'
+                            + '\s+.*GIT_REPOSITORY\s+(\S+)\s+.+'  # noqa: W605
+                            + '\s+.*GIT_TAG\s+(\S+)',             # noqa: W605
+                            s)
+            if res:
+                break
+        if res is None:
+            raise RuntimeError('Could not find any of the following '
+                               f'statements: {cmake_ext_proj_decls}, for '
+                               f'module "{name}" in file "{file_path}" with '
+                               'GIT_REPOSITORY and GIT_TAG settings')
+
+        results[res.group(1)] = [res.group(2), res.group(3)]
 
     return results