From 5c05a88b7cf9e007d9d3e4f9519d115dbf781478 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 25 Mar 2021 14:19:09 -0400 Subject: [PATCH 1/6] DOC v0.20 Updates --- CHANGELOG.md | 4 ++++ conda/environments/cugraph_dev_cuda10.1.yml | 16 ++++++++-------- conda/environments/cugraph_dev_cuda10.2.yml | 16 ++++++++-------- conda/environments/cugraph_dev_cuda11.0.yml | 16 ++++++++-------- cpp/CMakeLists.txt | 2 +- docs/source/conf.py | 4 ++-- 6 files changed, 31 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0011b99fbf3..a7b34d3e0fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuGraph 0.20.0 (Date TBD) + +Please see https://github.com/rapidsai//releases/tag/v0.20.0a for the latest changes to this development branch. + # cuGraph 0.19.0 (Date TBD) Please see https://github.com/rapidsai/cugraph/releases/tag/v0.19.0a for the latest changes to this development branch. diff --git a/conda/environments/cugraph_dev_cuda10.1.yml b/conda/environments/cugraph_dev_cuda10.1.yml index f26c3dd45d9..cc2b0538fb1 100644 --- a/conda/environments/cugraph_dev_cuda10.1.yml +++ b/conda/environments/cugraph_dev_cuda10.1.yml @@ -5,17 +5,17 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.19.* -- libcudf=0.19.* -- rmm=0.19.* -- cuxfilter=0.19.* -- librmm=0.19.* +- cudf=0.20.* +- libcudf=0.20.* +- rmm=0.20.* +- cuxfilter=0.20.* +- librmm=0.20.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.19* -- dask-cudf=0.19* +- dask-cuda=0.20* +- dask-cudf=0.20* - nccl>=2.8.4 -- ucx-py=0.19* +- ucx-py=0.20* - ucx-proc=*=gpu - scipy - networkx diff --git a/conda/environments/cugraph_dev_cuda10.2.yml b/conda/environments/cugraph_dev_cuda10.2.yml index 2848cc49dc7..06cd917db9d 100644 --- a/conda/environments/cugraph_dev_cuda10.2.yml +++ b/conda/environments/cugraph_dev_cuda10.2.yml @@ -5,17 +5,17 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.19.* -- libcudf=0.19.* -- rmm=0.19.* -- cuxfilter=0.19.* -- 
librmm=0.19.* +- cudf=0.20.* +- libcudf=0.20.* +- rmm=0.20.* +- cuxfilter=0.20.* +- librmm=0.20.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.19* -- dask-cudf=0.19* +- dask-cuda=0.20* +- dask-cudf=0.20* - nccl>=2.8.4 -- ucx-py=0.19* +- ucx-py=0.20* - ucx-proc=*=gpu - scipy - networkx diff --git a/conda/environments/cugraph_dev_cuda11.0.yml b/conda/environments/cugraph_dev_cuda11.0.yml index 82e8b409d13..00f202a6025 100644 --- a/conda/environments/cugraph_dev_cuda11.0.yml +++ b/conda/environments/cugraph_dev_cuda11.0.yml @@ -5,17 +5,17 @@ channels: - rapidsai-nightly - conda-forge dependencies: -- cudf=0.19.* -- libcudf=0.19.* -- rmm=0.19.* -- cuxfilter=0.19.* -- librmm=0.19.* +- cudf=0.20.* +- libcudf=0.20.* +- rmm=0.20.* +- cuxfilter=0.20.* +- librmm=0.20.* - dask>=2.12.0 - distributed>=2.12.0 -- dask-cuda=0.19* -- dask-cudf=0.19* +- dask-cuda=0.20* +- dask-cudf=0.20* - nccl>=2.8.4 -- ucx-py=0.19* +- ucx-py=0.20* - ucx-proc=*=gpu - scipy - networkx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 34ea935e31d..7593a5cb89e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.18...3.18 FATAL_ERROR) -project(CUGRAPH VERSION 0.19.0 LANGUAGES C CXX CUDA) +project(CUGRAPH VERSION 0.20.0 LANGUAGES C CXX CUDA) # Write the version header include(cmake/Modules/Version.cmake) diff --git a/docs/source/conf.py b/docs/source/conf.py index eb4745a61f0..77053a3468a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '0.19' +version = '0.20' # The full version, including alpha/beta/rc tags. -release = '0.19.0' +release = '0.20.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 26a203dcb770be7e00e2422f8c78c08346a3cad6 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Thu, 25 Mar 2021 14:28:47 -0400 Subject: [PATCH 2/6] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7b34d3e0fe..bd5b313e550 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # cuGraph 0.20.0 (Date TBD) -Please see https://github.com/rapidsai//releases/tag/v0.20.0a for the latest changes to this development branch. +Please see https://github.com/rapidsai/cugraph/releases/tag/v0.20.0a for the latest changes to this development branch. # cuGraph 0.19.0 (Date TBD) From 63e69fcf32742fdee7e14267ba6accd94fd19c4c Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 7 Apr 2021 18:51:32 -0500 Subject: [PATCH 3/6] Random Walks - Python Bindings (#1516) Python bindings for random walks. closes #1488. Check the rendering after the PR is merged to make sure everything renders as expected. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) - Andrei Schaffer (https://github.com/aschaffer) - Alex Fender (https://github.com/afender) URL: https://github.com/rapidsai/cugraph/pull/1516 --- README.md | 1 + docs/source/api.rst | 11 ++ python/cugraph/__init__.py | 2 + python/cugraph/sampling/__init__.py | 14 ++ python/cugraph/sampling/random_walks.pxd | 22 +++ python/cugraph/sampling/random_walks.py | 95 +++++++++++ .../cugraph/sampling/random_walks_wrapper.pyx | 116 +++++++++++++ python/cugraph/structure/graph_utilities.pxd | 9 + python/cugraph/tests/test_random_walks.py | 154 ++++++++++++++++++ 9 files changed, 424 insertions(+) create mode 100644 python/cugraph/sampling/__init__.py create mode 100644 python/cugraph/sampling/random_walks.pxd create mode 100644 python/cugraph/sampling/random_walks.py create mode 100644 
python/cugraph/sampling/random_walks_wrapper.pyx create mode 100644 python/cugraph/tests/test_random_walks.py diff --git a/README.md b/README.md index 4bdbcd00280..ccc91bfe225 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,7 @@ As of Release 0.18 - including 0.18 nightly | | Breadth First Search (BFS) | Multi-GPU | with cutoff support
[C++ README](cpp/src/traversal/README.md#BFS) | | | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) | | | Traveling Salesperson Problem (TSP) | Single-GPU | | +| Sampling | Random Walks (RW) | Single-GPU | | | Structure | | | | | | Renumbering | Single-GPU | multiple columns, any data type | | | Symmetrize | Multi-GPU | | diff --git a/docs/source/api.rst b/docs/source/api.rst index b02f8f488c5..b9b8ea4859c 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -225,6 +225,17 @@ Overlap Coefficient :undoc-members: +Sampling +======== + +Random Walks +------------ + +.. automodule:: cugraph.sampling.random_walks + :members: + :undoc-members: + + Traversal ========= diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index 11ba2d6ef96..d4632708591 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -101,6 +101,8 @@ from cugraph.raft import raft_include_test from cugraph.comms import comms +from cugraph.sampling import random_walks + # Versioneer from ._version import get_versions diff --git a/python/cugraph/sampling/__init__.py b/python/cugraph/sampling/__init__.py new file mode 100644 index 00000000000..fd9d072d4f8 --- /dev/null +++ b/python/cugraph/sampling/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cugraph.sampling.random_walks import random_walks diff --git a/python/cugraph/sampling/random_walks.pxd b/python/cugraph/sampling/random_walks.pxd new file mode 100644 index 00000000000..3e0e24b4e98 --- /dev/null +++ b/python/cugraph/sampling/random_walks.pxd @@ -0,0 +1,22 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * + +cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": + cdef unique_ptr[random_walk_ret_t] call_random_walks[vertex_t, edge_t]( + const handle_t &handle, + const graph_container_t &g, + const vertex_t *ptr_d_start, + edge_t num_paths, + edge_t max_depth) except + diff --git a/python/cugraph/sampling/random_walks.py b/python/cugraph/sampling/random_walks.py new file mode 100644 index 00000000000..7ab3191a07c --- /dev/null +++ b/python/cugraph/sampling/random_walks.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import cudf +from cugraph.sampling import random_walks_wrapper +import cugraph +from collections import defaultdict + +# FIXME might be more efficient to return either (df + offset) or 3 cudf.Series + + +def random_walks( + G, + start_vertices, + max_depth=None +): + """ + compute random walks for each nodes in 'start_vertices' + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + start_vertices : int or list or cudf.Series + A single node or a list or a cudf.Series of nodes from which to run + the random walks + + max_depth : int + The maximum depth of the random walks + + + Returns + ------- + random_walks_edge_lists : cudf.DataFrame + GPU data frame containing all random walks sources identifiers, + destination identifiers, edge weights + + seeds_offsets: cudf.Series + Series containing the starting offset in the returned edge list + for each vertex in start_vertices. 
+ """ + if max_depth is None: + raise TypeError("must specify a 'max_depth'") + + G, _ = cugraph.utilities.check_nx_graph(G) + + if start_vertices is int: + start_vertices = [start_vertices] + + if not isinstance(start_vertices, cudf.Series): + start_vertices = cudf.Series(start_vertices) + + if G.renumbered is True: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + vertex_set, edge_set, sizes = random_walks_wrapper.random_walks( + G, start_vertices, max_depth) + + if G.renumbered: + df_ = cudf.DataFrame() + df_['vertex_set'] = vertex_set + df_ = G.unrenumber(df_, 'vertex_set', preserve_order=True) + vertex_set = cudf.Series(df_['vertex_set']) + + edge_list = defaultdict(list) + next_path_idx = 0 + offsets = [0] + + df = cudf.DataFrame() + for s in sizes.values_host: + for i in range(next_path_idx, s+next_path_idx-1): + edge_list['src'].append(vertex_set.values_host[i]) + edge_list['dst'].append(vertex_set.values_host[i+1]) + next_path_idx += s + df = df.append(edge_list, ignore_index=True) + offsets.append(df.index[-1]+1) + edge_list['src'].clear() + edge_list['dst'].clear() + df['weight'] = edge_set + offsets = cudf.Series(offsets) + + return df, offsets diff --git a/python/cugraph/sampling/random_walks_wrapper.pyx b/python/cugraph/sampling/random_walks_wrapper.pyx new file mode 100644 index 00000000000..7b16ff14018 --- /dev/null +++ b/python/cugraph/sampling/random_walks_wrapper.pyx @@ -0,0 +1,116 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from cugraph.sampling.random_walks cimport call_random_walks +#from cugraph.structure.graph_primtypes cimport * +from cugraph.structure.graph_utilities cimport * +from libcpp cimport bool +from libcpp.utility cimport move +from libc.stdint cimport uintptr_t +from cugraph.structure import graph_primtypes_wrapper +import cudf +import rmm +import numpy as np +import numpy.ctypeslib as ctypeslib +from rmm._lib.device_buffer cimport DeviceBuffer +from cudf.core.buffer import Buffer +from cython.operator cimport dereference as deref +def random_walks(input_graph, start_vertices, max_depth): + """ + Call random_walks + """ + # FIXME: Offsets and indices are currently hardcoded to int, but this may + # not be acceptable in the future. + numberTypeMap = {np.dtype("int32") : numberTypeEnum.int32Type, + np.dtype("int64") : numberTypeEnum.int64Type, + np.dtype("float32") : numberTypeEnum.floatType, + np.dtype("double") : numberTypeEnum.doubleType} + [src, dst] = [input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']] + vertex_t = src.dtype + edge_t = np.dtype("int32") + weights = None + if input_graph.edgelist.weights: + weights = input_graph.edgelist.edgelist_df['weights'] + num_verts = input_graph.number_of_vertices() + num_edges = input_graph.number_of_edges(directed_edges=True) + num_partition_edges = num_edges + + if num_edges > (2**31 - 1): + edge_t = np.dtype("int64") + cdef unique_ptr[random_walk_ret_t] rw_ret_ptr + + cdef uintptr_t c_src_vertices = src.__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst_vertices = dst.__cuda_array_interface__['data'][0] + cdef uintptr_t c_edge_weights = NULL + if weights is not None: + c_edge_weights = weights.__cuda_array_interface__['data'][0] + weight_t = weights.dtype + is_weighted = True + else: + weight_t = np.dtype("float32") + is_weighted = False + # Pointers for random_walks + 
start_vertices = start_vertices.astype('int32') + cdef uintptr_t c_start_vertex_ptr = start_vertices.__cuda_array_interface__['data'][0] + num_paths = start_vertices.size + cdef unique_ptr[handle_t] handle_ptr + handle_ptr.reset(new handle_t()) + handle_ = handle_ptr.get() + cdef graph_container_t graph_container + populate_graph_container(graph_container, + handle_[0], + c_src_vertices, c_dst_vertices, c_edge_weights, + NULL, + ((numberTypeMap[vertex_t])), + ((numberTypeMap[edge_t])), + ((numberTypeMap[weight_t])), + num_partition_edges, + num_verts, + num_edges, + False, + is_weighted, + False, False) + if(vertex_t == np.dtype("int32")): + if(edge_t == np.dtype("int32")): + rw_ret_ptr = move(call_random_walks[int, int]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth)) + else: # (edge_t == np.dtype("int64")): + rw_ret_ptr = move(call_random_walks[int, long]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth)) + else: # (vertex_t == edge_t == np.dtype("int64")): + rw_ret_ptr = move(call_random_walks[long, long]( deref(handle_), + graph_container, + c_start_vertex_ptr, + num_paths, + max_depth)) + + + rw_ret= move(rw_ret_ptr.get()[0]) + vertex_set = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_coalesced_v_)) + edge_set = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_coalesced_w_)) + sizes = DeviceBuffer.c_from_unique_ptr(move(rw_ret.d_sizes_)) + vertex_set = Buffer(vertex_set) + edge_set = Buffer(edge_set) + sizes = Buffer(sizes) + + set_vertex = cudf.Series(data=vertex_set, dtype=vertex_t) + set_edge = cudf.Series(data=edge_set, dtype=weight_t) + set_sizes = cudf.Series(data=sizes, dtype=edge_t) + + return set_vertex, set_edge, set_sizes + \ No newline at end of file diff --git a/python/cugraph/structure/graph_utilities.pxd b/python/cugraph/structure/graph_utilities.pxd index b169e42ccf8..c9cf1748bfe 100644 --- a/python/cugraph/structure/graph_utilities.pxd +++ 
b/python/cugraph/structure/graph_utilities.pxd @@ -83,6 +83,15 @@ cdef extern from "utilities/cython.hpp" namespace "cugraph::cython": unique_ptr[device_buffer] dst_indices unique_ptr[device_buffer] edge_data unique_ptr[device_buffer] subgraph_offsets + + cdef cppclass random_walk_ret_t: + size_t coalesced_sz_v_ + size_t coalesced_sz_w_ + size_t num_paths_ + size_t max_depth_ + unique_ptr[device_buffer] d_coalesced_v_ + unique_ptr[device_buffer] d_coalesced_w_ + unique_ptr[device_buffer] d_sizes_ cdef extern from "" namespace "std" nogil: cdef device_buffer move(device_buffer) diff --git a/python/cugraph/tests/test_random_walks.py b/python/cugraph/tests/test_random_walks.py new file mode 100644 index 00000000000..9767e81ba1f --- /dev/null +++ b/python/cugraph/tests/test_random_walks.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION.: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc + +import pytest + +from cugraph.tests import utils +import cugraph +import random + + +# ============================================================================= +# Parameters +# ============================================================================= +DIRECTED_GRAPH_OPTIONS = [False, True] +WEIGHTED_GRAPH_OPTIONS = [False, True] +DATASETS = [pytest.param(d) for d in utils.DATASETS] +DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] + + +def calc_random_walks( + graph_file, + directed=False, + max_depth=None +): + """ + compute random walks for each nodes in 'start_vertices' + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + start_vertices : int or list or cudf.Series + A single node or a list or a cudf.Series of nodes from which to run + the random walks + + max_depth : int + The maximum depth of the random walks + + + Returns + ------- + random_walks_edge_lists : cudf.DataFrame + GPU data frame containing all random walks sources identifiers, + destination identifiers, edge weights + + seeds_offsets: cudf.Series + Series containing the starting offset in the returned edge list + for each vertex in start_vertices. 
+ """ + G = utils.generate_cugraph_graph_from_file( + graph_file, directed=directed, edgevals=True) + assert G is not None + + k = random.randint(1, 10) + start_vertices = random.sample(range(G.number_of_vertices()), k) + df, offsets = cugraph.random_walks(G, start_vertices, max_depth) + + return df, offsets, start_vertices + + +def check_random_walks(df, offsets, seeds, df_G=None): + invalid_edge = 0 + invalid_seeds = 0 + invalid_weight = 0 + offsets_idx = 0 + for i in range(len(df.index)): + src, dst, weight = df.iloc[i].to_array() + if i == offsets[offsets_idx]: + if df['src'].iloc[i] != seeds[offsets_idx]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}" + .format(df['src'].iloc[i], offsets[offsets_idx]) + ) + offsets_idx += 1 + + edge = df.loc[(df['src'] == (src)) & (df['dst'] == (dst))].reset_index( + drop=True) + exp_edge = df_G.loc[ + (df_G['src'] == (src)) & ( + df_G['dst'] == (dst))].reset_index(drop=True) + + if not exp_edge.equals(edge[:1]): + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {} weight {}" + .format(src, dst, weight) + ) + invalid_weight += 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + assert invalid_weight == 0 + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def prepare_test(): + gc.collect() + + +@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None]) +def test_random_walks_invalid_max_dept( + graph_file, + directed, + max_depth +): + """Test calls random_walks an invalid type""" + prepare_test() + with pytest.raises(TypeError): + df, offsets, seeds = calc_random_walks( + graph_file, + directed=directed, + max_depth=max_depth + ) + + +@pytest.mark.parametrize("graph_file", 
utils.DATASETS_SMALL) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + df, offsets, seeds = calc_random_walks( + graph_file, + directed, + max_depth=max_depth + ) + check_random_walks(df, offsets, seeds, df_G) From 9fd4f3c92135108f67f986b3f8d8633f4de47f0f Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 8 Apr 2021 09:53:50 -0700 Subject: [PATCH 4/6] Update docs and remove all warnings (#1521) This PR fixes the following: - Add traveling salesperson problem to the docs - Update docs to address all build warnings To remove some warnings, updated the use of `NOTE:` in cases like the one shown below. | Old | New | | ------------- | ------------- | | ![image](https://user-images.githubusercontent.com/19949207/113936070-283a2380-97ac-11eb-9705-9f261c965fa9.png) | ![image](https://user-images.githubusercontent.com/19949207/113935703-b06bf900-97ab-11eb-93a4-7df2f711c1aa.png) | Authors: - Ayush Dattagupta (https://github.com/ayushdg) Approvers: - Brad Rees (https://github.com/BradReesWork) - Rick Ratzel (https://github.com/rlratzel) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cugraph/pull/1521 --- docs/source/api.rst | 29 ++++++++------- python/cugraph/centrality/katz_centrality.py | 18 +++++----- python/cugraph/components/connectivity.py | 36 ++++++++++++------- .../dask/centrality/katz_centrality.py | 19 +++++----- python/cugraph/dask/link_analysis/pagerank.py | 2 ++ python/cugraph/link_analysis/pagerank.py | 1 - python/cugraph/structure/symmetrize.py | 1 + python/cugraph/traversal/bfs.py | 6 ++-- .../traversal/traveling_salesperson.py | 1 + 9 files changed, 70 insertions(+), 43 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index b9b8ea4859c..e2c2c19cf02 100644 --- a/docs/source/api.rst 
+++ b/docs/source/api.rst @@ -192,7 +192,7 @@ Pagerank :undoc-members: Pagerank (MG) ---------- +------------- .. automodule:: cugraph.dask.link_analysis.pagerank :members: pagerank @@ -247,7 +247,7 @@ Breadth-first-search :undoc-members: Breadth-first-search (MG) --------------------- +------------------------- .. automodule:: cugraph.dask.traversal.bfs :members: @@ -261,12 +261,19 @@ Single-source-shortest-path :undoc-members: Single-source-shortest-path (MG) ---------------------------- +-------------------------------- .. automodule:: cugraph.dask.traversal.sssp :members: :undoc-members: +Traveling-salesperson-problem +----------------------------- + +.. automodule:: cugraph.traversal.traveling_salesperson + :members: + :undoc-members: + Tree ========= @@ -275,27 +282,25 @@ Minimum Spanning Tree --------------------- .. automodule:: cugraph.tree.minimum_spanning_tree - :members: + :members: minimum_spanning_tree :undoc-members: Maximum Spanning Tree --------------------- -.. automodule:: cugraph.tree.maximum_spanning_tree - :members: +.. automodule:: cugraph.tree.minimum_spanning_tree + :members: maximum_spanning_tree :undoc-members: + :noindex: -DASK MG Helper functions +DASK MG Helper functions =========================== .. automodule:: cugraph.comms.comms - :members: initialize - :undoc-members: - -.. automodule:: cugraph.comms.comms - :members: destroy + :members: initialize, destroy :undoc-members: + :member-order: bysource .. automodule:: cugraph.dask.common.read_utils :members: get_chunksize diff --git a/python/cugraph/centrality/katz_centrality.py b/python/cugraph/centrality/katz_centrality.py index 3e2680a196f..ce52d15f5db 100644 --- a/python/cugraph/centrality/katz_centrality.py +++ b/python/cugraph/centrality/katz_centrality.py @@ -39,14 +39,16 @@ def katz_centrality( Attenuation factor defaulted to None. If alpha is not specified then it is internally calculated as 1/(degree_max) where degree_max is the maximum out degree. 
- NOTE : The maximum acceptable value of alpha for convergence - alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue - of the graph. - Since lambda_max is always lesser than or equal to degree_max for a - graph, alpha_max will always be greater than or equal to - (1/degree_max). Therefore, setting alpha to (1/degree_max) will - guarantee that it will never exceed alpha_max thus in turn fulfilling - the requirement for convergence. + + NOTE + The maximum acceptable value of alpha for convergence + alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue + of the graph. + Since lambda_max is always lesser than or equal to degree_max for a + graph, alpha_max will always be greater than or equal to + (1/degree_max). Therefore, setting alpha to (1/degree_max) will + guarantee that it will never exceed alpha_max thus in turn fulfilling + the requirement for convergence. beta : None A weight scalar - currently Not Supported max_iter : int diff --git a/python/cugraph/components/connectivity.py b/python/cugraph/components/connectivity.py index 72f33ebfcbb..df33f8b8e03 100644 --- a/python/cugraph/components/connectivity.py +++ b/python/cugraph/components/connectivity.py @@ -138,8 +138,10 @@ def weakly_connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. + Raises TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. If False, then find the shortest path on an undirected graph: the @@ -154,8 +156,10 @@ def weakly_connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. 
Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. @@ -231,8 +235,10 @@ def strongly_connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. + Raises TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. If False, then find the shortest path on an undirected graph: the @@ -247,8 +253,10 @@ def strongly_connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. @@ -325,8 +333,10 @@ def connected_components(G, directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph and only move from point i to point j along paths csgraph[i, j]. If False, then find the shortest path on an undirected graph: the @@ -340,8 +350,10 @@ def connected_components(G, return_labels : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then return the labels for each of the connected components. 
diff --git a/python/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/dask/centrality/katz_centrality.py index a2f83a0b2a8..45deda8b7ae 100644 --- a/python/cugraph/dask/centrality/katz_centrality.py +++ b/python/cugraph/dask/centrality/katz_centrality.py @@ -68,14 +68,16 @@ def katz_centrality(input_graph, Attenuation factor defaulted to None. If alpha is not specified then it is internally calculated as 1/(degree_max) where degree_max is the maximum out degree. - NOTE : The maximum acceptable value of alpha for convergence - alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue - of the graph. - Since lambda_max is always lesser than or equal to degree_max for a - graph, alpha_max will always be greater than or equal to - (1/degree_max). Therefore, setting alpha to (1/degree_max) will - guarantee that it will never exceed alpha_max thus in turn fulfilling - the requirement for convergence. + + NOTE + The maximum acceptable value of alpha for convergence + alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue + of the graph. + Since lambda_max is always lesser than or equal to degree_max for a + graph, alpha_max will always be greater than or equal to + (1/degree_max). Therefore, setting alpha to (1/degree_max) will + guarantee that it will never exceed alpha_max thus in turn fulfilling + the requirement for convergence. beta : None A weight scalar - currently Not Supported max_iter : int @@ -94,6 +96,7 @@ def katz_centrality(input_graph, acceptable. 
nstart : dask_cudf.Dataframe GPU Dataframe containing the initial guess for katz centrality + nstart['vertex'] : dask_cudf.Series Contains the vertex identifiers nstart['values'] : dask_cudf.Series diff --git a/python/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/dask/link_analysis/pagerank.py index bfaada85a6f..fb9f4ad3a25 100644 --- a/python/cugraph/dask/link_analysis/pagerank.py +++ b/python/cugraph/dask/link_analysis/pagerank.py @@ -73,6 +73,7 @@ def pagerank(input_graph, personalization : cudf.Dataframe GPU Dataframe containing the personalization information. Currently not supported. + personalization['vertex'] : cudf.Series Subset of vertices of graph for personalization personalization['values'] : cudf.Series @@ -91,6 +92,7 @@ def pagerank(input_graph, acceptable. nstart : not supported initial guess for pagerank + Returns ------- PageRank : dask_cudf.DataFrame diff --git a/python/cugraph/link_analysis/pagerank.py b/python/cugraph/link_analysis/pagerank.py index 0bb89195e01..8a03ee077f6 100644 --- a/python/cugraph/link_analysis/pagerank.py +++ b/python/cugraph/link_analysis/pagerank.py @@ -46,7 +46,6 @@ def pagerank( Subset of vertices of graph for personalization personalization['values'] : cudf.Series Personalization values for vertices - max_iter : int The maximum number of iterations before an answer is returned. This can be used to limit the execution time and do an early exit before the diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py index 0f4ca90a97c..8720f7ad343 100644 --- a/python/cugraph/structure/symmetrize.py +++ b/python/cugraph/structure/symmetrize.py @@ -32,6 +32,7 @@ def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True): != data2 then this code will arbitrarily pick the smaller data element to keep, if this is not desired then the caller should should correct the data prior to calling symmetrize. 
+ Parameters ---------- df : cudf.DataFrame diff --git a/python/cugraph/traversal/bfs.py b/python/cugraph/traversal/bfs.py index efbae095676..a483b96850b 100644 --- a/python/cugraph/traversal/bfs.py +++ b/python/cugraph/traversal/bfs.py @@ -136,8 +136,10 @@ def bfs(G, can be set, not both. directed : bool, optional - NOTE: For non-Graph-type (eg. sparse matrix) values of G only. Raises - TypeError if used with a Graph object. + NOTE + For non-Graph-type (eg. sparse matrix) values of G only. Raises + TypeError if used with a Graph object. + If True (default), then convert the input matrix to a cugraph.DiGraph, otherwise a cugraph.Graph object will be used. diff --git a/python/cugraph/traversal/traveling_salesperson.py b/python/cugraph/traversal/traveling_salesperson.py index ae17555e4ea..7aea7ae603f 100644 --- a/python/cugraph/traversal/traveling_salesperson.py +++ b/python/cugraph/traversal/traveling_salesperson.py @@ -29,6 +29,7 @@ def traveling_salesperson(pos_list, optimization. The current implementation does not support a weighted graph. + Parameters ---------- pos_list: cudf.DataFrame From e9d09eeb11414c2e12c46b4a188186e1ceee032d Mon Sep 17 00:00:00 2001 From: Iroy30 <41401566+Iroy30@users.noreply.github.com> Date: Thu, 8 Apr 2021 21:26:13 -0500 Subject: [PATCH 5/6] fix mg_renumber non-deterministic errors (#1523) * @Iroy30 added missing dask `persist()` call to ensure deterministic indirection map state prior to merging renumbering results. * @rlratzel updated MG renumbering test for latest API changes, removed redundant test, and updated test IDs to include the dataset name. 
Authors: - https://github.com/Iroy30 - Rick Ratzel (https://github.com/rlratzel) Approvers: - Brad Rees (https://github.com/BradReesWork) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/1523 --- python/cugraph/structure/number_map.py | 8 +- .../test_mg_batch_betweenness_centrality.py | 3 +- ...st_mg_batch_edge_betweenness_centrality.py | 5 +- python/cugraph/tests/dask/test_mg_bfs.py | 5 +- python/cugraph/tests/dask/test_mg_comms.py | 6 +- python/cugraph/tests/dask/test_mg_degree.py | 5 +- .../tests/dask/test_mg_katz_centrality.py | 5 +- python/cugraph/tests/dask/test_mg_louvain.py | 7 +- python/cugraph/tests/dask/test_mg_pagerank.py | 5 +- python/cugraph/tests/dask/test_mg_renumber.py | 93 ++++++------------- .../cugraph/tests/dask/test_mg_replication.py | 46 ++++++--- python/cugraph/tests/dask/test_mg_sssp.py | 5 +- python/cugraph/tests/dask/test_mg_utility.py | 5 +- 13 files changed, 107 insertions(+), 91 deletions(-) diff --git a/python/cugraph/structure/number_map.py b/python/cugraph/structure/number_map.py index e45a50d6dbe..cd24dfc0434 100644 --- a/python/cugraph/structure/number_map.py +++ b/python/cugraph/structure/number_map.py @@ -263,7 +263,6 @@ def indirection_map(self, ddf, src_col_names, dst_col_names): to_frame(name=newname) else: tmp_df[newname] = tmp[newname].append(tmp_dst[oldname]) - print(tmp_df.columns) else: for newname in self.col_names: tmp_df[newname] = tmp[newname] @@ -273,7 +272,7 @@ def indirection_map(self, ddf, src_col_names, dst_col_names): tmp_ddf = tmp_ddf.assign(idx=1) tmp_ddf['global_id'] = tmp_ddf.idx.cumsum() - 1 tmp_ddf = tmp_ddf.drop(columns='idx') - + tmp_ddf = tmp_ddf.persist() self.ddf = tmp_ddf return tmp_ddf @@ -481,8 +480,6 @@ def renumber(df, src_col_names, dst_col_names, preserve_order=False, renumber_type = 'legacy' else: renumber_type = 'experimental' - df = df.rename(columns={src_col_names: "src", - dst_col_names: "dst"}) renumber_map = NumberMap() if not 
isinstance(src_col_names, list): @@ -514,6 +511,9 @@ def renumber(df, src_col_names, dst_col_names, preserve_order=False, df, "dst", dst_col_names, drop=True, preserve_order=preserve_order ) + else: + df = df.rename(columns={src_col_names[0]: "src", + dst_col_names[0]: "dst"}) num_edges = len(df) diff --git a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py index 6e1e5ea380a..02696f589e3 100644 --- a/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_betweenness_centrality.py @@ -51,7 +51,8 @@ @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS, + ids=[f"dataset={d.as_posix()}" for d in DATASETS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) diff --git a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py index 54b58c340aa..89844797807 100644 --- a/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py +++ b/python/cugraph/tests/dask/test_mg_batch_edge_betweenness_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -48,7 +48,8 @@ @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS, + ids=[f"dataset={d}" for d in DATASETS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) diff --git a/python/cugraph/tests/dask/test_mg_bfs.py b/python/cugraph/tests/dask/test_mg_bfs.py index 63580461b17..36d1f436b52 100644 --- a/python/cugraph/tests/dask/test_mg_bfs.py +++ b/python/cugraph/tests/dask/test_mg_bfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -35,7 +35,10 @@ def client_connection(): def test_dask_bfs(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_comms.py b/python/cugraph/tests/dask/test_mg_comms.py index 61a4944b5f1..03a0a5d73d2 100644 --- a/python/cugraph/tests/dask/test_mg_comms.py +++ b/python/cugraph/tests/dask/test_mg_comms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -38,10 +38,14 @@ def test_dask_pagerank(client_connection): # Initialize and run pagerank on two distributed graphs # with same communicator + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path1 = r"../datasets/karate.csv" + print(f"dataset1={input_data_path1}") chunksize1 = dcg.get_chunksize(input_data_path1) input_data_path2 = r"../datasets/dolphins.csv" + print(f"dataset2={input_data_path2}") chunksize2 = dcg.get_chunksize(input_data_path2) ddf1 = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_degree.py b/python/cugraph/tests/dask/test_mg_degree.py index 9f4c0d94319..93e8a365dea 100644 --- a/python/cugraph/tests/dask/test_mg_degree.py +++ b/python/cugraph/tests/dask/test_mg_degree.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -34,7 +34,10 @@ def client_connection(): def test_dask_mg_degree(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = cugraph.dask.get_chunksize(input_data_path) diff --git a/python/cugraph/tests/dask/test_mg_katz_centrality.py b/python/cugraph/tests/dask/test_mg_katz_centrality.py index 631457f7558..eadf0f662d4 100644 --- a/python/cugraph/tests/dask/test_mg_katz_centrality.py +++ b/python/cugraph/tests/dask/test_mg_katz_centrality.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -36,7 +36,10 @@ def client_connection(): def test_dask_katz_centrality(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_louvain.py b/python/cugraph/tests/dask/test_mg_louvain.py index a07eede8cb9..bd7374fb75e 100644 --- a/python/cugraph/tests/dask/test_mg_louvain.py +++ b/python/cugraph/tests/dask/test_mg_louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -52,7 +52,10 @@ def client_connection(): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", + params=utils.DATASETS_UNDIRECTED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNDIRECTED]) def daskGraphFromDataset(request, client_connection): """ Returns a new dask dataframe created from the dataset file param. diff --git a/python/cugraph/tests/dask/test_mg_pagerank.py b/python/cugraph/tests/dask/test_mg_pagerank.py index 4f0b45242dd..9cb00010311 100644 --- a/python/cugraph/tests/dask/test_mg_pagerank.py +++ b/python/cugraph/tests/dask/test_mg_pagerank.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -65,7 +65,10 @@ def client_connection(): def test_dask_pagerank(client_connection, personalization_perc): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_renumber.py b/python/cugraph/tests/dask/test_mg_renumber.py index 7f5cf6f08bc..68ec3de35f8 100644 --- a/python/cugraph/tests/dask/test_mg_renumber.py +++ b/python/cugraph/tests/dask/test_mg_renumber.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -38,11 +38,12 @@ def client_connection(): teardown_local_dask_cluster(cluster, client) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNRENUMBERED]) def test_mg_renumber(graph_file, client_connection): gc.collect() @@ -60,71 +61,37 @@ def test_mg_renumber(graph_file, client_connection): ddf = dask.dataframe.from_pandas(gdf, npartitions=2) - numbering = NumberMap() - numbering.from_dataframe(ddf, ["src", "src_old"], ["dst", "dst_old"]) - renumbered_df = numbering.add_internal_vertex_id( - numbering.add_internal_vertex_id(ddf, "src_id", ["src", "src_old"]), - "dst_id", - ["dst", "dst_old"], - ) - - check_src = numbering.from_internal_vertex_id( - renumbered_df, "src_id" - ).compute() - check_dst = numbering.from_internal_vertex_id( - renumbered_df, "dst_id" - ).compute() - - assert check_src["0"].to_pandas().equals(check_src["src"].to_pandas()) - assert check_src["1"].to_pandas().equals(check_src["src_old"].to_pandas()) - assert check_dst["0"].to_pandas().equals(check_dst["dst"].to_pandas()) - assert check_dst["1"].to_pandas().equals(check_dst["dst_old"].to_pandas()) - - -# Test all combinations of default/managed and pooled/non-pooled allocation -@pytest.mark.skipif( - is_single_gpu(), reason="skipping MG testing on Single GPU system" -) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) -def test_mg_renumber2(graph_file, client_connection): - gc.collect() - - M = utils.read_csv_for_nx(graph_file) - sources = cudf.Series(M["0"]) - destinations = cudf.Series(M["1"]) - - translate = 1000 - - gdf = cudf.DataFrame() - gdf["src_old"] = sources - gdf["dst_old"] = destinations - gdf["src"] = sources 
+ translate - gdf["dst"] = destinations + translate - gdf["weight"] = gdf.index.astype(np.float) - - ddf = dask.dataframe.from_pandas(gdf, npartitions=2) - - ren2, num2 = NumberMap.renumber( - ddf, ["src", "src_old"], ["dst", "dst_old"] - ) - - check_src = num2.from_internal_vertex_id(ren2, "src").compute() - check_src = check_src.sort_values("weight").reset_index(drop=True) - check_dst = num2.from_internal_vertex_id(ren2, "dst").compute() - check_dst = check_dst.sort_values("weight").reset_index(drop=True) - - assert check_src["0"].to_pandas().equals(gdf["src"].to_pandas()) - assert check_src["1"].to_pandas().equals(gdf["src_old"].to_pandas()) - assert check_dst["0"].to_pandas().equals(gdf["dst"].to_pandas()) - assert check_dst["1"].to_pandas().equals(gdf["dst_old"].to_pandas()) + # preserve_order is not supported for MG + renumbered_df, renumber_map = NumberMap.renumber(ddf, + ["src", "src_old"], + ["dst", "dst_old"], + preserve_order=False) + unrenumbered_df = renumber_map.unrenumber(renumbered_df, "src", + preserve_order=False) + unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", + preserve_order=False) + + # sort needed only for comparisons, since preserve_order is False + gdf = gdf.sort_values(by=["src", "src_old", "dst", "dst_old"]) + gdf = gdf.reset_index() + unrenumbered_df = unrenumbered_df.compute() + unrenumbered_df = unrenumbered_df.sort_values(by=["0_src", "1_src", + "0_dst", "1_dst"]) + unrenumbered_df = unrenumbered_df.reset_index() + + assert gdf["src"].equals(unrenumbered_df["0_src"]) + assert gdf["src_old"].equals(unrenumbered_df["1_src"]) + assert gdf["dst"].equals(unrenumbered_df["0_dst"]) + assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) -def test_mg_renumber3(graph_file, 
client_connection): +@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED, + ids=[f"dataset={d.as_posix()}" + for d in utils.DATASETS_UNRENUMBERED]) +def test_mg_renumber_add_internal_vertex_id(graph_file, client_connection): gc.collect() M = utils.read_csv_for_nx(graph_file) diff --git a/python/cugraph/tests/dask/test_mg_replication.py b/python/cugraph/tests/dask/test_mg_replication.py index bb43d6c0f7a..3974cf9ed82 100644 --- a/python/cugraph/tests/dask/test_mg_replication.py +++ b/python/cugraph/tests/dask/test_mg_replication.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -34,7 +34,9 @@ @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_dataframe_with_weights( input_data_path, mg_device_count @@ -60,7 +62,9 @@ def test_replicate_cudf_dataframe_with_weights( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_dataframe_no_weights(input_data_path, mg_device_count): gc.collect() @@ -84,7 +88,9 @@ def test_replicate_cudf_dataframe_no_weights(input_data_path, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) 
-@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS) +@pytest.mark.parametrize("input_data_path", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_replicate_cudf_series(input_data_path, mg_device_count): gc.collect() @@ -114,7 +120,9 @@ def test_replicate_cudf_series(input_data_path, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_no_context(graph_file, directed, mg_device_count): @@ -129,7 +137,9 @@ def test_enable_batch_no_context(graph_file, directed, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_no_context_view_adj( @@ -145,7 +155,9 @@ def test_enable_batch_no_context_view_adj( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_context_then_views( @@ -174,7 +186,9 @@ def test_enable_batch_context_then_views( 
@pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_view_then_context(graph_file, directed, mg_device_count): @@ -205,7 +219,9 @@ def test_enable_batch_view_then_context(graph_file, directed, mg_device_count): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_context_no_context_views( @@ -230,7 +246,9 @@ def test_enable_batch_context_no_context_views( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_edgelist_replication( @@ -251,7 +269,9 @@ def test_enable_batch_edgelist_replication( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) 
def test_enable_batch_adjlist_replication_weights( @@ -293,7 +313,9 @@ def test_enable_batch_adjlist_replication_weights( @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS) +@pytest.mark.parametrize("graph_file", DATASETS_OPTIONS, + ids=[f"dataset={d.as_posix()}" + for d in DATASETS_OPTIONS]) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("mg_device_count", MG_DEVICE_COUNT_OPTIONS) def test_enable_batch_adjlist_replication_no_weights( diff --git a/python/cugraph/tests/dask/test_mg_sssp.py b/python/cugraph/tests/dask/test_mg_sssp.py index d75d76d7fd4..9e1fd1ec82f 100644 --- a/python/cugraph/tests/dask/test_mg_sssp.py +++ b/python/cugraph/tests/dask/test_mg_sssp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -35,7 +35,10 @@ def client_connection(): def test_dask_sssp(client_connection): gc.collect() + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/netscience.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/tests/dask/test_mg_utility.py b/python/cugraph/tests/dask/test_mg_utility.py index 3217c1bef1a..150fa0137f5 100644 --- a/python/cugraph/tests/dask/test_mg_utility.py +++ b/python/cugraph/tests/dask/test_mg_utility.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -46,7 +46,10 @@ def client_connection(): is_single_gpu(), reason="skipping MG testing on Single GPU system" ) def test_from_edgelist(client_connection): + # FIXME: update this to allow dataset to be parameterized and have dataset + # part of test param id (see other tests) input_data_path = r"../datasets/karate.csv" + print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( input_data_path, From 62c1c6824ab9f4249ed227cb4954076d282d3b57 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Mon, 12 Apr 2021 08:39:03 -0500 Subject: [PATCH 6/6] Fixed copyright date and format. (#1526) Update copyright date and format Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/1526 --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index eb4745a61f0..3422428c96b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright (c) 2018-2020 NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. # # pygdf documentation build configuration file, created by # sphinx-quickstart on Wed May 3 10:59:22 2017.