diff --git a/src/fast_array_utils/conv/__init__.py b/src/fast_array_utils/conv/__init__.py index fea40e9..58afd56 100644 --- a/src/fast_array_utils/conv/__init__.py +++ b/src/fast_array_utils/conv/__init__.py @@ -21,25 +21,28 @@ @overload -def to_dense(x: CpuArray | DiskArray | types.sparray | types.spmatrix | types.CSDataset, /, *, to_cpu_memory: bool = False) -> NDArray[Any]: ... +def to_dense( + x: CpuArray | DiskArray | types.sparray | types.spmatrix | types.CSDataset, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False +) -> NDArray[Any]: ... @overload -def to_dense(x: types.DaskArray, /, *, to_cpu_memory: Literal[False] = False) -> types.DaskArray: ... +def to_dense(x: types.DaskArray, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: Literal[False] = False) -> types.DaskArray: ... @overload -def to_dense(x: types.DaskArray, /, *, to_cpu_memory: Literal[True]) -> NDArray[Any]: ... +def to_dense(x: types.DaskArray, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: Literal[True]) -> NDArray[Any]: ... @overload -def to_dense(x: GpuArray | types.CupySpMatrix, /, *, to_cpu_memory: Literal[False] = False) -> types.CupyArray: ... +def to_dense(x: GpuArray | types.CupySpMatrix, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: Literal[False] = False) -> types.CupyArray: ... @overload -def to_dense(x: GpuArray | types.CupySpMatrix, /, *, to_cpu_memory: Literal[True]) -> NDArray[Any]: ... +def to_dense(x: GpuArray | types.CupySpMatrix, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: Literal[True]) -> NDArray[Any]: ... def to_dense( x: CpuArray | GpuArray | DiskArray | types.CSDataset | types.DaskArray | types.sparray | types.spmatrix | types.CupySpMatrix, /, *, + order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False, ) -> NDArray[Any] | types.DaskArray | types.CupyArray: r"""Convert x to a dense array. @@ -52,6 +55,16 @@ def to_dense( ---------- x Input object to be converted. + order + The order of the output array: ``C`` (row-major) or ``F`` (column-major). ``K`` and ``A`` derive the order from ``x``. + + The default matches numpy, and therefore diverges from the ``scipy.sparse`` matrices’ + :meth:`~scipy.sparse.csr_array.toarray`\ ’s default behavior + of always returning a ``C``-contiguous array. + Instead, CSC matrices become F-contiguous arrays when ``order="K"`` (the default). + + Dask :class:`~dask.array.Array`\ s concatenation behavior will result in ``order`` + having no effect on the :func:`dask.compute` / ``to_cpu_memory=True`` result. to_cpu_memory Also load data into memory (resulting in a :class:`numpy.ndarray`). @@ -60,4 +73,4 @@ def to_dense( Dense form of ``x`` """ - return to_dense_(x, to_cpu_memory=to_cpu_memory) + return to_dense_(x, order=order, to_cpu_memory=to_cpu_memory) diff --git a/src/fast_array_utils/conv/_to_dense.py b/src/fast_array_utils/conv/_to_dense.py index 099995d..5656a3e 100644 --- a/src/fast_array_utils/conv/_to_dense.py +++ b/src/fast_array_utils/conv/_to_dense.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: MPL-2.0 from __future__ import annotations +import warnings from functools import partial, singledispatch from typing import TYPE_CHECKING, cast @@ -11,7 +12,7 @@ if TYPE_CHECKING: - from typing import Any + from typing import Any, Literal from numpy.typing import NDArray @@ -22,40 +23,57 @@ def to_dense_( x: CpuArray | GpuArray | DiskArray | types.DaskArray | types.sparray | types.spmatrix | types.CupySpMatrix, /, *, + order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False, ) -> NDArray[Any] | types.CupyArray | types.DaskArray: del to_cpu_memory # it already is - return np.asarray(x) + return np.asarray(x, order=order) @to_dense_.register(types.spmatrix | types.sparray) # type: ignore[call-overload,misc] -def _to_dense_cs(x: types.spmatrix | types.sparray, /, *, to_cpu_memory: bool = False) -> NDArray[Any]: +def _to_dense_cs(x: types.spmatrix | types.sparray, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False) -> NDArray[Any]: from . import scipy del to_cpu_memory # it already is - return scipy.to_dense(x) + return scipy.to_dense(x, order=sparse_order(x, order=order)) @to_dense_.register(types.DaskArray) -def _to_dense_dask(x: types.DaskArray, /, *, to_cpu_memory: bool = False) -> NDArray[Any] | types.DaskArray: +def _to_dense_dask(x: types.DaskArray, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False) -> NDArray[Any] | types.DaskArray: from . import to_dense - x = x.map_blocks(partial(to_dense, to_cpu_memory=to_cpu_memory)) + if order == "F": + msg = f"{order=!r} will probably be ignored: Dask can not be made to emit F-contiguous arrays reliably." + warnings.warn(msg, RuntimeWarning, stacklevel=4) + x = x.map_blocks(partial(to_dense, order=order, to_cpu_memory=to_cpu_memory)) return x.compute() if to_cpu_memory else x # type: ignore[return-value] @to_dense_.register(types.CSDataset) -def _to_dense_ooc(x: types.CSDataset, /, *, to_cpu_memory: bool = False) -> NDArray[Any]: +def _to_dense_ooc(x: types.CSDataset, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False) -> NDArray[Any]: from . import to_dense if not to_cpu_memory: msg = "to_cpu_memory must be True if x is an CS{R,C}Dataset" raise ValueError(msg) # TODO(flying-sheep): why is to_memory of type Any? # noqa: TD003 - return to_dense(cast("types.CSBase", x.to_memory())) + return to_dense(cast("types.CSBase", x.to_memory()), order=sparse_order(x, order=order)) @to_dense_.register(types.CupyArray | types.CupySpMatrix) # type: ignore[call-overload,misc] -def _to_dense_cupy(x: GpuArray, /, *, to_cpu_memory: bool = False) -> NDArray[Any] | types.CupyArray: - x = x.toarray() if isinstance(x, types.CupySpMatrix) else x - return x.get() if to_cpu_memory else x +def _to_dense_cupy(x: GpuArray, /, *, order: Literal["K", "A", "C", "F"] = "K", to_cpu_memory: bool = False) -> NDArray[Any] | types.CupyArray: + import cupy as cu + + x = x.toarray(sparse_order(x, order=order)) if isinstance(x, types.CupySpMatrix) else cu.asarray(x, order=order) + return x.get(order="A") if to_cpu_memory else x + + +def sparse_order(x: types.spmatrix | types.sparray | types.CupySpMatrix | types.CSDataset, /, *, order: Literal["K", "A", "C", "F"]) -> Literal["C", "F"]: + if TYPE_CHECKING: + from scipy.sparse._base import _spbase + + assert isinstance(x, _spbase | types.CSDataset) + + if order in {"K", "A"}: + order = "F" if x.format == "csc" else "C" + return cast("Literal['C', 'F']", order) diff --git a/src/fast_array_utils/types.py b/src/fast_array_utils/types.py index 99ca6f3..6269c69 100644 --- a/src/fast_array_utils/types.py +++ b/src/fast_array_utils/types.py @@ -8,11 +8,13 @@ __all__ = [ + "COOBase", "CSArray", "CSBase", "CSDataset", "CSMatrix", "CupyArray", + "CupyCOOMatrix", "CupyCSCMatrix", "CupyCSMatrix", "CupyCSRMatrix", @@ -22,6 +24,14 @@ "H5Group", "ZarrArray", "ZarrGroup", + "coo_array", + "coo_matrix", + "csc_array", + "csc_matrix", + "csr_array", + "csr_matrix", + "sparray", + "spmatrix", ] T_co = TypeVar("T_co", covariant=True) @@ -29,23 +39,26 @@ # scipy sparse if TYPE_CHECKING: - from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix, sparray, spmatrix + from scipy.sparse import coo_array, coo_matrix, csc_array, csc_matrix, csr_array, csr_matrix, sparray, spmatrix else: try: # cs?_array isn’t available in older scipy versions - from scipy.sparse import csc_array, csr_array, sparray + from scipy.sparse import coo_array, csc_array, csr_array, sparray except ImportError: # pragma: no cover + coo_array = type("coo_array", (), {}) csc_array = type("csc_array", (), {}) csr_array = type("csr_array", (), {}) sparray = type("sparray", (), {}) - csc_array.__module__ = csr_array.__module__ = sparray.__module__ = "scipy.sparse" + coo_array.__module__ = csc_array.__module__ = csr_array.__module__ = sparray.__module__ = "scipy.sparse" try: # cs?_matrix is available when scipy is installed - from scipy.sparse import csc_matrix, csr_matrix, spmatrix + from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, spmatrix except ImportError: # pragma: no cover + coo_matrix = type("coo_matrix", (), {}) csc_matrix = type("csc_matrix", (), {}) csr_matrix = type("csr_matrix", (), {}) spmatrix = type("spmatrix", (), {}) - csc_matrix.__module__ = csr_matrix.__module__ = spmatrix.__module__ = "scipy.sparse" + coo_matrix.__module__ = csc_matrix.__module__ = csr_matrix.__module__ = spmatrix.__module__ = "scipy.sparse" +COOBase = coo_matrix | coo_array CSMatrix = csc_matrix | csr_matrix CSArray = csc_array | csr_array CSBase = CSMatrix | CSArray @@ -54,16 +67,18 @@ if TYPE_CHECKING or find_spec("cupy"): # cupy always comes with cupyx from cupy import ndarray as CupyArray + from cupyx.scipy.sparse import coo_matrix as CupyCOOMatrix from cupyx.scipy.sparse import csc_matrix as CupyCSCMatrix from cupyx.scipy.sparse import csr_matrix as CupyCSRMatrix from cupyx.scipy.sparse import spmatrix as CupySpMatrix else: # pragma: no cover CupyArray = type("ndarray", (), {}) CupyArray.__module__ = "cupy" + CupyCOOMatrix = type("coo_matrix", (), {}) CupyCSCMatrix = type("csc_matrix", (), {}) CupyCSRMatrix = type("csr_matrix", (), {}) CupySpMatrix = type("spmatrix", (), {}) - CupyCSCMatrix.__module__ = CupyCSRMatrix.__module__ = CupySpMatrix.__module__ = "cupyx.scipy.sparse" + CupyCOOMatrix.__module__ = CupyCSCMatrix.__module__ = CupyCSRMatrix.__module__ = CupySpMatrix.__module__ = "cupyx.scipy.sparse" CupyCSMatrix = CupyCSRMatrix | CupyCSCMatrix diff --git a/src/testing/fast_array_utils/_array_type.py b/src/testing/fast_array_utils/_array_type.py index 0459079..4062a3c 100644 --- a/src/testing/fast_array_utils/_array_type.py +++ b/src/testing/fast_array_utils/_array_type.py @@ -22,14 +22,14 @@ import h5py from numpy.typing import ArrayLike, DTypeLike, NDArray - from fast_array_utils.types import CSBase from fast_array_utils.typing import CpuArray, DiskArray, GpuArray InnerArray = CpuArray | GpuArray | DiskArray Array: TypeAlias = InnerArray | types.DaskArray | types.CSDataset + ExtendedArray = Array | types.COOBase | types.CupyCOOMatrix - Arr = TypeVar("Arr", bound=Array, default=Array) - Arr_co = TypeVar("Arr_co", bound=Array, covariant=True) + Arr = TypeVar("Arr", bound=ExtendedArray, default=Array) + Arr_co = TypeVar("Arr_co", bound=ExtendedArray, covariant=True) Inner = TypeVar("Inner", bound="ArrayType[InnerArray, None] | None", default=Any) @@ -305,7 +305,7 @@ def _to_scipy_sparse( /, *, dtype: DTypeLike | None = None, - cls: type[CSBase] | None = None, + cls: type[types.CSBase] | None = None, ) -> types.CSBase: """Convert to a scipy sparse matrix/array.""" if isinstance(x, types.DaskArray): diff --git a/tests/conftest.py b/tests/conftest.py index a348e19..d389f5c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from collections.abc import Callable + from fast_array_utils import types + @pytest.fixture def dask_viz(request: pytest.FixtureRequest, cache: pytest.Cache) -> Callable[[object], None]: @@ -41,5 +43,5 @@ def viz(obj: object) -> None: @pytest.fixture(scope="session", params=COO_PARAMS) -def coo_matrix_type(request: pytest.FixtureRequest) -> ArrayType: - return cast("ArrayType", request.param) +def coo_matrix_type(request: pytest.FixtureRequest) -> ArrayType[types.COOBase | types.CupyCOOMatrix]: + return cast("ArrayType[types.COOBase | types.CupyCOOMatrix]", request.param) diff --git a/tests/test_test_utils.py b/tests/test_test_utils.py index bea9ce5..a78578a 100644 --- a/tests/test_test_utils.py +++ b/tests/test_test_utils.py @@ -15,9 +15,7 @@ if TYPE_CHECKING: from typing import Any - from cupyx.scipy.sparse import coo_matrix as CupyCooMatrix from numpy.typing import DTypeLike, NDArray - from scipy.sparse import coo_array, coo_matrix from testing.fast_array_utils import Array, ArrayType @@ -54,7 +52,7 @@ def test_conv_other(array_type: ArrayType, other_array_type: ArrayType) -> None: @pytest.mark.array_type(skip=Flags.Dask | Flags.Disk | Flags.Gpu) def test_conv_extra( array_type: ArrayType[NDArray[np.number[Any]] | types.CSBase], - coo_matrix_type: ArrayType[coo_matrix | coo_array | CupyCooMatrix], + coo_matrix_type: ArrayType[types.COOBase | types.CupyCOOMatrix], ) -> None: src_arr = array_type(np.arange(12).reshape(3, 4), dtype=np.float32) arr = coo_matrix_type(src_arr) diff --git a/tests/test_to_dense.py b/tests/test_to_dense.py index 119441c..bb61218 100644 --- a/tests/test_to_dense.py +++ b/tests/test_to_dense.py @@ -13,46 +13,98 @@ if TYPE_CHECKING: - from typing import TypeAlias + from collections.abc import Iterable + from typing import Literal, TypeAlias from fast_array_utils.typing import CpuArray, DiskArray, GpuArray from testing.fast_array_utils import ArrayType Array: TypeAlias = CpuArray | GpuArray | DiskArray | types.CSDataset | types.DaskArray + ExtendedArray: TypeAlias = Array | types.COOBase | types.CupyCOOMatrix WARNS_NUMBA = pytest.warns(RuntimeWarning, match="numba is not installed; falling back to slow conversion") @pytest.mark.parametrize("to_cpu_memory", [True, False], ids=["to_cpu_memory", "not_to_cpu_memory"]) -def test_to_dense(array_type: ArrayType[Array], *, to_cpu_memory: bool) -> None: +@pytest.mark.parametrize("order", argvalues=["K", "C", "F"]) # “A” behaves like “K” +def test_to_dense(array_type: ArrayType[Array], *, order: Literal["K", "C", "F"], to_cpu_memory: bool) -> None: x = array_type([[1, 2, 3], [4, 5, 6]], dtype=np.float32) if not to_cpu_memory and array_type.cls in {types.CSCDataset, types.CSRDataset}: with pytest.raises(ValueError, match="to_cpu_memory must be True if x is an CS{R,C}Dataset"): - to_dense(x, to_cpu_memory=to_cpu_memory) + to_dense(x, order=order, to_cpu_memory=to_cpu_memory) return - with WARNS_NUMBA if issubclass(array_type.cls, types.CSBase) and not find_spec("numba") else nullcontext(): - arr = to_dense(x, to_cpu_memory=to_cpu_memory) + with ( + pytest.warns(RuntimeWarning, match="Dask can not be made to emit F-contiguous arrays") + if (order == "F" and array_type.cls is types.DaskArray) + else nullcontext(), + WARNS_NUMBA if issubclass(array_type.cls, types.CSBase) and not find_spec("numba") else nullcontext(), + ): + arr = to_dense(x, order=order, to_cpu_memory=to_cpu_memory) + assert_expected_cls(x, arr, to_cpu_memory=to_cpu_memory) assert arr.shape == (2, 3) + # Dask is unreliable: for explicit “F”, we emit a warning (tested above), for “K” we just ignore the result + if not (array_type.cls is types.DaskArray and order in {"F", "K"}): + assert_expected_order(x, arr, order=order) @pytest.mark.parametrize("to_cpu_memory", [True, False], ids=["to_cpu_memory", "not_to_cpu_memory"]) -def test_to_dense_extra(coo_matrix_type: ArrayType[Array], *, to_cpu_memory: bool) -> None: +@pytest.mark.parametrize("order", argvalues=["K", "C", "F"]) # “A” behaves like “K” +def test_to_dense_extra(coo_matrix_type: ArrayType[types.COOBase | types.CupyCOOMatrix], *, order: Literal["K", "C", "F"], to_cpu_memory: bool) -> None: src_mtx = coo_matrix_type([[1, 2, 3], [4, 5, 6]], dtype=np.float32) + with WARNS_NUMBA if not find_spec("numba") else nullcontext(): - arr = to_dense(src_mtx, to_cpu_memory=to_cpu_memory) + arr = to_dense(src_mtx, order=order, to_cpu_memory=to_cpu_memory) + assert_expected_cls(src_mtx, arr, to_cpu_memory=to_cpu_memory) assert arr.shape == (2, 3) + assert_expected_order(src_mtx, arr, order=order) -def assert_expected_cls(orig: Array, converted: Array, *, to_cpu_memory: bool) -> None: +def assert_expected_cls(orig: ExtendedArray, converted: Array, *, to_cpu_memory: bool) -> None: match (to_cpu_memory, orig): case False, types.DaskArray(): assert isinstance(converted, types.DaskArray) - assert_expected_cls(orig._meta, converted._meta, to_cpu_memory=to_cpu_memory) # noqa: SLF001 + assert_expected_cls(orig.compute(), converted.compute(), to_cpu_memory=to_cpu_memory) case False, types.CupyArray() | types.CupySpMatrix(): assert isinstance(converted, types.CupyArray) case _: assert isinstance(converted, np.ndarray) + + +def assert_expected_order(orig: ExtendedArray, converted: Array, *, order: Literal["K", "C", "F"]) -> None: + match converted: + case types.CupyArray() | np.ndarray(): + orders = {order_exp: converted.flags[f"{order_exp}_CONTIGUOUS"] for order_exp in (get_orders(orig) if order == "K" else {order})} # type: ignore[index] + assert any(orders.values()), orders + case types.DaskArray(): + assert_expected_order(orig, converted.compute(), order=order) + case _: + pytest.fail(f"Unsupported array type: {type(converted)}") + + +def get_orders(orig: ExtendedArray) -> Iterable[Literal["C", "F"]]: + """Get the orders of an array. + + Numpy arrays with at most one axis of a length >1 are valid in both orders. + So are COO sparse matrices/arrays. + """ + match orig: + case np.ndarray() | types.CupyArray(): + if orig.flags.c_contiguous: + yield "C" + if orig.flags.f_contiguous: + yield "F" + case _ if isinstance(orig, types.CSBase | types.COOBase | types.CupyCSMatrix | types.CupyCOOMatrix | types.CSDataset): + if orig.format in {"csr", "coo"}: + yield "C" + if orig.format in {"csc", "coo"}: + yield "F" + case types.DaskArray(): + yield from get_orders(orig.compute()) + case types.ZarrArray() | types.H5Dataset(): + yield "C" + case _: + pytest.fail(f"Unsupported array type: {type(orig)}") diff --git a/typings/cupy/_core/core.pyi b/typings/cupy/_core/core.pyi index ccd3874..f8d459e 100644 --- a/typings/cupy/_core/core.pyi +++ b/typings/cupy/_core/core.pyi @@ -3,6 +3,8 @@ from types import EllipsisType from typing import Any, Literal, Self, overload import numpy as np +from cupy.cuda import Stream +from numpy._core.multiarray import flagsobj from numpy.typing import NDArray class ndarray: @@ -10,9 +12,12 @@ class ndarray: shape: tuple[int, ...] size: int ndim: int + flags: flagsobj # cupy-specific - def get(self) -> NDArray[Any]: ... + def get( + self, stream: Stream | None = None, order: Literal["C", "F", "A"] = "C", out: NDArray[Any] | None = None, blocking: bool = True + ) -> NDArray[Any]: ... # operators def __array__(self) -> NDArray[Any]: ...