161 changes: 148 additions & 13 deletions src/fast_array_utils/stats/__init__.py
@@ -7,10 +7,11 @@

from __future__ import annotations

from typing import TYPE_CHECKING, overload
from typing import TYPE_CHECKING, cast, get_args, overload

from .._validation import validate_axis
from ..typing import CpuArray, DiskArray, GpuArray # noqa: TC001
from ._generic_ops import DtypeOps


if TYPE_CHECKING:
@@ -21,9 +22,11 @@
from optype.numpy import ToDType

from .. import types
from ._generic_ops import Ops
from ._typing import NoDtypeOps, StatFunDtype, StatFunNoDtype


__all__ = ["is_constant", "mean", "mean_var", "sum"]
__all__ = ["is_constant", "max", "mean", "mean_var", "min", "sum"]


@overload
@@ -201,26 +204,161 @@ def mean_var(
return mean_var_(x, axis=axis, correction=correction) # type: ignore[no-any-return]


@overload
def _mk_generic_op(op: NoDtypeOps) -> StatFunNoDtype: ...
@overload
def _mk_generic_op(op: DtypeOps) -> StatFunDtype: ...


# TODO(flying-sheep): support CSDataset
# https://github.com/scverse/fast-array-utils/issues/52
def _mk_generic_op(op: Ops) -> StatFunNoDtype | StatFunDtype:
def _generic_op(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
/,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
keep_cupy_as_array: bool = False,
) -> NDArray[Any] | np.number[Any] | types.CupyArray | types.DaskArray:
from ._generic_ops import generic_op

assert dtype is None or op in get_args(DtypeOps), f"`dtype` is not supported for operation '{op}'"

validate_axis(x.ndim, axis)
return generic_op(x, op, axis=axis, keep_cupy_as_array=keep_cupy_as_array, dtype=dtype)

_generic_op.__name__ = op
return cast("StatFunNoDtype | StatFunDtype", _generic_op)


_min = _mk_generic_op("min")
_max = _mk_generic_op("max")
_sum = _mk_generic_op("sum")


@overload
def sum(x: CpuArray | DiskArray, /, *, axis: None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> np.number[Any]: ...
def min(x: CpuArray | DiskArray, /, *, axis: None = None, keep_cupy_as_array: bool = False) -> np.number[Any]: ...
@overload
def sum(x: CpuArray | DiskArray, /, *, axis: Literal[0, 1], dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> NDArray[Any]: ...
def min(x: CpuArray | DiskArray, /, *, axis: Literal[0, 1], keep_cupy_as_array: bool = False) -> NDArray[Any]: ...
@overload
def min(x: GpuArray, /, *, axis: None = None, keep_cupy_as_array: Literal[False] = False) -> np.number[Any]: ...
@overload
def min(x: GpuArray, /, *, axis: None, keep_cupy_as_array: Literal[True]) -> types.CupyArray: ...
@overload
def min(x: GpuArray, /, *, axis: Literal[0, 1], keep_cupy_as_array: bool = False) -> types.CupyArray: ...
@overload
def min(x: types.DaskArray, /, *, axis: Literal[0, 1] | None = None, keep_cupy_as_array: bool = False) -> types.DaskArray: ...
def min(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
/,
*,
axis: Literal[0, 1] | None = None,
keep_cupy_as_array: bool = False,
) -> object:
"""Find the minimum along both or one axis.

Parameters
----------
x
Array to find the minimum(s) in.
axis
Axis to reduce over.

Returns
-------
If ``axis`` is :data:`None`, then the minimum element is returned as a scalar.
Otherwise, the minimum along the given axis is returned as a 1D array.

Example
-------
>>> import numpy as np
>>> x = np.array([
... [0, 1, 2],
... [1, 1, 1],
... ])
>>> min(x)
0
>>> min(x, axis=0)
array([0, 1, 1])
>>> min(x, axis=1)
array([0, 1])

See Also
--------
:func:`numpy.min`

"""
return _min(x, axis=axis, keep_cupy_as_array=keep_cupy_as_array)


@overload
def sum(x: GpuArray, /, *, axis: None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: Literal[False] = False) -> np.number[Any]: ...
def max(x: CpuArray | DiskArray, /, *, axis: None = None, keep_cupy_as_array: bool = False) -> np.number[Any]: ...
@overload
def sum(x: GpuArray, /, *, axis: None, dtype: DTypeLike | None = None, keep_cupy_as_array: Literal[True]) -> types.CupyArray: ...
def max(x: CpuArray | DiskArray, /, *, axis: Literal[0, 1], keep_cupy_as_array: bool = False) -> NDArray[Any]: ...
@overload
def sum(x: GpuArray, /, *, axis: Literal[0, 1], dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> types.CupyArray: ...
def max(x: GpuArray, /, *, axis: None = None, keep_cupy_as_array: Literal[False] = False) -> np.number[Any]: ...
@overload
def max(x: GpuArray, /, *, axis: None, keep_cupy_as_array: Literal[True]) -> types.CupyArray: ...
@overload
def max(x: GpuArray, /, *, axis: Literal[0, 1], keep_cupy_as_array: bool = False) -> types.CupyArray: ...
@overload
def max(x: types.DaskArray, /, *, axis: Literal[0, 1] | None = None, keep_cupy_as_array: bool = False) -> types.DaskArray: ...
def max(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
/,
*,
axis: Literal[0, 1] | None = None,
keep_cupy_as_array: bool = False,
) -> object:
"""Find the maximum along both or one axis.

Parameters
----------
x
Array to find the maximum(s) in.
axis
Axis to reduce over.

Returns
-------
If ``axis`` is :data:`None`, then the maximum element is returned as a scalar.
Otherwise, the maximum along the given axis is returned as a 1D array.

@overload
def sum(x: types.DaskArray, /, *, axis: Literal[0, 1] | None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> types.DaskArray: ...
Example
-------
>>> import numpy as np
>>> x = np.array([
... [0, 1, 2],
... [0, 0, 0],
... ])
>>> max(x)
2
>>> max(x, axis=0)
array([0, 1, 2])
>>> max(x, axis=1)
array([2, 0])

See Also
--------
:func:`numpy.max`

"""
return _max(x, axis=axis, keep_cupy_as_array=keep_cupy_as_array)


@overload
def sum(x: CpuArray | DiskArray, /, *, axis: None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> np.number[Any]: ...
@overload
def sum(x: CpuArray | DiskArray, /, *, axis: Literal[0, 1], dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> NDArray[Any]: ...
@overload
def sum(x: GpuArray, /, *, axis: None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: Literal[False] = False) -> np.number[Any]: ...
@overload
def sum(x: GpuArray, /, *, axis: None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: Literal[True]) -> types.CupyArray: ...
@overload
def sum(x: GpuArray, /, *, axis: Literal[0, 1], dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> types.CupyArray: ...
@overload
def sum(x: types.DaskArray, /, *, axis: Literal[0, 1] | None = None, dtype: DTypeLike | None = None, keep_cupy_as_array: bool = False) -> types.DaskArray: ...
def sum(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
/,
@@ -262,7 +400,4 @@ def sum(
:func:`numpy.sum`

"""
from ._sum import sum_

validate_axis(x.ndim, axis)
return sum_(x, axis=axis, dtype=dtype, keep_cupy_as_array=keep_cupy_as_array)
return _sum(x, axis=axis, dtype=dtype, keep_cupy_as_array=keep_cupy_as_array)
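
For orientation, here is a minimal usage sketch of the reworked public API (not part of the diff). It assumes the package is importable as `fast_array_utils` with this PR applied, and it only uses the documented parameters (`axis`, plus `dtype` for dtype-aware ops like `sum`):

```python
# Minimal usage sketch (assumes this PR is installed as `fast_array_utils`).
import numpy as np

from fast_array_utils import stats

x = np.array([
    [0, 1, 2],
    [1, 1, 1],
])

stats.min(x)                            # 0 — scalar when axis is None
stats.max(x, axis=0)                    # array([1, 1, 2]) — column-wise maxima
stats.sum(x, axis=1, dtype=np.float64)  # array([3., 3.]) — `dtype` only applies to dtype-aware ops
```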
110 changes: 110 additions & 0 deletions src/fast_array_utils/stats/_generic_ops.py
@@ -0,0 +1,110 @@
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations

from functools import singledispatch
from typing import TYPE_CHECKING, cast, get_args

import numpy as np

from .. import types
from ._typing import DtypeOps
from ._utils import _dask_inner


if TYPE_CHECKING:
from typing import Any, Literal, TypeAlias

from numpy.typing import DTypeLike, NDArray

from ..typing import CpuArray, DiskArray, GpuArray
from ._typing import Ops

ComplexAxis: TypeAlias = tuple[Literal[0], Literal[1]] | tuple[Literal[0, 1]] | Literal[0, 1] | None


def _run_numpy_op(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
op: Ops,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
) -> NDArray[Any] | np.number[Any] | types.CupyArray | types.DaskArray:
kwargs = {"dtype": dtype} if op in get_args(DtypeOps) else {}
return getattr(np, op)(x, axis=axis, **kwargs) # type: ignore[no-any-return]


@singledispatch
def generic_op(
x: CpuArray | GpuArray | DiskArray | types.DaskArray,
/,
op: Ops,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
keep_cupy_as_array: bool = False,
) -> NDArray[Any] | np.number[Any] | types.CupyArray | types.DaskArray:
del keep_cupy_as_array
if TYPE_CHECKING:
# these are never passed to this fallback function, but `singledispatch` wants them
assert not isinstance(x, types.CSBase | types.DaskArray | types.CupyArray | types.CupyCSMatrix)
# np supports these, but doesn’t know it. (TODO: test cupy)
assert not isinstance(x, types.ZarrArray | types.H5Dataset)
return cast("NDArray[Any] | np.number[Any]", _run_numpy_op(x, op, axis=axis, dtype=dtype))


@generic_op.register(types.CupyArray | types.CupyCSMatrix)
def _generic_op_cupy(
x: GpuArray,
/,
op: Ops,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
keep_cupy_as_array: bool = False,
) -> types.CupyArray | np.number[Any]:
arr = cast("types.CupyArray", _run_numpy_op(x, op, axis=axis, dtype=dtype))
return cast("np.number[Any]", arr.get()[()]) if not keep_cupy_as_array and axis is None else arr.squeeze()


@generic_op.register(types.CSBase)
def _generic_op_cs(
x: types.CSBase,
/,
op: Ops,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
keep_cupy_as_array: bool = False,
) -> NDArray[Any] | np.number[Any]:
del keep_cupy_as_array
import scipy.sparse as sp

# TODO(flying-sheep): once scipy fixes this issue, instead of all this,
# just convert to sparse array, then `return x.{op}(dtype=dtype)`
# https://github.com/scipy/scipy/issues/23768

kwargs = {"dtype": dtype} if op in get_args(DtypeOps) else {}
if axis is None:
return cast("np.number[Any]", getattr(x.data, op)(**kwargs))
if TYPE_CHECKING: # scipy-stubs thinks e.g. "int64" is invalid, which isn’t true
assert isinstance(dtype, np.dtype | type | None)
# convert to array so dimensions collapse as expected
x = (sp.csr_array if x.format == "csr" else sp.csc_array)(x, **kwargs) # type: ignore[call-overload]
return cast("NDArray[Any] | np.number[Any]", getattr(x, op)(axis=axis))


@generic_op.register(types.DaskArray)
def _generic_op_dask(
x: types.DaskArray,
/,
op: Ops,
*,
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
keep_cupy_as_array: bool = False,
) -> types.DaskArray:
if op in get_args(DtypeOps) and dtype is None:
# Explicitly use numpy result dtype (e.g. `NDArray[bool].sum().dtype == int64`)
dtype = getattr(np, op)(np.zeros(1, dtype=x.dtype)).dtype

return _dask_inner(x, op, axis=axis, dtype=dtype, keep_cupy_as_array=keep_cupy_as_array)
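
To make the dispatch structure above easier to follow: `generic_op` is a `functools.singledispatch` entry point whose registered overloads handle CuPy, scipy sparse, and Dask inputs, with plain NumPy as the fallback. Below is a standalone sketch of that pattern (illustrative only; `reduce_op` is a hypothetical name, not this module's API, and it assumes a SciPy recent enough to expose `scipy.sparse.sparray`):

```python
from functools import singledispatch

import numpy as np
import scipy.sparse as sp


@singledispatch
def reduce_op(x, op: str, *, axis=None):
    # Fallback: anything NumPy can reduce directly.
    return getattr(np, op)(x, axis=axis)


@reduce_op.register(sp.sparray)
def _(x, op: str, *, axis=None):
    # Sparse arrays implement the reductions themselves.
    return getattr(x, op)(axis=axis)


x = sp.csr_array(np.array([[0, 1, 2], [1, 1, 1]]))
print(reduce_op(x, "min"))                    # 0 — dispatches to the sparse overload
print(reduce_op(x.toarray(), "sum", axis=0))  # [1 2 3] — falls back to NumPy
```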
6 changes: 3 additions & 3 deletions src/fast_array_utils/stats/_mean.py
@@ -5,7 +5,7 @@

import numpy as np

from ._sum import sum_
from . import sum


if TYPE_CHECKING:
@@ -24,6 +24,6 @@ def mean_(
axis: Literal[0, 1] | None = None,
dtype: DTypeLike | None = None,
) -> NDArray[np.number[Any]] | np.number[Any] | types.DaskArray:
total = sum_(x, axis=axis, dtype=dtype)
total = sum(x, axis=axis, dtype=dtype) # type: ignore[misc,arg-type]
n = np.prod(x.shape) if axis is None else x.shape[axis]
return total / n # type: ignore[operator,return-value]
return total / n # type: ignore[no-any-return]
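
Context for the `_mean.py` change: the helper now routes through the public `sum`, and the mean is still computed as that sum divided by the number of reduced elements (`np.prod(x.shape)` for `axis=None`, otherwise `x.shape[axis]`). A small standalone check of that arithmetic, assuming plain NumPy inputs:

```python
import numpy as np

x = np.array([[0, 1, 2], [1, 1, 1]])

# axis=None: grand total divided by the total element count
assert x.sum(dtype=np.float64) / np.prod(x.shape) == 1.0

# axis=0: column sums divided by the number of rows (x.shape[0])
np.testing.assert_allclose(x.sum(axis=0) / x.shape[0], [0.5, 1.0, 1.5])
```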