From 106e48efec714486aead2c3d9210c1c85e008e3e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 23 May 2024 22:35:33 +0200 Subject: [PATCH 01/13] wip --- src/zarr/array.py | 241 ++++- src/zarr/chunk_grids.py | 17 +- src/zarr/codecs/sharding.py | 7 +- src/zarr/indexing2.py | 1094 ++++++++++++++++++++++ src/zarr/metadata.py | 1 + tests/v3/test_indexing.py | 1696 +++++++++++++++++++++++++++++++++++ tests/v3/util.py | 64 ++ 7 files changed, 3077 insertions(+), 43 deletions(-) create mode 100644 src/zarr/indexing2.py create mode 100644 tests/v3/test_indexing.py create mode 100644 tests/v3/util.py diff --git a/src/zarr/array.py b/src/zarr/array.py index 7da39c285e..9d297da442 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import math # Notes on what I've changed here: # 1. Split Array into AsyncArray and Array @@ -35,10 +36,27 @@ concurrent_map, ) from zarr.config import config -from zarr.indexing import BasicIndexer +from zarr.indexing2 import ( + BasicIndexer, + BlockIndex, + BlockIndexer, + CoordinateIndexer, + Fields, + Indexer, + MaskIndexer, + OIndex, + OrthogonalIndexer, + VIndex, + is_pure_fancy_indexing, + is_pure_orthogonal_indexing, + pop_fields, +) from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, parse_indexing_order from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import sync +from zarr.v2.indexing import check_fields + +CoordinateSelection = Iterable[int | Iterable[int]] def parse_array_metadata(data: Any) -> ArrayMetadata: @@ -355,55 +373,62 @@ def dtype(self) -> np.dtype[Any]: def attrs(self) -> dict[str, JSON]: return self.metadata.attributes - async def getitem( - self, selection: Selection, *, factory: Factory.Create = NDBuffer.create + async def _get_selection( + self, + indexer: Indexer, + *, + factory: Factory.Create = NDBuffer.create, + fields: Fields | None = None, ) -> NDArrayLike: - indexer = BasicIndexer( - 
selection, - shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, - ) + # check fields are sensible + out_dtype = check_fields(fields, self.metadata.dtype) # setup output array out = factory( shape=indexer.shape, - dtype=self.metadata.dtype, + dtype=out_dtype, order=self.order, - fill_value=0, # TODO use fill_value + fill_value=self.metadata.fill_value, ) + if math.prod(indexer.shape) > 0: + # reading chunks and decoding them + await self.metadata.codec_pipeline.read( + [ + ( + self.store_path / self.metadata.encode_chunk_key(chunk_coords), + self.metadata.get_chunk_spec(chunk_coords, self.order), + chunk_selection, + out_selection, + ) + for chunk_coords, chunk_selection, out_selection in indexer + ], + out, + ) + return out.as_ndarray_like() - # reading chunks and decoding them - await self.metadata.codec_pipeline.read( - [ - ( - self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, self.order), - chunk_selection, - out_selection, - ) - for chunk_coords, chunk_selection, out_selection in indexer - ], - out, + async def getitem( + self, selection: Selection, *, factory: Factory.Create = NDBuffer.create + ) -> NDArrayLike: + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_grid=self.metadata.chunk_grid, ) - return out.as_ndarray_like() + return await self._get_selection(indexer, factory=factory) async def _save_metadata(self, metadata: ArrayMetadata) -> None: to_save = metadata.to_buffer_dict() awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await gather(*awaitables) - async def setitem( + async def _set_selection( self, - selection: Selection, + indexer: Indexer, value: NDArrayLike, + *, factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, + fields: Fields | None = None, ) -> None: - indexer = BasicIndexer( - selection, - shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, - ) - sel_shape = indexer.shape # 
check value shape @@ -435,6 +460,19 @@ async def setitem( value_buffer, ) + async def setitem( + self, + selection: Selection, + value: NDArrayLike, + factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, + ) -> None: + indexer = BasicIndexer( + selection, + shape=self.metadata.shape, + chunk_grid=self.metadata.chunk_grid, + ) + return await self._set_selection(indexer, value, factory=factory) + async def resize( self, new_shape: ChunkCoords, delete_outside_chunks: bool = True ) -> AsyncArray: @@ -583,14 +621,143 @@ def order(self) -> Literal["C", "F"]: return self._async_array.order def __getitem__(self, selection: Selection) -> NDArrayLike: - return sync( - self._async_array.getitem(selection), - ) + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + result = self.vindex[selection] + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + result = self.get_orthogonal_selection(pure_selection, fields=fields) + else: + result = self.get_basic_selection(pure_selection, fields=fields) + return result def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - sync( - self._async_array.setitem(selection, value), - ) + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, self.ndim): + self.vindex[selection] = value + elif is_pure_orthogonal_indexing(pure_selection, self.ndim): + self.set_orthogonal_selection(pure_selection, value, fields=fields) + else: + self.set_basic_selection(pure_selection, value, fields=fields) + + def get_basic_selection( + self, selection: Selection = Ellipsis, out=None, fields: Fields | None = None + ) -> NDArrayLike: + check_fields(fields, self.dtype) + if self.shape == (): + raise NotImplementedError + else: + return sync( + self._async_array._get_selection( + BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + ) + ) + + def set_basic_selection( + self, selection: Selection, value: NDArrayLike, fields: Fields | 
None = None + ) -> None: + check_fields(fields, self.dtype) + indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_orthogonal_selection( + self, selection: Selection, fields: Fields | None = None + ) -> NDArrayLike: + check_fields(fields, self.dtype) + + # setup indexer + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + return sync(self._async_array._get_selection(indexer=indexer)) + + def set_orthogonal_selection( + self, selection: Selection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) + + return sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_mask_selection( + self, mask: npt.NDArray[Any], fields: Fields | None = None + ) -> NDArrayLike: + # check args + check_fields(fields, self.dtype) + + # setup indexer + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + + return sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + + def set_mask_selection( + self, mask: npt.NDArray[Any], value: NDArrayLike, fields: Fields | None = None + ) -> None: + # setup indexer + indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) + + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_coordinate_selection( + self, selection: CoordinateSelection, fields: Fields | None = None + ) -> NDArrayLike: + check_fields(fields, self.dtype) + + # setup indexer + indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + + out = sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + + # restore shape + out = out.reshape(indexer.sel_shape) + + return out + + def set_coordinate_selection( + self, selection: CoordinateSelection, value: NDArrayLike, fields: Fields | None = None + ) -> None: + # setup indexer + indexer = 
CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) + + # # handle value - need ndarray-like flatten value + # if not is_scalar(value, self._dtype): + # try: + # value = ensure_ndarray_like(value) + # except TypeError: + # # Handle types like `list` or `tuple` + # value = np.array(value, like=self._meta_array) + if hasattr(value, "shape") and len(value.shape) > 1: + value = value.reshape(-1) + + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + def get_block_selection( + self, selection: Selection, fields: Fields | None = None + ) -> NDArrayLike: + # check args + check_fields(fields, self.dtype) + + # setup indexer + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + + return sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + + def set_block_selection( + self, selection: Selection, value: npt.NDArray[Any], fields: Fields | None = None + ) -> None: + # setup indexer + indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) + + sync(self._async_array._set_selection(indexer, value, fields=fields)) + + @property + def vindex(self) -> Any: + return VIndex(self) + + @property + def oindex(self) -> Any: + return OIndex(self) + + @property + def blocks(self) -> Any: + return BlockIndex(self) def resize(self, new_shape: ChunkCoords) -> Array: return type(self)( diff --git a/src/zarr/chunk_grids.py b/src/zarr/chunk_grids.py index f6366b8038..4fe90e4358 100644 --- a/src/zarr/chunk_grids.py +++ b/src/zarr/chunk_grids.py @@ -1,8 +1,11 @@ from __future__ import annotations import itertools +import operator +from abc import abstractmethod from collections.abc import Iterator from dataclasses import dataclass +from functools import reduce from typing import TYPE_CHECKING from zarr.abc.metadata import Metadata @@ -31,8 +34,13 @@ def from_dict(cls, data: dict[str, JSON] | ChunkGrid) -> ChunkGrid: return RegularChunkGrid._from_dict(data) raise ValueError(f"Unknown chunk grid. 
Got {name_parsed}.") + @abstractmethod def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: - raise NotImplementedError + pass + + @abstractmethod + def get_nchunks(self, array_shape: ChunkCoords) -> int: + pass @dataclass(frozen=True) @@ -57,3 +65,10 @@ def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: return itertools.product( *(range(0, _ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) ) + + def get_nchunks(self, array_shape: ChunkCoords) -> int: + return reduce( + operator.mul, + (_ceildiv(s, c) for s, c in zip(array_shape, self.chunk_shape, strict=True)), + 1, + ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a68577be68..0dc958e11f 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -34,11 +34,8 @@ parse_shapelike, product, ) -from zarr.indexing import ( - BasicIndexer, - c_order_iter, - morton_order_iter, -) +from zarr.indexing import c_order_iter, morton_order_iter +from zarr.indexing2 import BasicIndexer from zarr.metadata import ArrayMetadata, parse_codecs if TYPE_CHECKING: diff --git a/src/zarr/indexing2.py b/src/zarr/indexing2.py new file mode 100644 index 0000000000..24a8e8715e --- /dev/null +++ b/src/zarr/indexing2.py @@ -0,0 +1,1094 @@ +from __future__ import annotations + +from enum import Enum +import itertools +import math +import numbers +import operator +from collections.abc import Iterable, Iterator +from dataclasses import dataclass +from functools import reduce +from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable + +import numpy as np +import numpy.typing as npt + +from zarr.common import ChunkCoords, SliceSelection +from zarr.v2.errors import ( + ArrayIndexError, + BoundsCheckError, + NegativeStepError, + VindexInvalidSelectionError, + err_too_many_indices, +) + +if TYPE_CHECKING: + from zarr.array import Array + from zarr.buffer import NDArrayLike + from zarr.chunk_grids import 
ChunkGrid + +Selector = int | slice | npt.NDArray[Any] +Selection = tuple[Selector, ...] + + +@runtime_checkable +class Indexer(Protocol): + shape: ChunkCoords + + def __iter__(self) -> Iterator[ChunkProjection]: + ... + + +def is_integer(x: Selector) -> bool: + """True if x is an integer (both pure Python or NumPy). + + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) + + +def is_integer_list(x: Selector) -> bool: + """True if x is a list of integers. + + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. + """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x: Selector, ndim: int | None = None) -> bool: + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_bool_array(x: Selector, ndim: int | None = None) -> bool: + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: + if np.isscalar(value): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + +def is_pure_fancy_indexing(selection: Selection, ndim: int) -> bool: + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). 
+ """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. + no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) + + +def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> bool: + if not ndim: + return False + + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, int | slice) + for elem in selection + ) + ) + + +def get_chunk_shape(chunk_grid: ChunkGrid) -> ChunkCoords: + from zarr.chunk_grids import RegularChunkGrid + + assert isinstance( + chunk_grid, RegularChunkGrid + ), "Only regular chunk grid is supported, currently." + return chunk_grid.chunk_shape + + +def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +class ChunkDimProjection(NamedTuple): + """A mapping from chunk to output array for a single dimension. + + Parameters + ---------- + dim_chunk_ix + Index of chunk. 
+ dim_chunk_sel + Selection of items from chunk array. + dim_out_sel + Selection of items in target (output) array. + + """ + + dim_chunk_ix: int + dim_chunk_sel: slice + dim_out_sel: slice + + +@dataclass(frozen=True) +class IntDimIndexer: + dim_sel: int + dim_len: int + dim_chunk_len: int + nitems: int = 1 + + def __init__(self, dim_sel: int, dim_len: int, dim_chunk_len: int): + object.__setattr__(self, "dim_sel", normalize_integer_selection(dim_sel, dim_len)) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def ceildiv(a: int, b: int) -> int: + return math.ceil(a / b) + + +@dataclass(frozen=True) +class SliceDimIndexer: + dim_len: int + dim_chunk_len: int + + start: int + stop: int + step: int + + def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int): + # normalize + start, stop, step = dim_sel.indices(dim_len) + if step < 1: + raise NegativeStepError + + object.__setattr__(self, "start", start) + object.__setattr__(self, "stop", stop) + object.__setattr__(self, "step", step) + + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) + object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + # figure out the range of chunks we need to visit + dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) + + # iterate over chunks in range + for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): + # compute offsets for chunk within overall array + 
dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len) + + # determine chunk length, accounting for trailing chunk + dim_chunk_len = dim_limit - dim_offset + + if self.start < dim_offset: + # selection starts before current chunk + dim_chunk_sel_start = 0 + remainder = (dim_offset - self.start) % self.step + if remainder: + dim_chunk_sel_start += self.step - remainder + # compute number of previous items, provides offset into output array + dim_out_offset = ceildiv((dim_offset - self.start), self.step) + + else: + # selection starts within current chunk + dim_chunk_sel_start = self.start - dim_offset + dim_out_offset = 0 + + if self.stop > dim_limit: + # selection ends after current chunk + dim_chunk_sel_stop = dim_chunk_len + + else: + # selection ends within current chunk + dim_chunk_sel_stop = self.stop - dim_offset + + dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection: Selection, shape: ChunkCoords): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection: Selection, shape: ChunkCoords): + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 
1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return selection + + +def replace_lists(selection: Selection) -> Selection: + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) + + +def ensure_tuple(v): + if not isinstance(v, tuple): + v = (v,) + return v + + +class ChunkProjection(NamedTuple): + """A mapping of items from chunk to output array. Can be used to extract items from the + chunk array for loading into an output array. Can also be used to extract items from a + value array for setting/updating in a chunk array. + + Parameters + ---------- + chunk_coords + Indices of chunk. + chunk_selection + Selection of items from chunk array. + out_selection + Selection of items in target (output) array. 
+ + """ + + chunk_coords: ChunkCoords + chunk_selection: SliceSelection + out_selection: SliceSelection + + +def is_slice(s: slice) -> bool: + return isinstance(s, slice) + + +def is_contiguous_slice(s: slice) -> bool: + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s: slice) -> bool: + return is_slice(s) and (s.step is None or s.step >= 1) + + +def is_contiguous_selection(selection: Selection) -> bool: + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection: Selection) -> bool: + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +class BasicIndexer: + dim_indexers: list[IntDimIndexer | SliceDimIndexer] + shape: ChunkCoords + drop_axes: None + + def __init__( + self, + selection: Selection, + shape: ChunkCoords, + chunk_grid: ChunkGrid, + ): + chunk_shape = get_chunk_shape(chunk_grid) + # handle ellipsis + selection = replace_ellipsis(selection, shape) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + "unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + self.dim_indexers = dim_indexers + self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) + self.drop_axes = None + + def __iter__(self) -> Iterator[ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + 
out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class BoolArrayDimIndexer: + dim_sel: npt.NDArray[np.bool_] + dim_len: int + dim_chunk_len: int + nchunks: int + + chunk_nitems: npt.NDArray[Any] + chunk_nitems_cumsum: npt.NDArray[Any] + nitems: int + dim_chunk_ixs: int + + def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + "Boolean array has the wrong length for dimension; expected {}, got {}".format( + dim_len, dim_sel.shape[0] + ) + ) + + # precompute number of selected items for each chunk + nchunks = ceildiv(dim_len, dim_chunk_len) + chunk_nitems = np.zeros(nchunks, dtype="i8") + for dim_chunk_ix in range(nchunks): + dim_offset = dim_chunk_ix * dim_chunk_len + chunk_nitems[dim_chunk_ix] = np.count_nonzero( + dim_sel[dim_offset : dim_offset + dim_chunk_len] + ) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + nitems = chunk_nitems_cumsum[-1] + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] + + # store attributes + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + 
dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order(Enum): + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a: npt.NDArray[Any]) -> Order: + diff = np.diff(a) + diff_positive = diff >= 0 + n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + dim_len: int + dim_chunk_len: int + nchunks: int + nitems: int + order: Order + dim_sel: int + dim_out_sel: int + chunk_nitems: int + dim_chunk_ixs: npt.NDArray[np.intp] + chunk_nitems_cumsum: npt.NDArray[np.intp] + + def __init__( + self, + dim_sel: int, + dim_len: int, + dim_chunk_len: int, + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, 
dim_len) + + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # store attributes + self.dim_len = dim_len + self.dim_chunk_len = dim_chunk_len + self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) + self.nitems = len(dim_sel) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + self.order = Order(order) + + if self.order == Order.INCREASING: + self.dim_sel = dim_sel + self.dim_out_sel = None + elif self.order == Order.DECREASING: + self.dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + else: + # sort indices to group by chunk + self.dim_out_sel = np.argsort(dim_sel_chunk) + self.dim_sel = np.take(dim_sel, self.dim_out_sel) + + # precompute number of selected items for each chunk + self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + + # find chunks that we need to visit + self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + + # compute offsets into the output array + self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, l: int): # noqa: E741 + return 
range(*s.indices(l))  # tail of a definition that starts before this chunk; kept verbatim


def ix_(selection: Selection, shape: ChunkCoords):
    """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_
    but with support for slices and single ints.

    Slices are expanded with ``slice_to_range`` against the matching entry of
    *shape*, single integers are wrapped in a one-element list, and every other
    item is passed through unchanged before the result is handed to ``np.ix_``.
    """

    # normalisation
    selection = replace_ellipsis(selection, shape)

    # replace slice and int as these are not supported by numpy.ix_
    selection = [
        slice_to_range(dim_sel, dim_len)
        if isinstance(dim_sel, slice)
        else [dim_sel]
        if is_integer(dim_sel)
        else dim_sel
        for dim_sel, dim_len in zip(selection, shape, strict=True)
    ]

    # now get numpy to convert to a coordinate selection
    selection = np.ix_(*selection)

    return selection


def oindex(a: npt.NDArray[Any], selection: Selection):
    """Implementation of orthogonal indexing with slices and ints.

    Axes that were indexed with a single integer are squeezed out of the
    result, because ``ix_`` turns each integer into a length-one list.
    """
    selection = replace_ellipsis(selection, a.shape)
    drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s))
    selection = ix_(selection, a.shape)
    result = a[selection]
    if drop_axes:
        result = result.squeeze(axis=drop_axes)
    return result


def oindex_set(a: npt.NDArray[Any], selection: Selection, value):
    """Assign *value* into *a* in place using an orthogonal selection.

    When *value* is not a scalar and some axes are indexed with a single
    integer, ``np.newaxis`` is inserted at those axes so that *value* lines up
    with the selection produced by ``ix_`` (which keeps those axes with
    length one).
    """
    selection = replace_ellipsis(selection, a.shape)
    drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s))
    selection = ix_(selection, a.shape)
    if not np.isscalar(value) and drop_axes:
        value = np.asanyarray(value)
        # one index item per axis of `a`; integer-indexed axes become new axes
        value_selection = [slice(None)] * len(a.shape)
        for i in drop_axes:
            value_selection[i] = np.newaxis
        value_selection = tuple(value_selection)
        value = value[value_selection]
    a[selection] = value


# noinspection PyProtectedMember
class OrthogonalIndexer:
    """Iterate over the chunk projections of an orthogonal (outer) selection.

    One per-dimension indexer is built for every axis. ``shape`` is the number
    of selected items along each axis that was not indexed with a single
    integer. ``drop_axes`` lists the integer-indexed axes, but only when the
    selection is not a basic selection; otherwise it is ``None``.

    Raises ``IndexError`` from the constructor when a selection item is not an
    integer, slice, integer array or Boolean array.
    """

    dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer]
    shape: ChunkCoords
    chunk_shape: ChunkCoords
    is_advanced: bool
    drop_axes: tuple[int, ...] | None

    def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid):
        chunk_shape = get_chunk_shape(chunk_grid)

        # handle ellipsis
        selection = replace_ellipsis(selection, shape)

        # normalize list to array
        selection = replace_lists(selection)

        # setup per-dimension indexers
        dim_indexers = []
        for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True):
            if is_integer(dim_sel):
                dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len)

            elif isinstance(dim_sel, slice):
                dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len)

            elif is_integer_array(dim_sel):
                dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len)

            elif is_bool_array(dim_sel):
                dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len)

            else:
                raise IndexError(
                    "unsupported selection item for orthogonal indexing; "
                    "expected integer, slice, integer array or Boolean "
                    f"array, got {type(dim_sel)!r}"
                )

            dim_indexers.append(dim_indexer)

        self.dim_indexers = dim_indexers
        # integer-indexed axes do not appear in the output shape
        self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer))
        self.chunk_shape = chunk_shape
        self.is_advanced = not is_basic_selection(selection)
        if self.is_advanced:
            self.drop_axes = tuple(
                i
                for i, dim_indexer in enumerate(self.dim_indexers)
                if isinstance(dim_indexer, IntDimIndexer)
            )
        else:
            self.drop_axes = None

    def __iter__(self) -> Iterator[ChunkProjection]:
        """Yield one ``ChunkProjection`` per combination of per-dimension projections."""
        for dim_projections in itertools.product(*self.dim_indexers):
            chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections)
            chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections)
            out_selection = tuple(
                p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None
            )

            # handle advanced indexing arrays orthogonally
            if self.is_advanced:
                # N.B., numpy doesn't support orthogonal indexing directly as yet,
                # so need to work around via np.ix_. Also np.ix_ does not support a
                # mixture of arrays and slices or integers, so need to convert slices
                # and integers into ranges.
                chunk_selection = ix_(chunk_selection, self.chunk_shape)

                # special case for non-monotonic indices
                if not is_basic_selection(out_selection):
                    out_selection = ix_(out_selection, self.shape)

            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)


@dataclass(frozen=True)
class OIndex:
    """Subscript helper that forwards ``[...]`` to the array's orthogonal selection methods.

    Field names are split off with ``pop_fields``; the remaining selection is
    made a tuple and its lists are normalised with ``replace_lists``.
    """

    array: Array

    def __getitem__(self, selection: Selection) -> NDArrayLike:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        return self.array.get_orthogonal_selection(selection, fields=fields)

    def __setitem__(self, selection: Selection, value: NDArrayLike) -> None:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        return self.array.set_orthogonal_selection(selection, value, fields=fields)


# noinspection PyProtectedMember
class BlockIndexer:
    """Iterate over the chunk projections of a block (whole-chunk) selection.

    Each selection item counts chunks, not elements: an integer picks one
    block and a slice (step ``1`` or ``None`` only) picks a run of blocks.
    Negative values are counted from the number of chunks along that axis.
    Items are converted to element slices by multiplying with the chunk size.

    Raises ``IndexError`` for any other selection item or slice step, and
    ``BoundsCheckError`` when the resulting element start is negative or not
    smaller than the axis length.
    """

    dim_indexers: list[SliceDimIndexer]
    shape: ChunkCoords
    drop_axes: None

    def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid):
        chunk_shape = get_chunk_shape(chunk_grid)

        # handle ellipsis
        selection = replace_ellipsis(selection, shape)

        # normalize list to array
        selection = replace_lists(selection)

        # setup per-dimension indexers
        dim_indexers = []
        for dim_sel, dim_len, dim_chunk_size in zip(selection, shape, chunk_shape, strict=True):
            # number of chunks along this axis, counting a trailing partial chunk
            dim_numchunks = math.ceil(dim_len / dim_chunk_size)

            if is_integer(dim_sel):
                if dim_sel < 0:
                    dim_sel = dim_numchunks + dim_sel

                start = dim_sel * dim_chunk_size
                stop = start + dim_chunk_size
                slice_ = slice(start, stop)

            elif is_slice(dim_sel):
                start = dim_sel.start if dim_sel.start is not None else 0
                stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks

                if dim_sel.step not in {1, None}:
                    raise IndexError(
                        "unsupported selection item for block indexing; "
                        f"expected integer or slice with step=1, got {type(dim_sel)!r}"
                    )

                # Can't reuse wraparound_indices because it expects a numpy array
                # We have integers here.
                if start < 0:
                    start = dim_numchunks + start
                if stop < 0:
                    stop = dim_numchunks + stop

                # convert block numbers to element offsets
                start = start * dim_chunk_size
                stop = stop * dim_chunk_size
                slice_ = slice(start, stop)

            else:
                raise IndexError(
                    "unsupported selection item for block indexing; "
                    f"expected integer or slice, got {type(dim_sel)!r}"
                )

            dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size)
            dim_indexers.append(dim_indexer)

            if start >= dim_len or start < 0:
                raise BoundsCheckError(dim_len)

        self.dim_indexers = dim_indexers
        self.shape = tuple(s.nitems for s in self.dim_indexers)
        self.drop_axes = None

    def __iter__(self) -> Iterator[ChunkProjection]:
        """Yield one ``ChunkProjection`` per combination of per-dimension projections."""
        for dim_projections in itertools.product(*self.dim_indexers):
            chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections)
            chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections)
            out_selection = tuple(
                p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None
            )

            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)


@dataclass(frozen=True)
class BlockIndex:
    """Subscript helper that forwards ``[...]`` to the array's block selection methods.

    Field names are split off with ``pop_fields``; the remaining selection is
    made a tuple and its lists are normalised with ``replace_lists``.
    """

    array: Array

    def __getitem__(self, selection: Selection) -> NDArrayLike:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        return self.array.get_block_selection(selection, fields=fields)

    def __setitem__(self, selection: Selection, value: NDArrayLike) -> None:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        return self.array.set_block_selection(selection, value, fields=fields)


# noinspection PyProtectedMember
def is_coordinate_selection(selection: Selection, shape: ChunkCoords) -> bool:
    """Return True when *selection* has one integer or integer array per entry of *shape*."""
    return (len(selection) == len(shape)) and all(
        is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection
    )


# noinspection PyProtectedMember
def is_mask_selection(selection: Selection, shape: ChunkCoords) -> bool:
    """Return True when *selection* is a single Boolean array whose shape equals *shape*."""
    return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape


# noinspection PyProtectedMember
class CoordinateIndexer:
    """Iterate over the chunk projections of a coordinate (point-wise) selection.

    The per-dimension coordinate arrays are broadcast against each other,
    flattened, and grouped by the chunk each point falls in. ``sel_shape``
    keeps the broadcast shape from before flattening, ``shape`` is the
    flattened shape, and ``sel_sort`` is the argsort used to group points by
    chunk (``None`` when the points were already in chunk order).

    Raises ``IndexError`` from the constructor when the selection is not one
    integer or integer array per dimension.
    """

    sel_shape: ChunkCoords
    selection: Selection
    sel_sort: npt.NDArray[np.intp] | None
    shape: ChunkCoords
    chunk_shape: ChunkCoords
    drop_axes: None
    # per-chunk bookkeeping filled in by __init__ and read by __iter__
    chunk_nitems: npt.NDArray[Any]
    chunk_nitems_cumsum: npt.NDArray[Any]
    chunk_rixs: npt.NDArray[np.intp]
    chunk_mixs: tuple[npt.NDArray[np.intp], ...]

    def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid):
        chunk_shape = get_chunk_shape(chunk_grid)

        # number of chunks along each axis; an empty shape is treated as one chunk
        if shape == ():
            cdata_shape = (1,)
        else:
            cdata_shape = tuple(math.ceil(s / c) for s, c in zip(shape, chunk_shape, strict=True))
        nchunks = math.prod(cdata_shape)

        # some initial normalization
        selection = ensure_tuple(selection)
        selection = tuple([i] if is_integer(i) else i for i in selection)
        selection = replace_lists(selection)

        # validation
        if not is_coordinate_selection(selection, shape):
            raise IndexError(
                "invalid coordinate selection; expected one integer "
                "(coordinate) array per dimension of the target array, "
                f"got {selection!r}"
            )

        # handle wraparound, boundscheck
        for dim_sel, dim_len in zip(selection, shape, strict=True):
            # handle wraparound
            # NOTE(review): the return value is ignored, so this presumably
            # rewrites dim_sel in place -- confirm against wraparound_indices
            wraparound_indices(dim_sel, dim_len)

            # handle out of bounds
            boundscheck_indices(dim_sel, dim_len)

        # compute chunk index for each point in the selection
        chunks_multi_index = tuple(
            dim_sel // dim_chunk_len
            for (dim_sel, dim_chunk_len) in zip(selection, chunk_shape, strict=True)
        )

        # broadcast selection - this will raise error if array dimensions don't match
        selection = np.broadcast_arrays(*selection)
        chunks_multi_index = np.broadcast_arrays(*chunks_multi_index)

        # remember shape of selection, because we will flatten indices for processing
        self.sel_shape = selection[0].shape if selection[0].shape else (1,)

        # flatten selection
        selection = [dim_sel.reshape(-1) for dim_sel in selection]
        chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index]

        # ravel chunk indices
        chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=cdata_shape)

        # group points by chunk
        if np.any(np.diff(chunks_raveled_indices) < 0):
            # optimisation, only sort if needed
            sel_sort = np.argsort(chunks_raveled_indices)
            selection = tuple(dim_sel[sel_sort] for dim_sel in selection)
        else:
            sel_sort = None

        # store attributes
        self.selection = selection
        self.sel_sort = sel_sort
        self.shape = selection[0].shape if selection[0].shape else (1,)
        self.chunk_shape = chunk_shape
        self.drop_axes = None

        # precompute number of selected items for each chunk
        self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks)
        self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems)
        # locate the chunks we need to process
        self.chunk_rixs = np.nonzero(self.chunk_nitems)[0]

        # unravel chunk indices
        self.chunk_mixs = np.unravel_index(self.chunk_rixs, cdata_shape)

    def __iter__(self) -> Iterator[ChunkProjection]:
        """Yield one ``ChunkProjection`` for every chunk that holds at least one selected point."""
        # iterate over chunks
        for i, chunk_rix in enumerate(self.chunk_rixs):
            chunk_coords = tuple(m[i] for m in self.chunk_mixs)
            # this chunk's points occupy [start, stop) of the chunk-grouped selection
            if chunk_rix == 0:
                start = 0
            else:
                start = self.chunk_nitems_cumsum[chunk_rix - 1]
            stop = self.chunk_nitems_cumsum[chunk_rix]
            if self.sel_sort is None:
                out_selection = slice(start, stop)
            else:
                # map the grouped positions back to the caller's original order
                out_selection = self.sel_sort[start:stop]

            chunk_offsets = tuple(
                dim_chunk_ix * dim_chunk_len
                for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.chunk_shape, strict=True)
            )
            # coordinates relative to the origin of this chunk
            chunk_selection = tuple(
                dim_sel[start:stop] - dim_chunk_offset
                for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True)
            )

            yield ChunkProjection(chunk_coords, chunk_selection, out_selection)


# noinspection PyProtectedMember
class MaskIndexer(CoordinateIndexer):
    """Coordinate indexer driven by a single Boolean mask.

    The mask must pass ``is_mask_selection`` for *shape*. It is converted to
    per-dimension coordinate arrays with ``np.nonzero`` and the rest of the
    work is done by ``CoordinateIndexer``.

    Raises ``IndexError`` when the selection is not a valid mask.
    """

    def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid):
        # some initial normalization
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)

        # validation
        if not is_mask_selection(selection, shape):
            # the literals are concatenated, so the first one needs its trailing space
            raise IndexError(
                "invalid mask selection; expected one Boolean (mask) "
                f"array with the same shape as the target array, got {selection!r}"
            )

        # convert to indices
        selection = np.nonzero(selection[0])

        # delegate the rest to superclass
        super().__init__(selection, shape, chunk_grid)


@dataclass(frozen=True)
class VIndex:
    """Subscript helper that forwards ``[...]`` to coordinate or mask selection.

    Field names are split off with ``pop_fields``. The remaining selection is
    sent to the array's coordinate-selection method when
    ``is_coordinate_selection`` accepts it, otherwise to the mask-selection
    method when ``is_mask_selection`` accepts it. Anything else raises
    ``VindexInvalidSelectionError``.
    """

    array: Array

    def __getitem__(self, selection: Selection) -> NDArrayLike:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        if is_coordinate_selection(selection, self.array.shape):
            return self.array.get_coordinate_selection(selection, fields=fields)
        elif is_mask_selection(selection, self.array.shape):
            return self.array.get_mask_selection(selection, fields=fields)
        else:
            raise VindexInvalidSelectionError(selection)

    def __setitem__(self, selection: Selection, value: NDArrayLike) -> None:
        fields, selection = pop_fields(selection)
        selection = ensure_tuple(selection)
        selection = replace_lists(selection)
        if is_coordinate_selection(selection, self.array.shape):
            self.array.set_coordinate_selection(selection, value, fields=fields)
        elif is_mask_selection(selection, self.array.shape):
            self.array.set_mask_selection(selection, value, fields=fields)
        else:
            raise VindexInvalidSelectionError(selection)


# Runtime union: check_fields passes it to isinstance(), so it must stay a real
# object rather than a string annotation.
Fields = str | list | tuple


def check_fields(fields: Fields | None, dtype: npt.DTypeLike) -> npt.DTypeLike:
    """Validate *fields* against a structured *dtype* and return the resulting dtype.

    Returns *dtype* unchanged when *fields* is ``None`` or empty. A single
    string yields ``dtype[fields]``; any other non-empty value yields a new
    structured dtype holding the named fields in the given order.

    Raises ``IndexError`` when *fields* is not a ``str``, ``list`` or
    ``tuple``, when *dtype* has no named fields, or when a requested field is
    missing.
    """
    # early out
    if fields is None:
        return dtype
    # check type
    if not isinstance(fields, Fields):
        raise IndexError(
            f"'fields' argument must be a string or list of strings; found {type(fields)!r}"
        )
    if fields:
        if dtype.names is None:
            raise IndexError("invalid 'fields' argument, array does not have any fields")
        try:
            if isinstance(fields, str):
                # single field selection
                out_dtype = dtype[fields]
            else:
                # multiple field selection
                out_dtype = np.dtype([(f, dtype[f]) for f in fields])
        except KeyError as e:
            raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e
        else:
            return out_dtype
    else:
        return dtype


def check_no_multi_fields(fields: Fields | None) -> Fields | None:
    """Collapse a one-element list of fields to its single item; reject longer lists.

    Values that are not a ``list`` (including ``None``) and empty lists are
    returned unchanged. Raises ``IndexError`` when the list has more than one
    entry.
    """
    if isinstance(fields, list):
        if len(fields) == 1:
            return fields[0]
        elif len(fields) > 1:
            raise IndexError("multiple fields are not supported for this operation")
    return fields


def pop_fields(selection: Selection) -> tuple[Fields | None, Selection]:
    """Split field names (strings) out of *selection*.

    Returns ``(fields, selection)``:

    * a bare string is one field with an empty ``()`` selection;
    * any other non-tuple value has no fields (``None``) and is returned as-is;
    * a tuple is split into its string items and the rest. On each side a
      single remaining item is unwrapped from its container, so a tuple with
      no strings yields ``fields == []``.
    """
    if isinstance(selection, str):
        # single field selection
        fields = selection
        selection = ()
    elif not isinstance(selection, tuple):
        # single selection item, no fields
        fields = None
        # leave selection as-is
    else:
        # multiple items, split fields from selection items
        fields = [f for f in selection if isinstance(f, str)]
        fields = fields[0] if len(fields) == 1 else fields
        selection = tuple(s for s in selection if not isinstance(s, str))
        selection = selection[0] if len(selection) == 1 else selection
    return fields, selection


def make_slice_selection(selection: Selection) -> list[int | slice]:
    """Rewrite integer-like items of *selection* as length-one slices.

    Integers and one-element ``np.ndarray`` items become
    ``slice(i, i + 1, 1)``. Every other item that is not an ``np.ndarray`` is
    passed through unchanged.

    Raises ``ArrayIndexError`` for an ``np.ndarray`` whose length is not 1.
    """
    ls = []
    for dim_selection in selection:
        if is_integer(dim_selection):
            ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1))
        elif isinstance(dim_selection, np.ndarray):
            if len(dim_selection) == 1:
                ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1))
            else:
                raise ArrayIndexError
        else:
            ls.append(dim_selection)
    return ls


# Patch metadata that shares the last source line with the code above; kept
# verbatim (as comments) so the hunk that follows does not lose its header.
# diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py
# index 58cc276c29..b2f99a9299 100644
# --- a/src/zarr/metadata.py
# +++ b/src/zarr/metadata.py
# @@ -115,6 +115,7 @@ def from_dtype(cls, dtype: np.dtype[Any]) -> DataType:
# class ArrayMetadata(Metadata, ABC):
# shape:
ChunkCoords chunk_grid: ChunkGrid + fill_value: Any attributes: dict[str, JSON] @property diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py new file mode 100644 index 0000000000..b46393451d --- /dev/null +++ b/tests/v3/test_indexing.py @@ -0,0 +1,1696 @@ +from __future__ import annotations + +from collections.abc import Iterator +from typing import Any +from uuid import uuid4 + +import numpy as np +import numpy.typing as npt +import pytest +from numpy.testing import assert_array_equal + +import zarr +from zarr.abc.store import Store +from zarr.common import ChunkCoords +from zarr.indexing2 import ( + make_slice_selection, + normalize_integer_selection, + oindex, + oindex_set, + replace_ellipsis, +) +from zarr.store.core import StorePath +from zarr.store.memory import MemoryStore + +from .util import CountingDict + + +@pytest.fixture +def store() -> Iterator[Store]: + yield StorePath(MemoryStore()) + + +def zarr_array_from_numpy_array( + store: StorePath, a: npt.NDArray[Any], chunk_shape: ChunkCoords | None = None +) -> zarr.Array: + z = zarr.Array.create( + store=store / str(uuid4()), + shape=a.shape, + dtype=a.dtype, + chunk_shape=chunk_shape or a.shape, + chunk_key_encoding=("v2", "."), + ) + z[:] = a + return z + + +def test_normalize_integer_selection(): + assert 1 == normalize_integer_selection(1, 100) + assert 99 == normalize_integer_selection(-1, 100) + with pytest.raises(IndexError): + normalize_integer_selection(100, 100) + with pytest.raises(IndexError): + normalize_integer_selection(1000, 100) + with pytest.raises(IndexError): + normalize_integer_selection(-1000, 100) + + +def test_replace_ellipsis(): + # 1D, single item + assert (0,) == replace_ellipsis(0, (100,)) + + # 1D + assert (slice(None),) == replace_ellipsis(Ellipsis, (100,)) + assert (slice(None),) == replace_ellipsis(slice(None), (100,)) + assert (slice(None, 100),) == replace_ellipsis(slice(None, 100), (100,)) + assert (slice(0, None),) == replace_ellipsis(slice(0, None), 
(100,)) + assert (slice(None),) == replace_ellipsis((slice(None), Ellipsis), (100,)) + assert (slice(None),) == replace_ellipsis((Ellipsis, slice(None)), (100,)) + + # 2D, single item + assert (0, 0) == replace_ellipsis((0, 0), (100, 100)) + assert (-1, 1) == replace_ellipsis((-1, 1), (100, 100)) + + # 2D, single col/row + assert (0, slice(None)) == replace_ellipsis((0, slice(None)), (100, 100)) + assert (0, slice(None)) == replace_ellipsis((0,), (100, 100)) + assert (slice(None), 0) == replace_ellipsis((slice(None), 0), (100, 100)) + + # 2D slice + assert (slice(None), slice(None)) == replace_ellipsis(Ellipsis, (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis(slice(None), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((Ellipsis, slice(None)), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis((slice(None), Ellipsis), (100, 100)) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), Ellipsis, slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (Ellipsis, slice(None), slice(None)), (100, 100) + ) + assert (slice(None), slice(None)) == replace_ellipsis( + (slice(None), slice(None), Ellipsis), (100, 100) + ) + + +@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") +def test_get_basic_selection_0d(store: StorePath): + # setup + a = np.array(42) + z = zarr_array_from_numpy_array(store, a) + + assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert 42 == z.get_basic_selection(()) + assert 42 == z[()] + + # # test out param + # b = np.zeros_like(a) + # z.get_basic_selection(Ellipsis, out=b) + # assert_array_equal(a, b) + + # test structured array + value = (b"aaa", 1, 4.2) + a = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr_array_from_numpy_array(store, a) + z[()] = value 
+ assert_array_equal(a, z.get_basic_selection(Ellipsis)) + assert_array_equal(a, z[...]) + assert a[()] == z.get_basic_selection(()) + assert a[()] == z[()] + assert b"aaa" == z.get_basic_selection((), fields="foo") + assert b"aaa" == z["foo"] + assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) + assert a[["foo", "bar"]] == z["foo", "bar"] + # test out param + b = np.zeros_like(a) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) + c = np.zeros_like(a[["foo", "bar"]]) + z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) + assert_array_equal(a[["foo", "bar"]], c) + + +basic_selections_1d = [ + # single value + 42, + -1, + # slices + slice(0, 1050), + slice(50, 150), + slice(0, 2000), + slice(-150, -50), + slice(-2000, 2000), + slice(0, 0), # empty result + slice(-1, 0), # empty result + # total selections + slice(None), + Ellipsis, + (), + (Ellipsis, slice(None)), + # slice with step + slice(None), + slice(None, None), + slice(None, None, 1), + slice(None, None, 10), + slice(None, None, 100), + slice(None, None, 1000), + slice(None, None, 10000), + slice(0, 1050), + slice(0, 1050, 1), + slice(0, 1050, 10), + slice(0, 1050, 100), + slice(0, 1050, 1000), + slice(0, 1050, 10000), + slice(1, 31, 3), + slice(1, 31, 30), + slice(1, 31, 300), + slice(81, 121, 3), + slice(81, 121, 30), + slice(81, 121, 300), + slice(50, 150), + slice(50, 150, 1), + slice(50, 150, 10), +] + + +basic_selections_1d_bad = [ + # only positive step supported + slice(None, None, -1), + slice(None, None, -10), + slice(None, None, -100), + slice(None, None, -1000), + slice(None, None, -10000), + slice(1050, -1, -1), + slice(1050, -1, -10), + slice(1050, -1, -100), + slice(1050, -1, -1000), + slice(1050, -1, -10000), + slice(1050, 0, -1), + slice(1050, 0, -10), + slice(1050, 0, -100), + slice(1050, 0, -1000), + slice(1050, 0, -10000), + slice(150, 50, -1), + slice(150, 50, -10), + slice(31, 1, -3), + slice(121, 81, -3), + slice(-1, 0, -1), + # 
bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +def _test_get_basic_selection(a, z, selection): + print(a, z, selection) + expect = a[selection] + actual = z.get_basic_selection(selection) + assert_array_equal(expect, actual) + actual = z[selection] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_get_basic_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection in basic_selections_1d: + _test_get_basic_selection(a, z, selection) + + for selection in basic_selections_1d_bad: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + with pytest.raises(IndexError): + z[selection] + + with pytest.raises(IndexError): + z.get_basic_selection([1, 0]) + + +basic_selections_2d = [ + # single row + 42, + -1, + (42, slice(None)), + (-1, slice(None)), + # single col + (slice(None), 4), + (slice(None), -1), + # row slices + slice(None), + slice(0, 1000), + slice(250, 350), + slice(0, 2000), + slice(-350, -250), + slice(0, 0), # empty result + slice(-1, 0), # empty result + slice(-2000, 0), + slice(-2000, 2000), + # 2D slices + (slice(None), slice(1, 5)), + (slice(250, 350), slice(None)), + (slice(250, 350), slice(1, 5)), + (slice(250, 350), slice(-5, -1)), + (slice(250, 350), slice(-50, 50)), + (slice(250, 350, 10), slice(1, 5)), + (slice(250, 350), slice(1, 5, 2)), + (slice(250, 350, 33), slice(1, 5, 3)), + # total selections + (slice(None), slice(None)), + Ellipsis, + (), + (Ellipsis, slice(None)), + (Ellipsis, slice(None), slice(None)), +] + + +basic_selections_2d_bad = [ + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (2.3, slice(None)), + # only positive step supported + slice(None, None, -1), + (slice(None, None, -1), slice(None)), + (0, 0, 0), + (slice(None), slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_basic_selection_2d(store: StorePath): 
+ # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection in basic_selections_2d: + _test_get_basic_selection(a, z, selection) + + bad_selections = basic_selections_2d_bad + [ + # integer arrays + [0, 1], + (slice(None), [0, 1]), + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_basic_selection(selection) + # check fallback on fancy indexing + fancy_selection = ([0, 1], [0, 1]) + np.testing.assert_array_equal(z[fancy_selection], [0, 11]) + + +def test_fancy_indexing_fallback_on_get_setitem(store: StorePath): + z = zarr_array_from_numpy_array(store, np.zeros((20, 20))) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr_array_from_numpy_array(store, np.zeros(5)) + z2[[1, 2, 3]] = 1 + np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(None)), [[0, 1, 2], [3, 4, 5]]), + # List first, then slice + (([0, 1], slice(1, None)), [[1, 2], [4, 5]]), + # Slice first, then list + ((slice(0, 2), [0, 2]), [[0, 2], [3, 5]]), + # Slices only + ((slice(0, 2), slice(0, 2)), [[0, 1], [3, 4]]), + # List with repeated index + (([1, 0, 1], slice(1, None)), [[4, 5], [1, 2], [4, 5]]), + # 1D indexing + (([1, 0, 1]), [[3, 4, 5], [0, 1, 2], [3, 4, 5]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_2d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 2D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. 
+ """ + # [0, 1, 2], + # [3, 4, 5], + # [6, 7, 8] + a = np.arange(9).reshape(3, 3) + z = zarr_array_from_numpy_array(store, a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[[0, 1, 2], [3, 4, 5], [6, 7, 8]], [[9, 10, 11], [12, 13, 14], [15, 16, 17]]]), + # One slice, two integers + ((slice(0, 2), 1, 1), [4, 13]), + # One integer, two slices + ((slice(0, 2), 1, slice(0, 2)), [[3, 4], [12, 13]]), + # Two slices and a list + ((slice(0, 2), [1, 2], slice(0, 2)), [[[3, 4], [6, 7]], [[12, 13], [15, 16]]]), + ], +) +def test_orthogonal_indexing_fallback_on_getitem_3d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on __getitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # [[[ 0, 1, 2], + # [ 3, 4, 5], + # [ 6, 7, 8]], + + # [[ 9, 10, 11], + # [12, 13, 14], + # [15, 16, 17]], + + # [[18, 19, 20], + # [21, 22, 23], + # [24, 25, 26]]] + a = np.arange(27).reshape(3, 3, 3) + z = zarr_array_from_numpy_array(store, a) + + np.testing.assert_array_equal(z[index], a[index], err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[index], expected_result) + + +@pytest.mark.parametrize( + "index,expected_result", + [ + # Single iterable of integers + ([0, 1], [[1, 1, 1], [1, 1, 1], [0, 0, 0]]), + # List and slice combined + (([0, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Index repetition is ignored on setitem + (([0, 1, 1, 1, 1, 1, 1], slice(1, 3)), [[0, 1, 1], [0, 1, 1], [0, 0, 0]]), + # Slice with step + (([0, 2], slice(None, None, 2)), [[1, 0, 1], [0, 0, 0], [1, 0, 1]]), + ], +) +def test_orthogonal_indexing_fallback_on_setitem_2d(store: StorePath, index, expected_result): + """ + Tests the orthogonal indexing fallback on 
__setitem__ for a 3D matrix. + + In addition to checking expected behavior, all indexing + is also checked against numpy. + """ + # Slice + fancy index + a = np.zeros((3, 3)) + z = zarr_array_from_numpy_array(store, a) + z[index] = 1 + a[index] = 1 + np.testing.assert_array_equal(z, expected_result) + np.testing.assert_array_equal(z, a, err_msg="Indexing disagrees with numpy") + + +def test_fancy_indexing_doesnt_mix_with_implicit_slicing(store: StorePath): + z2 = zarr_array_from_numpy_array(store, np.zeros((5, 5, 5))) + with pytest.raises(IndexError): + z2[[1, 2, 3], [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[[1, 2, 3], [1, 2, 3]], 0) + with pytest.raises(IndexError): + z2[..., [1, 2, 3]] = 2 + with pytest.raises(IndexError): + np.testing.assert_array_equal(z2[..., [1, 2, 3]], 0) + + +@pytest.mark.xfail(reason="zero-dimension arrays are not supported in v3") +def test_set_basic_selection_0d(store: StorePath): + # setup + v = np.array(42) + a = np.zeros_like(v) + z = zarr_array_from_numpy_array(store, v) + assert_array_equal(a, z) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z) + z[...] = 0 + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + + # test structured array + value = (b"aaa", 1, 4.2) + v = np.array(value, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.zeros_like(v) + z = zarr_array_from_numpy_array(store, a) + + # tests + z.set_basic_selection(Ellipsis, v) + assert_array_equal(v, z) + z.set_basic_selection(Ellipsis, a) + assert_array_equal(a, z) + z[...] = v + assert_array_equal(v, z) + z[...] 
= a + assert_array_equal(a, z) + # with fields + z.set_basic_selection(Ellipsis, v["foo"], fields="foo") + assert v["foo"] == z["foo"] + assert a["bar"] == z["bar"] + assert a["baz"] == z["baz"] + z["bar"] = v["bar"] + assert v["foo"] == z["foo"] + assert v["bar"] == z["bar"] + assert a["baz"] == z["baz"] + # multiple field assignment not supported + with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v[["foo", "bar"]], fields=["foo", "bar"]) + with pytest.raises(IndexError): + z[..., "foo", "bar"] = v[["foo", "bar"]] + + +def _test_get_orthogonal_selection(a, z, selection): + expect = oindex(a, selection) + actual = z.get_orthogonal_selection(selection) + assert_array_equal(expect, actual) + actual = z.oindex[selection] + assert_array_equal(expect, actual) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_bool(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_orthogonal_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.oindex[np.zeros(50, dtype=bool)] # too short + with pytest.raises(IndexError): + z.oindex[np.zeros(2000, dtype=bool)] # too long + with pytest.raises(IndexError): + z.oindex[[[True, False], [False, True]]] # too many dimensions + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_1d_int(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + # unordered + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_get_orthogonal_selection(a, z, ix) + # increasing + ix.sort() + _test_get_orthogonal_selection(a, z, ix) + # decreasing + 
ix = ix[::-1] + _test_get_orthogonal_selection(a, z, ix) + + selections = basic_selections_1d + [ + # test wraparound + [0, 3, 10, -23, -12, -1], + # explicit test not sorted + [3, 105, 23, 127], + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + bad_selections = basic_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + [[2, 4], [6, 8]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (ix0, slice(1, 5, 2)), + (slice(250, 350), ix1), + (slice(250, 350, 10), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +# noinspection PyStatementEffect +def test_get_orthogonal_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + # mixed int array / bool array + selections = ( + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ) + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0.sort() + ix1.sort() + 
_test_get_orthogonal_selection_2d(a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_get_orthogonal_selection_2d(a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_get_orthogonal_selection(a, z, selection) + + for selection in basic_selections_2d_bad: + with pytest.raises(IndexError): + z.get_orthogonal_selection(selection) + with pytest.raises(IndexError): + z.oindex[selection] + + +def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2): + selections = [ + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with array + (ix0, ix1, ix2), + # mixed indexing with single array / slices + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, slice(15, 25, 5), slice(1, 5, 2)), + (slice(50, 70, 3), ix1, slice(1, 5, 2)), + (slice(50, 70, 3), slice(15, 25, 5), ix2), + # mixed indexing with single array / ints + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + # mixed indexing with single array / slice / int + (ix0, slice(15, 25), 4), + (42, ix1, slice(1, 5)), + (slice(50, 70), 42, ix2), + # mixed indexing with two array / slice + (ix0, ix1, slice(1, 5)), + (slice(50, 70), ix1, ix2), + (ix0, slice(15, 25), ix2), + # mixed indexing with two array / integer + (ix0, ix1, 4), + (42, ix1, ix2), + (ix0, 42, ix2), + ] + for selection in selections: + _test_get_orthogonal_selection(a, z, selection) + + +def test_get_orthogonal_selection_3d(store: StorePath): + # setup + a = np.arange(100000, dtype=int).reshape(200, 50, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(60, 20, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = 
np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0.sort() + ix1.sort() + ix2.sort() + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_edge_cases(store: StorePath): + a = np.arange(6).reshape(1, 2, 3) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(1, 2, 3)) + + expect = oindex(a, (0, slice(None), [0, 1, 2])) + actual = z.oindex[0, :, [0, 1, 2]] + assert_array_equal(expect, actual) + + expect = oindex(a, (0, slice(None), [True, True, True])) + actual = z.oindex[0, :, [True, True, True]] + assert_array_equal(expect, actual) + + +def _test_set_orthogonal_selection(v, a, z, selection): + for value in 42, oindex(v, selection), oindex(v, selection).tolist(): + if isinstance(value, list) and value == []: + # skip these cases as cannot preserve all dimensions + continue + # setup expectation + a[:] = 0 + oindex_set(a, selection, value) + # long-form API + z[:] = 0 + z.set_orthogonal_selection(selection, value) + assert_array_equal(a, z[:]) + # short-form API + z[:] = 0 + z.oindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_orthogonal_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + # test with different degrees of sparseness + np.random.seed(42) + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_orthogonal_selection(v, a, z, ix) + + # integer arrays + ix = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + _test_set_orthogonal_selection(v, a, 
z, ix) + ix.sort() + _test_set_orthogonal_selection(v, a, z, ix) + ix = ix[::-1] + _test_set_orthogonal_selection(v, a, z, ix) + + # basic selections + for selection in basic_selections_1d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1): + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice or int + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + (ix0, 4), + (42, ix1), + ] + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0.sort() + ix1.sort() + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + ix0 = ix0[::-1] + ix1 = ix1[::-1] + _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1) + + for selection in basic_selections_2d: + _test_set_orthogonal_selection(v, a, z, selection) + + +def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2): + selections = ( + # single value + (84, 42, 4), + (-1, -1, -1), + # index all axes with bool array + (ix0, ix1, ix2), + # mixed indexing with single bool array / slice or int + (ix0, slice(15, 25), slice(1, 5)), + (slice(50, 70), ix1, slice(1, 5)), + (slice(50, 70), slice(15, 25), ix2), + (ix0, 42, 4), + (84, ix1, 4), + (84, 42, ix2), + (ix0, 
slice(15, 25), 4), + (slice(50, 70), ix1, 4), + (slice(50, 70), 42, ix2), + # indexing with two arrays / slice + (ix0, ix1, slice(1, 5)), + # indexing with two arrays / integer + (ix0, ix1, 4), + ) + for selection in selections: + _test_set_orthogonal_selection(v, a, z, selection) + + +def test_set_orthogonal_selection_3d(store: StorePath): + # setup + v = np.arange(100000, dtype=int).reshape(200, 50, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(60, 20, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + # boolean arrays + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + ix2 = np.random.binomial(1, 0.5, size=a.shape[2]).astype(bool) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # integer arrays + ix0 = np.random.choice(a.shape[0], size=int(a.shape[0] * p), replace=True) + ix1 = np.random.choice(a.shape[1], size=int(a.shape[1] * 0.5), replace=True) + ix2 = np.random.choice(a.shape[2], size=int(a.shape[2] * 0.5), replace=True) + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted increasing + ix0.sort() + ix1.sort() + ix2.sort() + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + # sorted decreasing + ix0 = ix0[::-1] + ix1 = ix1[::-1] + ix2 = ix2[::-1] + _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2) + + +def test_orthogonal_indexing_fallback_on_get_setitem(store: StorePath): + z = zarr_array_from_numpy_array(store, np.zeros((20, 20))) + z[[1, 2, 3], [1, 2, 3]] = 1 + np.testing.assert_array_equal( + z[:4, :4], + [ + [0, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1], + ], + ) + np.testing.assert_array_equal(z[[1, 2, 3], [1, 2, 3]], 1) + # test broadcasting + np.testing.assert_array_equal(z[1, [1, 2, 3]], [1, 0, 0]) + # test 1D fancy indexing + z2 = zarr_array_from_numpy_array(store, np.zeros(5)) + z2[[1, 2, 3]] = 1 + 
np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + + +def _test_get_coordinate_selection(a, z, selection): + expect = a[selection] + actual = z.get_coordinate_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +coordinate_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_coordinate_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = np.random.choice(a.shape[0], size=n, replace=True) + _test_get_coordinate_selection(a, z, ix) + ix.sort() + _test_get_coordinate_selection(a, z, ix) + ix = ix[::-1] + _test_get_coordinate_selection(a, z, ix) + + selections = [ + # test single item + 42, + -1, + # test wraparound + [0, 3, 10, -23, -12, -1], + # test out of order + [3, 105, 23, 127], # not monotonically increasing + # test multi-dimensional selection + np.array([[2, 4], [6, 8]]), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # test errors + bad_selections = coordinate_selections_1d_bad + [ + [a.shape[0] + 1], # out of bounds + [-(a.shape[0] + 1)], # out of bounds + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +def test_get_coordinate_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = 
np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # single value + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + (42, 4), + ] + for selection in selections: + _test_get_coordinate_selection(a, z, selection) + + # not monotonically increasing (first dim) + ix0 = [3, 3, 4, 2, 5] + ix1 = [1, 3, 5, 7, 9] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # not monotonically increasing (second dim) + ix0 = [1, 1, 2, 2, 5] + ix1 = [1, 3, 2, 1, 0] + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + # multi-dimensional selection + ix0 = np.array([[1, 1, 2], [2, 2, 5]]) + ix1 = np.array([[1, 3, 2], [1, 0, 0]]) + _test_get_coordinate_selection(a, z, (ix0, ix1)) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = [1, 2, 3], slice(5, 15) + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_coordinate_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis + z.get_coordinate_selection(selection) + + +def _test_set_coordinate_selection(v, a, z, selection): + for value in 42, v[selection], v[selection].tolist(): + # setup expectation + a[:] = 0 + a[selection] = value + # test long-form API + z[:] = 0 + z.set_coordinate_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.vindex[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_coordinate_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix = 
np.random.choice(a.shape[0], size=n, replace=True) + _test_set_coordinate_selection(v, a, z, ix) + + # multi-dimensional selection + ix = np.array([[2, 4], [6, 8]]) + _test_set_coordinate_selection(v, a, z, ix) + + for selection in coordinate_selections_1d_bad: + with pytest.raises(IndexError): + z.set_coordinate_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_coordinate_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 2, 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + + selections = ( + (42, 4), + (-1, -1), + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ) + for selection in selections: + _test_set_coordinate_selection(v, a, z, selection) + + # multi-dimensional selection + ix0 = np.array([[1, 2, 3], [4, 5, 6]]) + ix1 = np.array([[1, 3, 2], [2, 0, 5]]) + _test_set_coordinate_selection(v, a, z, (ix0, ix1)) + + +def _test_get_block_selection(a, z, selection, expected_idx): + expect = a[expected_idx] + actual = z.get_block_selection(selection) + assert_array_equal(expect, actual) + actual = z.blocks[selection] + assert_array_equal(expect, actual) + + +block_selections_1d = [ + # test single item + 0, + 5, + # test wraparound + -1, + -4, + # test slice + slice(5), + slice(None, 3), + slice(5, 6), + slice(-3, -1), + slice(None), # Full slice +] + +block_selections_1d_array_projection = [ + # test single item + slice(100), + slice(500, 600), + # test wraparound + slice(1000, None), + slice(700, 800), + # test slice + slice(500), + slice(None, 300), + slice(500, 600), + slice(800, 1000), + slice(None), +] + +block_selections_1d_bad = [ + # 
slice not supported + slice(3, 8, 2), + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), + [0, 5, 3], +] + + +def test_get_block_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection, expected_idx in zip( + block_selections_1d, block_selections_1d_array_projection, strict=True + ): + _test_get_block_selection(a, z, selection, expected_idx) + + bad_selections = block_selections_1d_bad + [ + z.metadata.chunk_grid.get_nchunks(z.shape) + 1, # out of bounds + -(z.metadata.chunk_grid.get_nchunks(z.shape) + 1), # out of bounds + ] + + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_block_selection(selection) + with pytest.raises(IndexError): + z.blocks[selection] + + +block_selections_2d = [ + # test single item + (0, 0), + (1, 2), + # test wraparound + (-1, -1), + (-3, -2), + # test slice + (slice(1), slice(2)), + (slice(None, 2), slice(-2, -1)), + (slice(2, 3), slice(-2, None)), + (slice(-3, -1), slice(-3, -2)), + (slice(None), slice(None)), # Full slice +] + +block_selections_2d_array_projection = [ + # test single item + (slice(300), slice(3)), + (slice(300, 600), slice(6, 9)), + # test wraparound + (slice(900, None), slice(9, None)), + (slice(300, 600), slice(6, 9)), + # test slice + (slice(300), slice(6)), + (slice(None, 600), slice(6, 9)), + (slice(600, 900), slice(6, None)), + (slice(300, 900), slice(3, 6)), + (slice(None), slice(None)), # Full slice +] + + +def test_get_block_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection, expected_idx in zip( + block_selections_2d, block_selections_2d_array_projection, strict=True + ): + _test_get_block_selection(a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + 
z.get_block_selection(selection) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.get_block_selection(selection) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + z.get_block_selection(selection) + + +def _test_set_block_selection(v: np.ndarray, a: np.ndarray, z: zarr.Array, selection, expected_idx): + for value in 42, v[expected_idx], v[expected_idx].tolist(): + # setup expectation + a[:] = 0 + a[expected_idx] = value + # test long-form API + z[:] = 0 + z.set_block_selection(selection, value) + assert_array_equal(a, z[:]) + # test short-form API + z[:] = 0 + z.blocks[selection] = value + assert_array_equal(a, z[:]) + + +def test_set_block_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + for selection, expected_idx in zip( + block_selections_1d, block_selections_1d_array_projection, strict=True + ): + _test_set_block_selection(v, a, z, selection, expected_idx) + + for selection in block_selections_1d_bad: + with pytest.raises(IndexError): + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + z.blocks[selection] = 42 + + +def test_set_block_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty(v.shape, dtype=v.dtype) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + for selection, expected_idx in zip( + block_selections_2d, block_selections_2d_array_projection, strict=True + ): + _test_set_block_selection(v, a, z, selection, expected_idx) + + with pytest.raises(IndexError): + selection = slice(5, 15), [1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): + selection = Ellipsis, [1, 2, 3] + z.set_block_selection(selection, 42) + with pytest.raises(IndexError): # out of bounds + selection = slice(15, 20), slice(None) + 
z.set_block_selection(selection, 42) + + +def _test_get_mask_selection(a, z, selection): + expect = a[selection] + actual = z.get_mask_selection(selection) + assert_array_equal(expect, actual) + actual = z.vindex[selection] + assert_array_equal(expect, actual) + + +mask_selections_1d_bad = [ + # slice not supported + slice(5, 15), + slice(None), + Ellipsis, + # bad stuff + 2.3, + "foo", + b"xxx", + None, + (0, 0), + (slice(None), slice(None)), +] + + +# noinspection PyStatementEffect +def test_get_mask_selection_1d(store: StorePath): + # setup + a = np.arange(1050, dtype=int) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_get_mask_selection(a, z, ix) + + # test errors + bad_selections = mask_selections_1d_bad + [ + np.zeros(50, dtype=bool), # too short + np.zeros(2000, dtype=bool), # too long + [[True, False], [False, True]], # too many dimensions + ] + for selection in bad_selections: + with pytest.raises(IndexError): + z.get_mask_selection(selection) + with pytest.raises(IndexError): + z.vindex[selection] + + +# noinspection PyStatementEffect +def test_get_mask_selection_2d(store: StorePath): + # setup + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_get_mask_selection(a, z, ix) + + # test errors + with pytest.raises(IndexError): + z.vindex[np.zeros((1000, 5), dtype=bool)] # too short + with pytest.raises(IndexError): + z.vindex[np.zeros((2000, 10), dtype=bool)] # too long + with pytest.raises(IndexError): + z.vindex[[True, False]] # wrong no. 
dimensions + + +def _test_set_mask_selection(v, a, z, selection): + a[:] = 0 + z[:] = 0 + a[selection] = v[selection] + z.set_mask_selection(selection, v[selection]) + assert_array_equal(a, z[:]) + z[:] = 0 + z.vindex[selection] = v[selection] + assert_array_equal(a, z[:]) + + +def test_set_mask_selection_1d(store: StorePath): + # setup + v = np.arange(1050, dtype=int) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + _test_set_mask_selection(v, a, z, ix) + + for selection in mask_selections_1d_bad: + with pytest.raises(IndexError): + z.set_mask_selection(selection, 42) + with pytest.raises(IndexError): + z.vindex[selection] = 42 + + +def test_set_mask_selection_2d(store: StorePath): + # setup + v = np.arange(10000, dtype=int).reshape(1000, 10) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + ix = np.random.binomial(1, p, size=a.size).astype(bool).reshape(a.shape) + _test_set_mask_selection(v, a, z, ix) + + +def test_get_selection_out(store: StorePath): + # basic selections + a = np.arange(1050) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + + selections = [ + slice(50, 150), + slice(0, 1050), + slice(1, 2), + ] + for selection in selections: + expect = a[selection] + out = zarr_array_from_numpy_array(store, np.zeros(expect.shape), chunk_shape=(10,)) + z.get_basic_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + with pytest.raises(TypeError): + z.get_basic_selection(Ellipsis, out=[]) + + # orthogonal selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + np.random.seed(42) + # test with different degrees of sparseness + 
for p in 0.5, 0.1, 0.01: + ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool) + ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / slice + (ix0, slice(1, 5)), + (slice(250, 350), ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + # mixed int array / bool array + (ix0, np.nonzero(ix1)[0]), + (np.nonzero(ix0)[0], ix1), + ] + for selection in selections: + expect = oindex(a, selection) + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_orthogonal_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + # coordinate selections + a = np.arange(10000, dtype=int).reshape(1000, 10) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(300, 3)) + np.random.seed(42) + # test with different degrees of sparseness + for p in 0.5, 0.1, 0.01: + n = int(a.size * p) + ix0 = np.random.choice(a.shape[0], size=n, replace=True) + ix1 = np.random.choice(a.shape[1], size=n, replace=True) + selections = [ + # index both axes with array + (ix0, ix1), + # mixed indexing with array / int + (ix0, 4), + (42, ix1), + ] + for selection in selections: + expect = a[selection] + out = np.zeros(expect.shape, dtype=expect.dtype) + z.get_coordinate_selection(selection, out=out) + assert_array_equal(expect, out[:]) + + +def test_get_selections_with_fields(store: StorePath): + a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + a = np.array(a, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(2,)) + + fields_fixture = [ + "foo", + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # total selection + expect = a[fields] + actual = z.get_basic_selection(Ellipsis, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + 
actual = z[fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[fields[0], fields[1]] + assert_array_equal(expect, actual) + if isinstance(fields, str): + actual = z[..., fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[..., fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with slice + expect = a[fields][0:2] + actual = z.get_basic_selection(slice(0, 2), fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[0:2, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[0:2, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # basic selection with single item + expect = a[fields][1] + actual = z.get_basic_selection(1, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z[1, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z[1, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # orthogonal selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_orthogonal_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.oindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.oindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # coordinate selection + ix = [0, 2] + expect = a[fields][ix] + actual = z.get_coordinate_selection(ix, fields=fields) + assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # mask selection + ix = [True, False, True] + expect = a[fields][ix] + actual = z.get_mask_selection(ix, fields=fields) + 
assert_array_equal(expect, actual) + # alternative API + if isinstance(fields, str): + actual = z.vindex[ix, fields] + assert_array_equal(expect, actual) + elif len(fields) == 2: + actual = z.vindex[ix, fields[0], fields[1]] + assert_array_equal(expect, actual) + + # missing/bad fields + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=["notafield"]) + with pytest.raises(IndexError): + z.get_basic_selection(Ellipsis, fields=slice(None)) + + +def test_set_selections_with_fields(store: StorePath): + v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] + v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) + a = np.empty_like(v) + z = zarr_array_from_numpy_array(store, v, chunk_shape=(2,)) + + fields_fixture = [ + "foo", + [], + ["foo"], + ["foo", "bar"], + ["foo", "baz"], + ["bar", "baz"], + ["foo", "bar", "baz"], + ["bar", "foo"], + ["baz", "bar", "foo"], + ] + + for fields in fields_fixture: + # currently multi-field assignment is not supported in numpy, so we won't support + # it either + if isinstance(fields, list) and len(fields) > 1: + with pytest.raises(IndexError): + z.set_basic_selection(Ellipsis, v, fields=fields) + with pytest.raises(IndexError): + z.set_orthogonal_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_coordinate_selection([0, 2], v, fields=fields) + with pytest.raises(IndexError): + z.set_mask_selection([True, False, True], v, fields=fields) + + else: + if isinstance(fields, list) and len(fields) == 1: + # work around numpy does not support multi-field assignment even if there + # is only one field + key = fields[0] + elif isinstance(fields, list) and len(fields) == 0: + # work around numpy ambiguity about what is a field selection + key = Ellipsis + else: + key = fields + + # setup expectation + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + assert_array_equal(a, z[:]) + a[key] = v[key] + # total selection + z.set_basic_selection(Ellipsis, v[key], fields=fields) + 
assert_array_equal(a, z[:]) + + # basic selection with slice + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + a[key][0:2] = v[key][0:2] + z.set_basic_selection(slice(0, 2), v[key][0:2], fields=fields) + assert_array_equal(a, z[:]) + + # orthogonal selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_orthogonal_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # coordinate selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [0, 2] + a[key][ix] = v[key][ix] + z.set_coordinate_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + # mask selection + a[:] = ("", 0, 0) + z[:] = ("", 0, 0) + ix = [True, False, True] + a[key][ix] = v[key][ix] + z.set_mask_selection(ix, v[key][ix], fields=fields) + assert_array_equal(a, z[:]) + + +def test_slice_selection_uints(): + arr = np.arange(24).reshape((4, 6)) + idx = np.uint64(3) + slice_sel = make_slice_selection((idx,)) + assert arr[tuple(slice_sel)].shape == (1, 6) + + +def test_numpy_int_indexing(store: StorePath): + a = np.arange(1050) + z = zarr_array_from_numpy_array(store, a, chunk_shape=(100,)) + assert a[42] == z[42] + assert a[np.int64(42)] == z[np.int64(42)] + + +@pytest.mark.parametrize( + "shape, chunks, ops", + [ + # 1D test cases + ((1070,), (50,), [("__getitem__", (slice(200, 400),))]), + ((1070,), (50,), [("__getitem__", (slice(200, 400, 100),))]), + ( + (1070,), + (50,), + [ + ("__getitem__", (slice(200, 400),)), + ("__setitem__", (slice(200, 400, 100),)), + ], + ), + # 2D test cases + ( + (40, 50), + (5, 8), + [ + ("__getitem__", (slice(6, 37, 13), (slice(4, 10)))), + ("__setitem__", (slice(None), (slice(None)))), + ], + ), + ], +) +def test_accessed_chunks(shape, chunks, ops): + # Test that only the required chunks are accessed during basic selection operations + # shape: array shape + # chunks: chunk size + # ops: list of tuples with (optype, tuple of slices) + # optype = "__getitem__" or "__setitem__", tuple length must 
match number of dims + import itertools + + # Use a counting dict as the backing store so we can track the items access + store = CountingDict() + z = zarr_array_from_numpy_array(StorePath(store), np.zeros(shape), chunk_shape=chunks) + + for ii, (optype, slices) in enumerate(ops): + # Resolve the slices into the accessed chunks for each dimension + chunks_per_dim = [] + for N, C, sl in zip(shape, chunks, slices, strict=True): + chunk_ind = np.arange(N, dtype=int)[sl] // C + chunks_per_dim.append(np.unique(chunk_ind)) + + # Combine and generate the cartesian product to determine the chunks keys that + # will be accessed + chunks_accessed = [] + for comb in itertools.product(*chunks_per_dim): + chunks_accessed.append(".".join([str(ci) for ci in comb])) + + counts_before = store.counter.copy() + + # Perform the operation + if optype == "__getitem__": + z[slices] + else: + z[slices] = ii + + # Get the change in counts + delta_counts = store.counter - counts_before + + # Check that the access counts for the operation have increased by one for all + # the chunks we expect to be included + for ci in chunks_accessed: + assert delta_counts.pop((optype, ci)) == 1 + + # If the chunk was partially written to it will also have been read once. 
We + # don't determine if the chunk was actually partial here, just that the + # counts are consistent that this might have happened + if optype == "__setitem__": + assert ("__getitem__", ci) not in delta_counts or delta_counts.pop( + ("__getitem__", ci) + ) == 1 + # Check that no other chunks were accessed + assert len(delta_counts) == 0 diff --git a/tests/v3/util.py b/tests/v3/util.py new file mode 100644 index 0000000000..7c47ff578e --- /dev/null +++ b/tests/v3/util.py @@ -0,0 +1,64 @@ +import collections +import os +import tempfile + +import pytest + +from zarr.store.memory import MemoryStore + + +class CountingDict(MemoryStore): + def __init__(self): + super().__init__() + self.counter = collections.Counter() + + async def get(self, key, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__getitem__", key_suffix] += 1 + return await super().get(key, byte_range) + + async def set(self, key, value, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__setitem__", key_suffix] += 1 + return await super().set(key, value, byte_range) + + +def skip_test_env_var(name): + """Checks for environment variables indicating whether tests requiring services should be run""" + value = os.environ.get(name, "0") + return pytest.mark.skipif(value == "0", reason="Tests not enabled via environment variable") + + +try: + import fsspec # noqa: F401 + + have_fsspec = True +except ImportError: # pragma: no cover + have_fsspec = False + + +def abs_container(): + from azure.core.exceptions import ResourceExistsError + import azure.storage.blob as asb + + URL = "http://127.0.0.1:10000" + ACCOUNT_NAME = "devstoreaccount1" + KEY = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + CONN_STR = ( + f"DefaultEndpointsProtocol=http;AccountName={ACCOUNT_NAME};" + f"AccountKey={KEY};BlobEndpoint={URL}/{ACCOUNT_NAME};" + ) + + blob_service_client = asb.BlobServiceClient.from_connection_string(CONN_STR) + 
try: + container_client = blob_service_client.create_container("test") + except ResourceExistsError: + container_client = blob_service_client.get_container_client("test") + + return container_client + + +def mktemp(**kwargs): + f = tempfile.NamedTemporaryFile(**kwargs) + f.close() + return f.name From fd2c70ddb3e72d1686f2a91444a09e37a4238730 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 May 2024 21:38:45 +0200 Subject: [PATCH 02/13] fixes tests --- src/zarr/abc/codec.py | 2 + src/zarr/array.py | 131 ++++++++++++++++------------- src/zarr/buffer.py | 9 +- src/zarr/codecs/pipeline.py | 40 +++++++-- src/zarr/indexing2.py | 160 +++++++++++++++++++++--------------- tests/v3/test_indexing.py | 47 ++++++----- tests/v3/util.py | 2 +- 7 files changed, 241 insertions(+), 150 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 028d1757ce..9dfb939cca 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -344,6 +344,7 @@ async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], out: NDBuffer, + drop_axes: tuple[int, ...] | None = None, ) -> None: """Reads chunk data from the store, decodes it and writes it into an output array. Partial decoding may be utilized if the codecs and stores support it. @@ -365,6 +366,7 @@ async def write( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], value: NDBuffer, + drop_axes: tuple[int, ...] | None = None, ) -> None: """Encodes chunk data and writes it to the store. Merges with existing chunk data by reading first, if necessary. 
diff --git a/src/zarr/array.py b/src/zarr/array.py index 9d297da442..d81d3061f4 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -47,8 +47,10 @@ OIndex, OrthogonalIndexer, VIndex, + check_no_multi_fields, is_pure_fancy_indexing, is_pure_orthogonal_indexing, + is_scalar, pop_fields, ) from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, parse_indexing_order @@ -377,19 +379,35 @@ async def _get_selection( self, indexer: Indexer, *, - factory: Factory.Create = NDBuffer.create, + out: NDArrayLike | NDBuffer | None = None, + create_factory: Factory.Create = NDBuffer.create, + from_factory: Factory.Create = NDBuffer.from_ndarray_like, fields: Fields | None = None, ) -> NDArrayLike: # check fields are sensible - out_dtype = check_fields(fields, self.metadata.dtype) - - # setup output array - out = factory( - shape=indexer.shape, - dtype=out_dtype, - order=self.order, - fill_value=self.metadata.fill_value, - ) + out_dtype = check_fields(fields, self.dtype) + + # setup output buffer + if out is not None: + if isinstance(out, NDArrayLike): + out_buffer = NDBuffer.from_ndarray_like(out) + elif isinstance(out, NDBuffer): + out_buffer = out + else: + raise TypeError( + f"out argument needs to be either an ndarray or an NDBuffer. Got {type(out)!r}" + ) + if out_buffer.shape != indexer.shape: + raise ValueError( + f"shape of out argument doesn't match. 
Expected {indexer.shape}, got {out.shape}" + ) + else: + out_buffer = create_factory( + shape=indexer.shape, + dtype=out_dtype, + order=self.order, + fill_value=self.metadata.fill_value, + ) if math.prod(indexer.shape) > 0: # reading chunks and decoding them await self.metadata.codec_pipeline.read( @@ -402,9 +420,10 @@ async def _get_selection( ) for chunk_coords, chunk_selection, out_selection in indexer ], - out, + out_buffer, + drop_axes=indexer.drop_axes, ) - return out.as_ndarray_like() + return out_buffer.as_ndarray_like() async def getitem( self, selection: Selection, *, factory: Factory.Create = NDBuffer.create @@ -429,7 +448,9 @@ async def _set_selection( factory: Factory.NDArrayLike = NDBuffer.from_ndarray_like, fields: Fields | None = None, ) -> None: - sel_shape = indexer.shape + # check fields are sensible + check_fields(fields, self.dtype) + fields = check_no_multi_fields(fields) # check value shape if np.isscalar(value): @@ -437,7 +458,9 @@ async def _set_selection( else: if not hasattr(value, "shape"): value = np.asarray(value, self.metadata.dtype) - assert value.shape == sel_shape + # assert ( + # value.shape == indexer.shape + # ), f"shape of value doesn't match indexer shape. 
Expected {indexer.shape}, got {value.shape}" if value.dtype.name != self.metadata.dtype.name: value = value.astype(self.metadata.dtype, order="A") @@ -458,6 +481,7 @@ async def _set_selection( for chunk_coords, chunk_selection, out_selection in indexer ], value_buffer, + drop_axes=indexer.drop_axes, ) async def setitem( @@ -640,74 +664,69 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: self.set_basic_selection(pure_selection, value, fields=fields) def get_basic_selection( - self, selection: Selection = Ellipsis, out=None, fields: Fields | None = None + self, + selection: Selection = Ellipsis, + out: NDArrayLike | NDBuffer | None = None, + fields: Fields | None = None, ) -> NDArrayLike: - check_fields(fields, self.dtype) if self.shape == (): raise NotImplementedError else: return sync( self._async_array._get_selection( - BasicIndexer(selection, self.shape, self.metadata.chunk_grid) + BasicIndexer(selection, self.shape, self.metadata.chunk_grid), + out=out, + fields=fields, ) ) def set_basic_selection( self, selection: Selection, value: NDArrayLike, fields: Fields | None = None ) -> None: - check_fields(fields, self.dtype) indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) - sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_orthogonal_selection( - self, selection: Selection, fields: Fields | None = None + self, + selection: Selection, + out: NDArrayLike | NDBuffer | None = None, + fields: Fields | None = None, ) -> NDArrayLike: - check_fields(fields, self.dtype) - - # setup indexer indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return sync(self._async_array._get_selection(indexer=indexer)) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) def set_orthogonal_selection( self, selection: Selection, value: NDArrayLike, fields: Fields | None = None ) -> None: indexer = OrthogonalIndexer(selection, self.shape, 
self.metadata.chunk_grid) - return sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_mask_selection( - self, mask: npt.NDArray[Any], fields: Fields | None = None + self, + mask: npt.NDArray[Any], + out: NDArrayLike | NDBuffer | None = None, + fields: Fields | None = None, ) -> NDArrayLike: - # check args - check_fields(fields, self.dtype) - - # setup indexer indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - - return sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) def set_mask_selection( self, mask: npt.NDArray[Any], value: NDArrayLike, fields: Fields | None = None ) -> None: - # setup indexer indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_coordinate_selection( - self, selection: CoordinateSelection, fields: Fields | None = None + self, + selection: CoordinateSelection, + out: NDArrayLike | NDBuffer | None = None, + fields: Fields | None = None, ) -> NDArrayLike: - check_fields(fields, self.dtype) - - # setup indexer indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - - out = sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + out = sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) # restore shape out = out.reshape(indexer.sel_shape) - return out def set_coordinate_selection( @@ -716,35 +735,33 @@ def set_coordinate_selection( # setup indexer indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - # # handle value - need ndarray-like flatten value - # if not is_scalar(value, self._dtype): - # try: - # value = ensure_ndarray_like(value) - # except TypeError: - # # Handle types like `list` or `tuple` - # value = np.array(value, like=self._meta_array) + # handle value - need ndarray-like flatten value + if not 
is_scalar(value, self.dtype): + try: + from numcodecs.compat import ensure_ndarray_like + + value = ensure_ndarray_like(value) # TODO replace with agnostic + except TypeError: + # Handle types like `list` or `tuple` + value = np.array(value) # TODO replace with agnostic if hasattr(value, "shape") and len(value.shape) > 1: value = value.reshape(-1) sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_block_selection( - self, selection: Selection, fields: Fields | None = None + self, + selection: Selection, + out: NDArrayLike | NDBuffer | None = None, + fields: Fields | None = None, ) -> NDArrayLike: - # check args - check_fields(fields, self.dtype) - - # setup indexer indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) - - return sync(self._async_array._get_selection(indexer=indexer, fields=fields)) + return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) def set_block_selection( self, selection: Selection, value: npt.NDArray[Any], fields: Fields | None = None ) -> None: - # setup indexer indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) - sync(self._async_array._set_selection(indexer, value, fields=fields)) @property diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index 0f055093c1..a6dc7f8fe4 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -304,7 +304,7 @@ class NDBuffer: """ def __init__(self, array: NDArrayLike): - assert array.ndim > 0 + # assert array.ndim > 0 assert array.dtype != object self._data = array @@ -421,6 +421,10 @@ def byteorder(self) -> Endian: def reshape(self, newshape: ChunkCoords) -> Self: return self.__class__(self._data.reshape(newshape)) + def squeeze(self, axis: tuple[int, ...]) -> Self: + newshape = tuple(a for i, a in enumerate(self.shape) if i not in axis) + return self.__class__(self._data.reshape(newshape)) + def astype(self, dtype: npt.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") -> Self: return 
self.__class__(self._data.astype(dtype=dtype, order=order)) @@ -435,6 +439,9 @@ def __setitem__(self, key: Any, value: Any) -> None: def __len__(self) -> int: return self._data.__len__() + def __repr__(self) -> str: + return f"" + def all_equal(self, other: Any) -> bool: return bool((self._data == other).all()) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 893cbc8b4b..398bbd0583 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, TypeVar from warnings import warn +from numpy import newaxis + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -22,6 +24,7 @@ from zarr.common import JSON, concurrent_map, parse_named_configuration from zarr.config import config from zarr.indexing import is_total_slice +from zarr.indexing2 import is_scalar from zarr.metadata import ArrayMetadata if TYPE_CHECKING: @@ -293,6 +296,7 @@ async def read_batch( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], out: NDBuffer, + drop_axes: tuple[int, ...] | None = None, ) -> None: if self.supports_partial_decode: chunk_array_batch = await self.decode_partial_batch( @@ -327,6 +331,8 @@ async def read_batch( ): if chunk_array is not None: tmp = chunk_array[chunk_selection] + if drop_axes: + tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: out[out_selection] = chunk_spec.fill_value @@ -335,6 +341,7 @@ async def write_batch( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], value: NDBuffer, + drop_axes: tuple[int, ...] 
| None = None, ) -> None: if self.supports_partial_encode: await self.encode_partial_batch( @@ -371,12 +378,16 @@ async def _read_key(byte_setter: ByteSetter | None) -> Buffer | None: def _merge_chunk_array( existing_chunk_array: NDBuffer | None, - new_chunk_array_slice: NDBuffer, + value: NDBuffer, + out_selection: SliceSelection, chunk_spec: ArraySpec, chunk_selection: SliceSelection, ) -> NDBuffer: - if is_total_slice(chunk_selection, chunk_spec.shape): - return new_chunk_array_slice + if ( + is_total_slice(chunk_selection, chunk_spec.shape) + and value.shape == chunk_spec.shape + ): + return value if existing_chunk_array is None: chunk_array = NDBuffer.create( shape=chunk_spec.shape, @@ -386,11 +397,24 @@ def _merge_chunk_array( ) else: chunk_array = existing_chunk_array.copy() # make a writable copy - chunk_array[chunk_selection] = new_chunk_array_slice + if chunk_selection == (): + chunk_value = value + elif is_scalar(value.as_ndarray_like(), chunk_spec.dtype): + chunk_value = value + else: + chunk_value = value[out_selection] + # handle missing singleton dimensions + if drop_axes: + item = [slice(None)] * chunk_spec.ndim + for a in drop_axes: + item[a] = newaxis # TODO replace with agnostic newaxis + item = tuple(item) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value return chunk_array chunk_array_batch = [ - _merge_chunk_array(chunk_array, value[out_selection], chunk_spec, chunk_selection) + _merge_chunk_array(chunk_array, value, out_selection, chunk_spec, chunk_selection) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( chunk_array_batch, batch_info, strict=False ) @@ -453,10 +477,11 @@ async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], out: NDBuffer, + drop_axes: tuple[int, ...] 
| None = None, ) -> None: await concurrent_map( [ - (single_batch_info, out) + (single_batch_info, out, drop_axes) for single_batch_info in batched(batch_info, self.batch_size) ], self.read_batch, @@ -467,10 +492,11 @@ async def write( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], value: NDBuffer, + drop_axes: tuple[int, ...] | None = None, ) -> None: await concurrent_map( [ - (single_batch_info, value) + (single_batch_info, value, drop_axes) for single_batch_info in batched(batch_info, self.batch_size) ], self.write_batch, diff --git a/src/zarr/indexing2.py b/src/zarr/indexing2.py index 24a8e8715e..8852b3dabf 100644 --- a/src/zarr/indexing2.py +++ b/src/zarr/indexing2.py @@ -1,12 +1,12 @@ from __future__ import annotations -from enum import Enum import itertools import math import numbers import operator -from collections.abc import Iterable, Iterator +from collections.abc import Iterator from dataclasses import dataclass +from enum import Enum from functools import reduce from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable @@ -27,16 +27,17 @@ from zarr.buffer import NDArrayLike from zarr.chunk_grids import ChunkGrid -Selector = int | slice | npt.NDArray[Any] +Selector = int | slice | npt.NDArray[np.bool_] Selection = tuple[Selector, ...] +Fields = str | list | tuple @runtime_checkable class Indexer(Protocol): shape: ChunkCoords + drop_axes: ChunkCoords | None - def __iter__(self) -> Iterator[ChunkProjection]: - ... + def __iter__(self) -> Iterator[ChunkProjection]: ... 
def is_integer(x: Selector) -> bool: @@ -74,6 +75,8 @@ def is_bool_array(x: Selector, ndim: int | None = None) -> bool: def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: if np.isscalar(value): return True + if hasattr(value, "shape") and value.shape == (): + return True if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): return True return False @@ -205,6 +208,8 @@ def ceildiv(a: int, b: int) -> int: class SliceDimIndexer: dim_len: int dim_chunk_len: int + nitems: int + nchunks: int start: int stop: int @@ -321,7 +326,7 @@ def replace_lists(selection: Selection) -> Selection: ) -def ensure_tuple(v): +def ensure_tuple(v: Selector | tuple[Selector]) -> tuple[Selector, ...]: if not isinstance(v, tuple): v = (v,) return v @@ -370,7 +375,8 @@ def is_basic_selection(selection: Selection) -> bool: return all(is_integer(s) or is_positive_slice(s) for s in selection) -class BasicIndexer: +@dataclass(frozen=True) +class BasicIndexer(Indexer): dim_indexers: list[IntDimIndexer | SliceDimIndexer] shape: ChunkCoords drop_axes: None @@ -402,9 +408,13 @@ def __init__( dim_indexers.append(dim_indexer) - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.drop_axes = None + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__( + self, + "shape", + tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)), + ) + object.__setattr__(self, "drop_axes", None) def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): @@ -437,9 +447,7 @@ def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: # check shape if dim_sel.shape[0] != dim_len: raise IndexError( - "Boolean array has the wrong length for dimension; expected {}, got {}".format( - dim_len, dim_sel.shape[0] - ) + f"Boolean array has the wrong length for dimension; expected {dim_len}, got 
{dim_sel.shape[0]}" ) # precompute number of selected items for each chunk @@ -521,6 +529,7 @@ def boundscheck_indices(x, dim_len): raise BoundsCheckError(dim_len) +@dataclass(frozen=True) class IntArrayDimIndexer: """Integer array selection against a single dimension.""" @@ -549,6 +558,9 @@ def __init__( if not is_integer_array(dim_sel, 1): raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") + nitems = len(dim_sel) + nchunks = ceildiv(dim_len, dim_chunk_len) + # handle wraparound if wraparound: wraparound_indices(dim_sel, dim_len) @@ -557,12 +569,6 @@ def __init__( if boundscheck: boundscheck_indices(dim_sel, dim_len) - # store attributes - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len) - self.nitems = len(dim_sel) - # determine which chunk is needed for each selection item # note: for dense integer selections, the division operation here is the # bottleneck @@ -571,28 +577,40 @@ def __init__( # determine order of indices if order == Order.UNKNOWN: order = Order.check(dim_sel) - self.order = Order(order) + order = Order(order) - if self.order == Order.INCREASING: - self.dim_sel = dim_sel - self.dim_out_sel = None - elif self.order == Order.DECREASING: - self.dim_sel = dim_sel[::-1] + if order == Order.INCREASING: + dim_sel = dim_sel + dim_out_sel = None + elif order == Order.DECREASING: + dim_sel = dim_sel[::-1] # TODO should be possible to do this without creating an arange - self.dim_out_sel = np.arange(self.nitems - 1, -1, -1) + dim_out_sel = np.arange(nitems - 1, -1, -1) else: # sort indices to group by chunk - self.dim_out_sel = np.argsort(dim_sel_chunk) - self.dim_sel = np.take(dim_sel, self.dim_out_sel) + dim_out_sel = np.argsort(dim_sel_chunk) + dim_sel = np.take(dim_sel, dim_out_sel) # precompute number of selected items for each chunk - self.chunk_nitems = np.bincount(dim_sel_chunk, minlength=self.nchunks) + chunk_nitems = 
np.bincount(dim_sel_chunk, minlength=nchunks) # find chunks that we need to visit - self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0] + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] # compute offsets into the output array - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + + # store attributes + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "order", order) + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_out_sel", dim_out_sel) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) def __iter__(self) -> Iterator[ChunkDimProjection]: for dim_chunk_ix in self.dim_chunk_ixs: @@ -666,8 +684,8 @@ def oindex_set(a: npt.NDArray[Any], selection: Selection, value): a[selection] = value -# noinspection PyProtectedMember -class OrthogonalIndexer: +@dataclass(frozen=True) +class OrthogonalIndexer(Indexer): dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer] shape: ChunkCoords chunk_shape: ChunkCoords @@ -707,18 +725,24 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr dim_indexers.append(dim_indexer) - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.chunk_shape = chunk_shape - self.is_advanced = not is_basic_selection(selection) - if self.is_advanced: - self.drop_axes = tuple( + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers if not isinstance(s, IntDimIndexer)) + chunk_shape = chunk_shape + is_advanced = not is_basic_selection(selection) + if is_advanced: + drop_axes = tuple( i - for i, dim_indexer in 
enumerate(self.dim_indexers) + for i, dim_indexer in enumerate(dim_indexers) if isinstance(dim_indexer, IntDimIndexer) ) else: - self.drop_axes = None + drop_axes = None + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "is_advanced", is_advanced) + object.__setattr__(self, "drop_axes", drop_axes) def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): @@ -760,8 +784,8 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: return self.array.set_orthogonal_selection(selection, value, fields=fields) -# noinspection PyProtectedMember -class BlockIndexer: +@dataclass(frozen=True) +class BlockIndexer(Indexer): dim_indexers: list[SliceDimIndexer] shape: ChunkCoords drop_axes: None @@ -821,9 +845,12 @@ def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): if start >= dim_len or start < 0: raise BoundsCheckError(dim_len) - self.dim_indexers = dim_indexers - self.shape = tuple(s.nitems for s in self.dim_indexers) - self.drop_axes = None + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers) + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", None) def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): @@ -853,23 +880,24 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: return self.array.set_block_selection(selection, value, fields=fields) -# noinspection PyProtectedMember def is_coordinate_selection(selection: Selection, shape: ChunkCoords) -> bool: return (len(selection) == len(shape)) and all( is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection ) -# noinspection PyProtectedMember def is_mask_selection(selection: Selection, shape: 
ChunkCoords) -> bool: return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape -# noinspection PyProtectedMember -class CoordinateIndexer: +@dataclass(frozen=True) +class CoordinateIndexer(Indexer): sel_shape: ChunkCoords selection: Selection sel_sort: npt.NDArray[np.intp] | None + chunk_nitems_cumsum: npt.NDArray[np.intp] + chunk_rixs: npt.NDArray[np.intp] + chunk_mixs: tuple[npt.NDArray[np.intp], ...] shape: ChunkCoords chunk_shape: ChunkCoords drop_axes: None @@ -915,7 +943,7 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) # remember shape of selection, because we will flatten indices for processing - self.sel_shape = selection[0].shape if selection[0].shape else (1,) + sel_shape = selection[0].shape if selection[0].shape else (1,) # flatten selection selection = [dim_sel.reshape(-1) for dim_sel in selection] @@ -932,21 +960,26 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr else: sel_sort = None - # store attributes - self.selection = selection - self.sel_sort = sel_sort - self.shape = selection[0].shape if selection[0].shape else (1,) - self.chunk_shape = chunk_shape - self.drop_axes = None + shape = selection[0].shape if selection[0].shape else (1,) # precompute number of selected items for each chunk - self.chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) - self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) + chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) # locate the chunks we need to process - self.chunk_rixs = np.nonzero(self.chunk_nitems)[0] + chunk_rixs = np.nonzero(chunk_nitems)[0] # unravel chunk indices - self.chunk_mixs = np.unravel_index(self.chunk_rixs, cdata_shape) + chunk_mixs = np.unravel_index(chunk_rixs, cdata_shape) + + object.__setattr__(self, "sel_shape", sel_shape) + 
object.__setattr__(self, "selection", selection) + object.__setattr__(self, "sel_sort", sel_sort) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "chunk_rixs", chunk_rixs) + object.__setattr__(self, "chunk_mixs", chunk_mixs) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", None) def __iter__(self) -> Iterator[ChunkProjection]: # iterate over chunks @@ -974,7 +1007,7 @@ def __iter__(self) -> Iterator[ChunkProjection]: yield ChunkProjection(chunk_coords, chunk_selection, out_selection) -# noinspection PyProtectedMember +@dataclass(frozen=True) class MaskIndexer(CoordinateIndexer): def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): # some initial normalization @@ -1022,9 +1055,6 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: raise VindexInvalidSelectionError(selection) -Fields = str | list | tuple - - def check_fields(fields: Fields, dtype: npt.DTypeLike) -> npt.DTypeLike: # early out if fields is None: diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py index b46393451d..717fe733ca 100644 --- a/tests/v3/test_indexing.py +++ b/tests/v3/test_indexing.py @@ -31,7 +31,9 @@ def store() -> Iterator[Store]: def zarr_array_from_numpy_array( - store: StorePath, a: npt.NDArray[Any], chunk_shape: ChunkCoords | None = None + store: StorePath, + a: npt.NDArray[Any], + chunk_shape: ChunkCoords | None = None, ) -> zarr.Array: z = zarr.Array.create( store=store / str(uuid4()), @@ -104,10 +106,10 @@ def test_get_basic_selection_0d(store: StorePath): assert 42 == z.get_basic_selection(()) assert 42 == z[()] - # # test out param - # b = np.zeros_like(a) - # z.get_basic_selection(Ellipsis, out=b) - # assert_array_equal(a, b) + # test out param + b = np.zeros_like(a) + z.get_basic_selection(Ellipsis, out=b) + assert_array_equal(a, b) # test structured array value = (b"aaa", 1, 4.2) @@ 
-214,6 +216,11 @@ def _test_get_basic_selection(a, z, selection): actual = z[selection] assert_array_equal(expect, actual) + # test out param + b = np.empty(shape=expect.shape, dtype=expect.dtype) + z.get_basic_selection(selection, out=b) + assert_array_equal(expect, b) + # noinspection PyStatementEffect def test_get_basic_selection_1d(store: StorePath): @@ -326,7 +333,7 @@ def test_fancy_indexing_fallback_on_get_setitem(store: StorePath): # test 1D fancy indexing z2 = zarr_array_from_numpy_array(store, np.zeros(5)) z2[[1, 2, 3]] = 1 - np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + np.testing.assert_array_equal(z2[:], [0, 1, 1, 1, 0]) @pytest.mark.parametrize( @@ -428,8 +435,8 @@ def test_orthogonal_indexing_fallback_on_setitem_2d(store: StorePath, index, exp z = zarr_array_from_numpy_array(store, a) z[index] = 1 a[index] = 1 - np.testing.assert_array_equal(z, expected_result) - np.testing.assert_array_equal(z, a, err_msg="Indexing disagrees with numpy") + np.testing.assert_array_equal(z[:], expected_result) + np.testing.assert_array_equal(z[:], a, err_msg="Indexing disagrees with numpy") def test_fancy_indexing_doesnt_mix_with_implicit_slicing(store: StorePath): @@ -450,15 +457,15 @@ def test_set_basic_selection_0d(store: StorePath): v = np.array(42) a = np.zeros_like(v) z = zarr_array_from_numpy_array(store, v) - assert_array_equal(a, z) + assert_array_equal(a, z[:]) # tests z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z) + assert_array_equal(v, z[:]) z[...] = 0 - assert_array_equal(a, z) + assert_array_equal(a, z[:]) z[...] = v - assert_array_equal(v, z) + assert_array_equal(v, z[:]) # test structured array value = (b"aaa", 1, 4.2) @@ -468,13 +475,13 @@ def test_set_basic_selection_0d(store: StorePath): # tests z.set_basic_selection(Ellipsis, v) - assert_array_equal(v, z) + assert_array_equal(v, z[:]) z.set_basic_selection(Ellipsis, a) - assert_array_equal(a, z) + assert_array_equal(a, z[:]) z[...] 
= v - assert_array_equal(v, z) + assert_array_equal(v, z[:]) z[...] = a - assert_array_equal(a, z) + assert_array_equal(a, z[:]) # with fields z.set_basic_selection(Ellipsis, v["foo"], fields="foo") assert v["foo"] == z["foo"] @@ -862,7 +869,7 @@ def test_orthogonal_indexing_fallback_on_get_setitem(store: StorePath): # test 1D fancy indexing z2 = zarr_array_from_numpy_array(store, np.zeros(5)) z2[[1, 2, 3]] = 1 - np.testing.assert_array_equal(z2, [0, 1, 1, 1, 0]) + np.testing.assert_array_equal(z2[:], [0, 1, 1, 1, 0]) def _test_get_coordinate_selection(a, z, selection): @@ -1098,7 +1105,7 @@ def _test_get_block_selection(a, z, selection, expected_idx): slice(3, 8, 2), # bad stuff 2.3, - "foo", + # "foo", # TODO b"xxx", None, (0, 0), @@ -1360,7 +1367,7 @@ def test_get_selection_out(store: StorePath): ] for selection in selections: expect = a[selection] - out = zarr_array_from_numpy_array(store, np.zeros(expect.shape), chunk_shape=(10,)) + out = np.empty(expect.shape) z.get_basic_selection(selection, out=out) assert_array_equal(expect, out[:]) @@ -1417,6 +1424,7 @@ def test_get_selection_out(store: StorePath): assert_array_equal(expect, out[:]) +@pytest.mark.xfail(reason="fields are not supported in v3") def test_get_selections_with_fields(store: StorePath): a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] a = np.array(a, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) @@ -1522,6 +1530,7 @@ def test_get_selections_with_fields(store: StorePath): z.get_basic_selection(Ellipsis, fields=slice(None)) +@pytest.mark.xfail(reason="fields are not supported in v3") def test_set_selections_with_fields(store: StorePath): v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)] v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")]) diff --git a/tests/v3/util.py b/tests/v3/util.py index 7c47ff578e..59475b59a5 100644 --- a/tests/v3/util.py +++ b/tests/v3/util.py @@ -38,8 +38,8 @@ def skip_test_env_var(name): def abs_container(): - from 
azure.core.exceptions import ResourceExistsError import azure.storage.blob as asb + from azure.core.exceptions import ResourceExistsError URL = "http://127.0.0.1:10000" ACCOUNT_NAME = "devstoreaccount1" From b24fc77727135f93669e8c0641992a08a4161089 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 24 May 2024 21:44:13 +0200 Subject: [PATCH 03/13] rename indexing module --- src/zarr/array.py | 8 +- src/zarr/chunk_grids.py | 6 +- src/zarr/codecs/pipeline.py | 3 +- src/zarr/codecs/sharding.py | 3 +- src/zarr/indexing.py | 1082 +++++++++++++++++++++++++++++++-- src/zarr/indexing2.py | 1124 ----------------------------------- tests/v3/test_indexing.py | 2 +- 7 files changed, 1035 insertions(+), 1193 deletions(-) delete mode 100644 src/zarr/indexing2.py diff --git a/src/zarr/array.py b/src/zarr/array.py index d81d3061f4..e75696c716 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import math # Notes on what I've changed here: # 1. 
Split Array into AsyncArray and Array @@ -34,9 +33,10 @@ Selection, ZarrFormat, concurrent_map, + product, ) from zarr.config import config -from zarr.indexing2 import ( +from zarr.indexing import ( BasicIndexer, BlockIndex, BlockIndexer, @@ -408,7 +408,7 @@ async def _get_selection( order=self.order, fill_value=self.metadata.fill_value, ) - if math.prod(indexer.shape) > 0: + if product(indexer.shape) > 0: # reading chunks and decoding them await self.metadata.codec_pipeline.read( [ @@ -433,7 +433,7 @@ async def getitem( shape=self.metadata.shape, chunk_grid=self.metadata.chunk_grid, ) - return await self._get_selection(indexer, factory=factory) + return await self._get_selection(indexer, create_factory=factory) async def _save_metadata(self, metadata: ArrayMetadata) -> None: to_save = metadata.to_buffer_dict() diff --git a/src/zarr/chunk_grids.py b/src/zarr/chunk_grids.py index 4fe90e4358..941f799849 100644 --- a/src/zarr/chunk_grids.py +++ b/src/zarr/chunk_grids.py @@ -16,7 +16,7 @@ parse_named_configuration, parse_shapelike, ) -from zarr.indexing import _ceildiv +from zarr.indexing import ceildiv if TYPE_CHECKING: from typing_extensions import Self @@ -63,12 +63,12 @@ def to_dict(self) -> dict[str, JSON]: def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: return itertools.product( - *(range(0, _ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) + *(range(0, ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape, strict=False)) ) def get_nchunks(self, array_shape: ChunkCoords) -> int: return reduce( operator.mul, - (_ceildiv(s, c) for s, c in zip(array_shape, self.chunk_shape, strict=True)), + (ceildiv(s, c) for s, c in zip(array_shape, self.chunk_shape, strict=True)), 1, ) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 398bbd0583..bdab8a6ac0 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -23,8 +23,7 @@ from zarr.codecs.registry import 
get_codec_class from zarr.common import JSON, concurrent_map, parse_named_configuration from zarr.config import config -from zarr.indexing import is_total_slice -from zarr.indexing2 import is_scalar +from zarr.indexing import is_scalar, is_total_slice from zarr.metadata import ArrayMetadata if TYPE_CHECKING: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 0dc958e11f..8fbe4f6654 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -34,8 +34,7 @@ parse_shapelike, product, ) -from zarr.indexing import c_order_iter, morton_order_iter -from zarr.indexing2 import BasicIndexer +from zarr.indexing import BasicIndexer, c_order_iter, morton_order_iter from zarr.metadata import ArrayMetadata, parse_codecs if TYPE_CHECKING: diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 45413bc5b2..4408cdd3ae 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -2,84 +2,242 @@ import itertools import math +import numbers +import operator from collections.abc import Iterator -from typing import TYPE_CHECKING, NamedTuple +from dataclasses import dataclass +from enum import Enum +from functools import reduce +from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable -from zarr.common import ChunkCoords, Selection, SliceSelection, product +import numpy as np +import numpy.typing as npt + +from zarr.common import ChunkCoords, SliceSelection, product +from zarr.v2.errors import ( + ArrayIndexError, + BoundsCheckError, + NegativeStepError, + VindexInvalidSelectionError, + err_too_many_indices, +) if TYPE_CHECKING: + from zarr.array import Array + from zarr.buffer import NDArrayLike from zarr.chunk_grids import ChunkGrid +Selector = int | slice | npt.NDArray[np.bool_] +Selection = tuple[Selector, ...] 
+Fields = str | list | tuple -def _ensure_tuple(v: Selection) -> SliceSelection: - if not isinstance(v, tuple): - v = (v,) - return v +@runtime_checkable +class Indexer(Protocol): + shape: ChunkCoords + drop_axes: ChunkCoords | None -def _err_too_many_indices(selection: SliceSelection, shape: ChunkCoords) -> None: - raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") + def __iter__(self) -> Iterator[ChunkProjection]: ... -def _err_negative_step() -> None: - raise IndexError("only slices with step >= 1 are supported") +def ceildiv(a: float, b: float) -> int: + return math.ceil(a / b) -def _check_selection_length(selection: SliceSelection, shape: ChunkCoords) -> None: - if len(selection) > len(shape): - _err_too_many_indices(selection, shape) +def is_integer(x: Selector) -> bool: + """True if x is an integer (both pure Python or NumPy). + Note that Python's bool is considered an integer too. + """ + return isinstance(x, numbers.Integral) -def _ensure_selection( - selection: Selection, - shape: ChunkCoords, -) -> SliceSelection: - selection = _ensure_tuple(selection) - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += (slice(None),) * (len(shape) - len(selection)) +def is_integer_list(x: Selector) -> bool: + """True if x is a list of integers. - # check selection not too long - _check_selection_length(selection, shape) + This function assumes ie *does not check* that all elements of the list + have the same type. Mixed type lists will result in other errors that will + bubble up anyway. 
+ """ + return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) + + +def is_integer_array(x: Selector, ndim: int | None = None) -> bool: + t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_bool_array(x: Selector, ndim: int | None = None) -> bool: + t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool + if ndim is not None: + t = t and len(x.shape) == ndim + return t + + +def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: + if np.isscalar(value): + return True + if hasattr(value, "shape") and value.shape == (): + return True + if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): + return True + return False + + +def is_pure_fancy_indexing(selection: Selection, ndim: int) -> bool: + """Check whether a selection contains only scalars or integer array-likes. + + Parameters + ---------- + selection : tuple, slice, or scalar + A valid selection value for indexing into arrays. + + Returns + ------- + is_pure : bool + True if the selection is a pure fancy indexing expression (ie not mixed + with boolean or slices). + """ + if ndim == 1: + if is_integer_list(selection) or is_integer_array(selection): + return True + # if not, we go through the normal path below, because a 1-tuple + # of integers is also allowed. 
+ no_slicing = ( + isinstance(selection, tuple) + and len(selection) == ndim + and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) + ) + return ( + no_slicing + and all( + is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) + for elem in selection + ) + and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) + ) + + +def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> bool: + if not ndim: + return False + + # Case 1: Selection is a single iterable of integers + if is_integer_list(selection) or is_integer_array(selection, ndim=1): + return True + + # Case two: selection contains either zero or one integer iterables. + # All other selection elements are slices or integers + return ( + isinstance(selection, tuple) + and len(selection) == ndim + and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 + and all( + is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, int | slice) + for elem in selection + ) + ) + + +def get_chunk_shape(chunk_grid: ChunkGrid) -> ChunkCoords: + from zarr.chunk_grids import RegularChunkGrid + + assert isinstance( + chunk_grid, RegularChunkGrid + ), "Only regular chunk grid is supported, currently." + return chunk_grid.chunk_shape - return selection +def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: + # normalize type to int + dim_sel = int(dim_sel) + + # handle wraparound + if dim_sel < 0: + dim_sel = dim_len + dim_sel + + # handle out of bounds + if dim_sel >= dim_len or dim_sel < 0: + raise BoundsCheckError(dim_len) + + return dim_sel + + +class ChunkDimProjection(NamedTuple): + """A mapping from chunk to output array for a single dimension. + + Parameters + ---------- + dim_chunk_ix + Index of chunk. + dim_chunk_sel + Selection of items from chunk array. + dim_out_sel + Selection of items in target (output) array. 
+ + """ -class _ChunkDimProjection(NamedTuple): dim_chunk_ix: int dim_chunk_sel: slice - dim_out_sel: slice | None + dim_out_sel: slice + + +@dataclass(frozen=True) +class IntDimIndexer: + dim_sel: int + dim_len: int + dim_chunk_len: int + nitems: int = 1 + + def __init__(self, dim_sel: int, dim_len: int, dim_chunk_len: int): + object.__setattr__(self, "dim_sel", normalize_integer_selection(dim_sel, dim_len)) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + def __iter__(self) -> Iterator[ChunkDimProjection]: + dim_chunk_ix = self.dim_sel // self.dim_chunk_len + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel - dim_offset + dim_out_sel = None + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def _ceildiv(a: float, b: float) -> int: + +def ceildiv(a: int, b: int) -> int: return math.ceil(a / b) -class _SliceDimIndexer: - dim_sel: slice +@dataclass(frozen=True) +class SliceDimIndexer: dim_len: int dim_chunk_len: int nitems: int + nchunks: int start: int stop: int step: int def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int): - self.start, self.stop, self.step = dim_sel.indices(dim_len) - if self.step < 1: - _err_negative_step() + # normalize + start, stop, step = dim_sel.indices(dim_len) + if step < 1: + raise NegativeStepError + + object.__setattr__(self, "start", start) + object.__setattr__(self, "stop", stop) + object.__setattr__(self, "step", step) - self.dim_len = dim_len - self.dim_chunk_len = dim_chunk_len - self.nitems = max(0, _ceildiv((self.stop - self.start), self.step)) - self.nchunks = _ceildiv(self.dim_len, self.dim_chunk_len) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) + object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) - def __iter__(self) -> Iterator[_ChunkDimProjection]: + 
def __iter__(self) -> Iterator[ChunkDimProjection]: # figure out the range of chunks we need to visit dim_chunk_ix_from = self.start // self.dim_chunk_len - dim_chunk_ix_to = _ceildiv(self.stop, self.dim_chunk_len) + dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) # iterate over chunks in range for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): @@ -97,7 +255,7 @@ def __iter__(self) -> Iterator[_ChunkDimProjection]: if remainder: dim_chunk_sel_start += self.step - remainder # compute number of previous items, provides offset into output array - dim_out_offset = _ceildiv((dim_offset - self.start), self.step) + dim_out_offset = ceildiv((dim_offset - self.start), self.step) else: # selection starts within current chunk @@ -113,43 +271,484 @@ def __iter__(self) -> Iterator[_ChunkDimProjection]: dim_chunk_sel_stop = self.stop - dim_offset dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = _ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) + + # If there are no elements on the selection within this chunk, then skip + if dim_chunk_nitems == 0: + continue + dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - yield _ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def check_selection_length(selection: Selection, shape: ChunkCoords): + if len(selection) > len(shape): + err_too_many_indices(selection, shape) + + +def replace_ellipsis(selection: Selection, shape: ChunkCoords): + selection = ensure_tuple(selection) + + # count number of ellipsis present + n_ellipsis = sum(1 for i in selection if i is Ellipsis) + + if n_ellipsis > 1: + # more than 1 is an error + raise IndexError("an index can only have a single ellipsis ('...')") + + elif n_ellipsis == 1: + # locate the ellipsis, count how many items to left and 
right + n_items_l = selection.index(Ellipsis) # items to left of ellipsis + n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis + n_items = len(selection) - 1 # all non-ellipsis items + + if n_items >= len(shape): + # ellipsis does nothing, just remove it + selection = tuple(i for i in selection if i != Ellipsis) + + else: + # replace ellipsis with as many slices are needed for number of dims + new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) + if n_items_r: + new_item += selection[-n_items_r:] + selection = new_item + + # fill out selection if not completely specified + if len(selection) < len(shape): + selection += (slice(None),) * (len(shape) - len(selection)) + + # check selection not too long + check_selection_length(selection, shape) + + return selection + +def replace_lists(selection: Selection) -> Selection: + return tuple( + np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection + ) + + +def ensure_tuple(v: Selector | tuple[Selector]) -> tuple[Selector, ...]: + if not isinstance(v, tuple): + v = (v,) + return v + + +class ChunkProjection(NamedTuple): + """A mapping of items from chunk to output array. Can be used to extract items from the + chunk array for loading into an output array. Can also be used to extract items from a + value array for setting/updating in a chunk array. + + Parameters + ---------- + chunk_coords + Indices of chunk. + chunk_selection + Selection of items from chunk array. + out_selection + Selection of items in target (output) array. 
+ + """ -class _ChunkProjection(NamedTuple): chunk_coords: ChunkCoords chunk_selection: SliceSelection out_selection: SliceSelection -class BasicIndexer: - dim_indexers: list[_SliceDimIndexer] +def is_slice(s: slice) -> bool: + return isinstance(s, slice) + + +def is_contiguous_slice(s: slice) -> bool: + return is_slice(s) and (s.step is None or s.step == 1) + + +def is_positive_slice(s: slice) -> bool: + return is_slice(s) and (s.step is None or s.step >= 1) + + +def is_contiguous_selection(selection: Selection) -> bool: + selection = ensure_tuple(selection) + return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) + + +def is_basic_selection(selection: Selection) -> bool: + selection = ensure_tuple(selection) + return all(is_integer(s) or is_positive_slice(s) for s in selection) + + +@dataclass(frozen=True) +class BasicIndexer(Indexer): + dim_indexers: list[IntDimIndexer | SliceDimIndexer] shape: ChunkCoords + drop_axes: None def __init__( self, selection: Selection, - shape: tuple[int, ...], + shape: ChunkCoords, chunk_grid: ChunkGrid, ): - from zarr.chunk_grids import RegularChunkGrid + chunk_shape = get_chunk_shape(chunk_grid) + # handle ellipsis + selection = replace_ellipsis(selection, shape) - assert isinstance( - chunk_grid, RegularChunkGrid - ), "Only regular chunk grid is supported, currently." 
# setup per-dimension indexers - self.dim_indexers = [ - _SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - for dim_sel, dim_len, dim_chunk_len in zip( - _ensure_selection(selection, shape), shape, chunk_grid.chunk_shape, strict=False + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_slice(dim_sel): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + "unsupported selection item for basic indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__( + self, + "shape", + tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)), + ) + object.__setattr__(self, "drop_axes", None) + + def __iter__(self) -> Iterator[ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class BoolArrayDimIndexer: + dim_sel: npt.NDArray[np.bool_] + dim_len: int + dim_chunk_len: int + nchunks: int + + chunk_nitems: npt.NDArray[Any] + chunk_nitems_cumsum: npt.NDArray[Any] + nitems: int + dim_chunk_ixs: int + + def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int): + # check number of dimensions + if not is_bool_array(dim_sel, 1): + raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") + + # check shape + if dim_sel.shape[0] != dim_len: + raise IndexError( + f"Boolean array has the wrong length for dimension; expected 
{dim_len}, got {dim_sel.shape[0]}" + ) + + # precompute number of selected items for each chunk + nchunks = ceildiv(dim_len, dim_chunk_len) + chunk_nitems = np.zeros(nchunks, dtype="i8") + for dim_chunk_ix in range(nchunks): + dim_offset = dim_chunk_ix * dim_chunk_len + chunk_nitems[dim_chunk_ix] = np.count_nonzero( + dim_sel[dim_offset : dim_offset + dim_chunk_len] ) - ] - self.shape = tuple(s.nitems for s in self.dim_indexers) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + nitems = chunk_nitems_cumsum[-1] + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] + + # store attributes + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + # iterate over chunks with at least one item + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] + + # pad out if final chunk + if dim_chunk_sel.shape[0] < self.dim_chunk_len: + tmp = np.zeros(self.dim_chunk_len, dtype=bool) + tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel + dim_chunk_sel = tmp + + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + dim_out_sel = slice(start, stop) + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +class Order(Enum): + UNKNOWN = 0 + INCREASING = 1 + DECREASING = 2 + UNORDERED = 3 + + @staticmethod + def check(a: npt.NDArray[Any]) -> Order: + diff = np.diff(a) + diff_positive = diff >= 0 + 
n_diff_positive = np.count_nonzero(diff_positive) + all_increasing = n_diff_positive == len(diff_positive) + any_increasing = n_diff_positive > 0 + if all_increasing: + order = Order.INCREASING + elif any_increasing: + order = Order.UNORDERED + else: + order = Order.DECREASING + return order + + +def wraparound_indices(x, dim_len): + loc_neg = x < 0 + if np.any(loc_neg): + x[loc_neg] = x[loc_neg] + dim_len + + +def boundscheck_indices(x, dim_len): + if np.any(x < 0) or np.any(x >= dim_len): + raise BoundsCheckError(dim_len) + + +@dataclass(frozen=True) +class IntArrayDimIndexer: + """Integer array selection against a single dimension.""" + + dim_len: int + dim_chunk_len: int + nchunks: int + nitems: int + order: Order + dim_sel: int + dim_out_sel: int + chunk_nitems: int + dim_chunk_ixs: npt.NDArray[np.intp] + chunk_nitems_cumsum: npt.NDArray[np.intp] + + def __init__( + self, + dim_sel: int, + dim_len: int, + dim_chunk_len: int, + wraparound=True, + boundscheck=True, + order=Order.UNKNOWN, + ): + # ensure 1d array + dim_sel = np.asanyarray(dim_sel) + if not is_integer_array(dim_sel, 1): + raise IndexError("integer arrays in an orthogonal selection must be 1-dimensional only") + + nitems = len(dim_sel) + nchunks = ceildiv(dim_len, dim_chunk_len) + + # handle wraparound + if wraparound: + wraparound_indices(dim_sel, dim_len) - def __iter__(self) -> Iterator[_ChunkProjection]: + # handle out of bounds + if boundscheck: + boundscheck_indices(dim_sel, dim_len) + + # determine which chunk is needed for each selection item + # note: for dense integer selections, the division operation here is the + # bottleneck + dim_sel_chunk = dim_sel // dim_chunk_len + + # determine order of indices + if order == Order.UNKNOWN: + order = Order.check(dim_sel) + order = Order(order) + + if order == Order.INCREASING: + dim_sel = dim_sel + dim_out_sel = None + elif order == Order.DECREASING: + dim_sel = dim_sel[::-1] + # TODO should be possible to do this without creating an arange + 
dim_out_sel = np.arange(nitems - 1, -1, -1) + else: + # sort indices to group by chunk + dim_out_sel = np.argsort(dim_sel_chunk) + dim_sel = np.take(dim_sel, dim_out_sel) + + # precompute number of selected items for each chunk + chunk_nitems = np.bincount(dim_sel_chunk, minlength=nchunks) + + # find chunks that we need to visit + dim_chunk_ixs = np.nonzero(chunk_nitems)[0] + + # compute offsets into the output array + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + + # store attributes + object.__setattr__(self, "dim_len", dim_len) + object.__setattr__(self, "dim_chunk_len", dim_chunk_len) + object.__setattr__(self, "nchunks", nchunks) + object.__setattr__(self, "nitems", nitems) + object.__setattr__(self, "order", order) + object.__setattr__(self, "dim_sel", dim_sel) + object.__setattr__(self, "dim_out_sel", dim_out_sel) + object.__setattr__(self, "chunk_nitems", chunk_nitems) + object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + + def __iter__(self) -> Iterator[ChunkDimProjection]: + for dim_chunk_ix in self.dim_chunk_ixs: + # find region in output + if dim_chunk_ix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] + stop = self.chunk_nitems_cumsum[dim_chunk_ix] + if self.order == Order.INCREASING: + dim_out_sel = slice(start, stop) + else: + dim_out_sel = self.dim_out_sel[start:stop] + + # find region in chunk + dim_offset = dim_chunk_ix * self.dim_chunk_len + dim_chunk_sel = self.dim_sel[start:stop] - dim_offset + + yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) + + +def slice_to_range(s: slice, l: int): # noqa: E741 + return range(*s.indices(l)) + + +def ix_(selection: Selection, shape: ChunkCoords): + """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ + but with support for slices and single ints.""" + + # normalisation + selection = replace_ellipsis(selection, shape) + + # replace slice and int 
as these are not supported by numpy.ix_ + selection = [ + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] + if is_integer(dim_sel) + else dim_sel + for dim_sel, dim_len in zip(selection, shape, strict=True) + ] + + # now get numpy to convert to a coordinate selection + selection = np.ix_(*selection) + + return selection + + +def oindex(a: npt.NDArray[Any], selection: Selection): + """Implementation of orthogonal indexing with slices and ints.""" + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + result = a[selection] + if drop_axes: + result = result.squeeze(axis=drop_axes) + return result + + +def oindex_set(a: npt.NDArray[Any], selection: Selection, value): + selection = replace_ellipsis(selection, a.shape) + drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) + selection = ix_(selection, a.shape) + if not np.isscalar(value) and drop_axes: + value = np.asanyarray(value) + value_selection = [slice(None)] * len(a.shape) + for i in drop_axes: + value_selection[i] = np.newaxis + value_selection = tuple(value_selection) + value = value[value_selection] + a[selection] = value + + +@dataclass(frozen=True) +class OrthogonalIndexer(Indexer): + dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer] + shape: ChunkCoords + chunk_shape: ChunkCoords + is_advanced: bool + drop_axes: tuple[int, ...] 
| None + + def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + # handle ellipsis + selection = replace_ellipsis(selection, shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + if is_integer(dim_sel): + dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif isinstance(dim_sel, slice): + dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_integer_array(dim_sel): + dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + elif is_bool_array(dim_sel): + dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) + + else: + raise IndexError( + "unsupported selection item for orthogonal indexing; " + "expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" + ) + + dim_indexers.append(dim_indexer) + + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers if not isinstance(s, IntDimIndexer)) + chunk_shape = chunk_shape + is_advanced = not is_basic_selection(selection) + if is_advanced: + drop_axes = tuple( + i + for i, dim_indexer in enumerate(dim_indexers) + if isinstance(dim_indexer, IntDimIndexer) + ) + else: + drop_axes = None + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "is_advanced", is_advanced) + object.__setattr__(self, "drop_axes", drop_axes) + + def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) @@ -157,7 +756,376 @@ def __iter__(self) -> Iterator[_ChunkProjection]: p.dim_out_sel for p in 
dim_projections if p.dim_out_sel is not None ) - yield _ChunkProjection(chunk_coords, chunk_selection, out_selection) + # handle advanced indexing arrays orthogonally + if self.is_advanced: + # N.B., numpy doesn't support orthogonal indexing directly as yet, + # so need to work around via np.ix_. Also np.ix_ does not support a + # mixture of arrays and slices or integers, so need to convert slices + # and integers into ranges. + chunk_selection = ix_(chunk_selection, self.chunk_shape) + + # special case for non-monotonic indices + if not is_basic_selection(out_selection): + out_selection = ix_(out_selection, self.shape) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class OIndex: + array: Array + + def __getitem__(self, selection: Selection) -> NDArrayLike: + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_orthogonal_selection(selection, fields=fields) + + def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_orthogonal_selection(selection, value, fields=fields) + + +@dataclass(frozen=True) +class BlockIndexer(Indexer): + dim_indexers: list[SliceDimIndexer] + shape: ChunkCoords + drop_axes: None + + def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + # handle ellipsis + selection = replace_ellipsis(selection, shape) + + # normalize list to array + selection = replace_lists(selection) + + # setup per-dimension indexers + dim_indexers = [] + for dim_sel, dim_len, dim_chunk_size in zip(selection, shape, chunk_shape, strict=True): + dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) + + if is_integer(dim_sel): + if dim_sel < 0: + dim_sel = dim_numchunks + dim_sel + + start = dim_sel * 
dim_chunk_size + stop = start + dim_chunk_size + slice_ = slice(start, stop) + + elif is_slice(dim_sel): + start = dim_sel.start if dim_sel.start is not None else 0 + stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks + + if dim_sel.step not in {1, None}: + raise IndexError( + "unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" + ) + + # Can't reuse wraparound_indices because it expects a numpy array + # We have integers here. + if start < 0: + start = dim_numchunks + start + if stop < 0: + stop = dim_numchunks + stop + + start = start * dim_chunk_size + stop = stop * dim_chunk_size + slice_ = slice(start, stop) + + else: + raise IndexError( + "unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" + ) + + dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) + dim_indexers.append(dim_indexer) + + if start >= dim_len or start < 0: + raise BoundsCheckError(dim_len) + + dim_indexers = dim_indexers + shape = tuple(s.nitems for s in dim_indexers) + + object.__setattr__(self, "dim_indexers", dim_indexers) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", None) + + def __iter__(self) -> Iterator[ChunkProjection]: + for dim_projections in itertools.product(*self.dim_indexers): + chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) + chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) + out_selection = tuple( + p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class BlockIndex: + array: Array + + def __getitem__(self, selection: Selection) -> NDArrayLike: + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.get_block_selection(selection, fields=fields) + + def 
__setitem__(self, selection: Selection, value: NDArrayLike) -> None: + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + return self.array.set_block_selection(selection, value, fields=fields) + + +def is_coordinate_selection(selection: Selection, shape: ChunkCoords) -> bool: + return (len(selection) == len(shape)) and all( + is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection + ) + + +def is_mask_selection(selection: Selection, shape: ChunkCoords) -> bool: + return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape + + +@dataclass(frozen=True) +class CoordinateIndexer(Indexer): + sel_shape: ChunkCoords + selection: Selection + sel_sort: npt.NDArray[np.intp] | None + chunk_nitems_cumsum: npt.NDArray[np.intp] + chunk_rixs: npt.NDArray[np.intp] + chunk_mixs: tuple[npt.NDArray[np.intp], ...] + shape: ChunkCoords + chunk_shape: ChunkCoords + drop_axes: None + + def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + chunk_shape = get_chunk_shape(chunk_grid) + + if shape == (): + cdata_shape = (1,) + else: + cdata_shape = tuple(math.ceil(s / c) for s, c in zip(shape, chunk_shape, strict=True)) + nchunks = reduce(operator.mul, cdata_shape, 1) + + # some initial normalization + selection = ensure_tuple(selection) + selection = tuple([i] if is_integer(i) else i for i in selection) + selection = replace_lists(selection) + + # validation + if not is_coordinate_selection(selection, shape): + raise IndexError( + "invalid coordinate selection; expected one integer " + "(coordinate) array per dimension of the target array, " + f"got {selection!r}" + ) + + # handle wraparound, boundscheck + for dim_sel, dim_len in zip(selection, shape, strict=True): + # handle wraparound + wraparound_indices(dim_sel, dim_len) + + # handle out of bounds + boundscheck_indices(dim_sel, dim_len) + + # compute chunk index for each point in the 
selection + chunks_multi_index = tuple( + dim_sel // dim_chunk_len + for (dim_sel, dim_chunk_len) in zip(selection, chunk_shape, strict=True) + ) + + # broadcast selection - this will raise error if array dimensions don't match + selection = np.broadcast_arrays(*selection) + chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) + + # remember shape of selection, because we will flatten indices for processing + sel_shape = selection[0].shape if selection[0].shape else (1,) + + # flatten selection + selection = [dim_sel.reshape(-1) for dim_sel in selection] + chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + + # ravel chunk indices + chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=cdata_shape) + + # group points by chunk + if np.any(np.diff(chunks_raveled_indices) < 0): + # optimisation, only sort if needed + sel_sort = np.argsort(chunks_raveled_indices) + selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + else: + sel_sort = None + + shape = selection[0].shape if selection[0].shape else (1,) + + # precompute number of selected items for each chunk + chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) + chunk_nitems_cumsum = np.cumsum(chunk_nitems) + # locate the chunks we need to process + chunk_rixs = np.nonzero(chunk_nitems)[0] + + # unravel chunk indices + chunk_mixs = np.unravel_index(chunk_rixs, cdata_shape) + + object.__setattr__(self, "sel_shape", sel_shape) + object.__setattr__(self, "selection", selection) + object.__setattr__(self, "sel_sort", sel_sort) + object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) + object.__setattr__(self, "chunk_rixs", chunk_rixs) + object.__setattr__(self, "chunk_mixs", chunk_mixs) + object.__setattr__(self, "chunk_shape", chunk_shape) + object.__setattr__(self, "shape", shape) + object.__setattr__(self, "drop_axes", None) + + def __iter__(self) -> Iterator[ChunkProjection]: + # iterate over chunks + for i, chunk_rix 
in enumerate(self.chunk_rixs): + chunk_coords = tuple(m[i] for m in self.chunk_mixs) + if chunk_rix == 0: + start = 0 + else: + start = self.chunk_nitems_cumsum[chunk_rix - 1] + stop = self.chunk_nitems_cumsum[chunk_rix] + if self.sel_sort is None: + out_selection = slice(start, stop) + else: + out_selection = self.sel_sort[start:stop] + + chunk_offsets = tuple( + dim_chunk_ix * dim_chunk_len + for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.chunk_shape, strict=True) + ) + chunk_selection = tuple( + dim_sel[start:stop] - dim_chunk_offset + for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True) + ) + + yield ChunkProjection(chunk_coords, chunk_selection, out_selection) + + +@dataclass(frozen=True) +class MaskIndexer(CoordinateIndexer): + def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + # some initial normalization + selection = ensure_tuple(selection) + selection = replace_lists(selection) + + # validation + if not is_mask_selection(selection, shape): + raise IndexError( + "invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection!r}" + ) + + # convert to indices + selection = np.nonzero(selection[0]) + + # delegate the rest to superclass + super().__init__(selection, shape, chunk_grid) + + +@dataclass(frozen=True) +class VIndex: + array: Array + + def __getitem__(self, selection: Selection) -> NDArrayLike: + fields, selection = pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array.shape): + return self.array.get_coordinate_selection(selection, fields=fields) + elif is_mask_selection(selection, self.array.shape): + return self.array.get_mask_selection(selection, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: + fields, selection = 
pop_fields(selection) + selection = ensure_tuple(selection) + selection = replace_lists(selection) + if is_coordinate_selection(selection, self.array.shape): + self.array.set_coordinate_selection(selection, value, fields=fields) + elif is_mask_selection(selection, self.array.shape): + self.array.set_mask_selection(selection, value, fields=fields) + else: + raise VindexInvalidSelectionError(selection) + + +def check_fields(fields: Fields, dtype: npt.DTypeLike) -> npt.DTypeLike: + # early out + if fields is None: + return dtype + # check type + if not isinstance(fields, Fields): + raise IndexError( + f"'fields' argument must be a string or list of strings; found {type(fields)!r}" + ) + if fields: + if dtype.names is None: + raise IndexError("invalid 'fields' argument, array does not have any fields") + try: + if isinstance(fields, str): + # single field selection + out_dtype = dtype[fields] + else: + # multiple field selection + out_dtype = np.dtype([(f, dtype[f]) for f in fields]) + except KeyError as e: + raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e + else: + return out_dtype + else: + return dtype + + +def check_no_multi_fields(fields: Fields) -> Fields: + if isinstance(fields, list): + if len(fields) == 1: + return fields[0] + elif len(fields) > 1: + raise IndexError("multiple fields are not supported for this operation") + return fields + + +def pop_fields(selection: Selection) -> tuple[Fields | None, Selection]: + if isinstance(selection, str): + # single field selection + fields = selection + selection = () + elif not isinstance(selection, tuple): + # single selection item, no fields + fields = None + # leave selection as-is + else: + # multiple items, split fields from selection items + fields = [f for f in selection if isinstance(f, str)] + fields = fields[0] if len(fields) == 1 else fields + selection = tuple(s for s in selection if not isinstance(s, str)) + selection = selection[0] if len(selection) == 1 else selection + 
return fields, selection + + +def make_slice_selection(selection: Selection) -> list[int | slice]: + ls = [] + for dim_selection in selection: + if is_integer(dim_selection): + ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) + elif isinstance(dim_selection, np.ndarray): + if len(dim_selection) == 1: + ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) + else: + raise ArrayIndexError + else: + ls.append(dim_selection) + return ls def morton_order_iter(chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: diff --git a/src/zarr/indexing2.py b/src/zarr/indexing2.py deleted file mode 100644 index 8852b3dabf..0000000000 --- a/src/zarr/indexing2.py +++ /dev/null @@ -1,1124 +0,0 @@ -from __future__ import annotations - -import itertools -import math -import numbers -import operator -from collections.abc import Iterator -from dataclasses import dataclass -from enum import Enum -from functools import reduce -from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable - -import numpy as np -import numpy.typing as npt - -from zarr.common import ChunkCoords, SliceSelection -from zarr.v2.errors import ( - ArrayIndexError, - BoundsCheckError, - NegativeStepError, - VindexInvalidSelectionError, - err_too_many_indices, -) - -if TYPE_CHECKING: - from zarr.array import Array - from zarr.buffer import NDArrayLike - from zarr.chunk_grids import ChunkGrid - -Selector = int | slice | npt.NDArray[np.bool_] -Selection = tuple[Selector, ...] -Fields = str | list | tuple - - -@runtime_checkable -class Indexer(Protocol): - shape: ChunkCoords - drop_axes: ChunkCoords | None - - def __iter__(self) -> Iterator[ChunkProjection]: ... - - -def is_integer(x: Selector) -> bool: - """True if x is an integer (both pure Python or NumPy). - - Note that Python's bool is considered an integer too. - """ - return isinstance(x, numbers.Integral) - - -def is_integer_list(x: Selector) -> bool: - """True if x is a list of integers. 
- - This function assumes ie *does not check* that all elements of the list - have the same type. Mixed type lists will result in other errors that will - bubble up anyway. - """ - return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) - - -def is_integer_array(x: Selector, ndim: int | None = None) -> bool: - t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" - if ndim is not None: - t = t and len(x.shape) == ndim - return t - - -def is_bool_array(x: Selector, ndim: int | None = None) -> bool: - t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool - if ndim is not None: - t = t and len(x.shape) == ndim - return t - - -def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: - if np.isscalar(value): - return True - if hasattr(value, "shape") and value.shape == (): - return True - if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names): - return True - return False - - -def is_pure_fancy_indexing(selection: Selection, ndim: int) -> bool: - """Check whether a selection contains only scalars or integer array-likes. - - Parameters - ---------- - selection : tuple, slice, or scalar - A valid selection value for indexing into arrays. - - Returns - ------- - is_pure : bool - True if the selection is a pure fancy indexing expression (ie not mixed - with boolean or slices). - """ - if ndim == 1: - if is_integer_list(selection) or is_integer_array(selection): - return True - # if not, we go through the normal path below, because a 1-tuple - # of integers is also allowed. 
- no_slicing = ( - isinstance(selection, tuple) - and len(selection) == ndim - and not (any(isinstance(elem, slice) or elem is Ellipsis for elem in selection)) - ) - return ( - no_slicing - and all( - is_integer(elem) or is_integer_list(elem) or is_integer_array(elem) - for elem in selection - ) - and any(is_integer_list(elem) or is_integer_array(elem) for elem in selection) - ) - - -def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> bool: - if not ndim: - return False - - # Case 1: Selection is a single iterable of integers - if is_integer_list(selection) or is_integer_array(selection, ndim=1): - return True - - # Case two: selection contains either zero or one integer iterables. - # All other selection elements are slices or integers - return ( - isinstance(selection, tuple) - and len(selection) == ndim - and sum(is_integer_list(elem) or is_integer_array(elem) for elem in selection) <= 1 - and all( - is_integer_list(elem) or is_integer_array(elem) or isinstance(elem, int | slice) - for elem in selection - ) - ) - - -def get_chunk_shape(chunk_grid: ChunkGrid) -> ChunkCoords: - from zarr.chunk_grids import RegularChunkGrid - - assert isinstance( - chunk_grid, RegularChunkGrid - ), "Only regular chunk grid is supported, currently." - return chunk_grid.chunk_shape - - -def normalize_integer_selection(dim_sel: int, dim_len: int) -> int: - # normalize type to int - dim_sel = int(dim_sel) - - # handle wraparound - if dim_sel < 0: - dim_sel = dim_len + dim_sel - - # handle out of bounds - if dim_sel >= dim_len or dim_sel < 0: - raise BoundsCheckError(dim_len) - - return dim_sel - - -class ChunkDimProjection(NamedTuple): - """A mapping from chunk to output array for a single dimension. - - Parameters - ---------- - dim_chunk_ix - Index of chunk. - dim_chunk_sel - Selection of items from chunk array. - dim_out_sel - Selection of items in target (output) array. 
- - """ - - dim_chunk_ix: int - dim_chunk_sel: slice - dim_out_sel: slice - - -@dataclass(frozen=True) -class IntDimIndexer: - dim_sel: int - dim_len: int - dim_chunk_len: int - nitems: int = 1 - - def __init__(self, dim_sel: int, dim_len: int, dim_chunk_len: int): - object.__setattr__(self, "dim_sel", normalize_integer_selection(dim_sel, dim_len)) - object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) - - def __iter__(self) -> Iterator[ChunkDimProjection]: - dim_chunk_ix = self.dim_sel // self.dim_chunk_len - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel - dim_offset - dim_out_sel = None - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def ceildiv(a: int, b: int) -> int: - return math.ceil(a / b) - - -@dataclass(frozen=True) -class SliceDimIndexer: - dim_len: int - dim_chunk_len: int - nitems: int - nchunks: int - - start: int - stop: int - step: int - - def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int): - # normalize - start, stop, step = dim_sel.indices(dim_len) - if step < 1: - raise NegativeStepError - - object.__setattr__(self, "start", start) - object.__setattr__(self, "stop", stop) - object.__setattr__(self, "step", step) - - object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) - object.__setattr__(self, "nitems", max(0, ceildiv((stop - start), step))) - object.__setattr__(self, "nchunks", ceildiv(dim_len, dim_chunk_len)) - - def __iter__(self) -> Iterator[ChunkDimProjection]: - # figure out the range of chunks we need to visit - dim_chunk_ix_from = self.start // self.dim_chunk_len - dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) - - # iterate over chunks in range - for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to): - # compute offsets for chunk within overall array - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_limit = min(self.dim_len, (dim_chunk_ix + 
1) * self.dim_chunk_len) - - # determine chunk length, accounting for trailing chunk - dim_chunk_len = dim_limit - dim_offset - - if self.start < dim_offset: - # selection starts before current chunk - dim_chunk_sel_start = 0 - remainder = (dim_offset - self.start) % self.step - if remainder: - dim_chunk_sel_start += self.step - remainder - # compute number of previous items, provides offset into output array - dim_out_offset = ceildiv((dim_offset - self.start), self.step) - - else: - # selection starts within current chunk - dim_chunk_sel_start = self.start - dim_offset - dim_out_offset = 0 - - if self.stop > dim_limit: - # selection ends after current chunk - dim_chunk_sel_stop = dim_chunk_len - - else: - # selection ends within current chunk - dim_chunk_sel_stop = self.stop - dim_offset - - dim_chunk_sel = slice(dim_chunk_sel_start, dim_chunk_sel_stop, self.step) - dim_chunk_nitems = ceildiv((dim_chunk_sel_stop - dim_chunk_sel_start), self.step) - - # If there are no elements on the selection within this chunk, then skip - if dim_chunk_nitems == 0: - continue - - dim_out_sel = slice(dim_out_offset, dim_out_offset + dim_chunk_nitems) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def check_selection_length(selection: Selection, shape: ChunkCoords): - if len(selection) > len(shape): - err_too_many_indices(selection, shape) - - -def replace_ellipsis(selection: Selection, shape: ChunkCoords): - selection = ensure_tuple(selection) - - # count number of ellipsis present - n_ellipsis = sum(1 for i in selection if i is Ellipsis) - - if n_ellipsis > 1: - # more than 1 is an error - raise IndexError("an index can only have a single ellipsis ('...')") - - elif n_ellipsis == 1: - # locate the ellipsis, count how many items to left and right - n_items_l = selection.index(Ellipsis) # items to left of ellipsis - n_items_r = len(selection) - (n_items_l + 1) # items to right of ellipsis - n_items = len(selection) - 1 # all non-ellipsis items - - if 
n_items >= len(shape): - # ellipsis does nothing, just remove it - selection = tuple(i for i in selection if i != Ellipsis) - - else: - # replace ellipsis with as many slices are needed for number of dims - new_item = selection[:n_items_l] + ((slice(None),) * (len(shape) - n_items)) - if n_items_r: - new_item += selection[-n_items_r:] - selection = new_item - - # fill out selection if not completely specified - if len(selection) < len(shape): - selection += (slice(None),) * (len(shape) - len(selection)) - - # check selection not too long - check_selection_length(selection, shape) - - return selection - - -def replace_lists(selection: Selection) -> Selection: - return tuple( - np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection - ) - - -def ensure_tuple(v: Selector | tuple[Selector]) -> tuple[Selector, ...]: - if not isinstance(v, tuple): - v = (v,) - return v - - -class ChunkProjection(NamedTuple): - """A mapping of items from chunk to output array. Can be used to extract items from the - chunk array for loading into an output array. Can also be used to extract items from a - value array for setting/updating in a chunk array. - - Parameters - ---------- - chunk_coords - Indices of chunk. - chunk_selection - Selection of items from chunk array. - out_selection - Selection of items in target (output) array. 
- - """ - - chunk_coords: ChunkCoords - chunk_selection: SliceSelection - out_selection: SliceSelection - - -def is_slice(s: slice) -> bool: - return isinstance(s, slice) - - -def is_contiguous_slice(s: slice) -> bool: - return is_slice(s) and (s.step is None or s.step == 1) - - -def is_positive_slice(s: slice) -> bool: - return is_slice(s) and (s.step is None or s.step >= 1) - - -def is_contiguous_selection(selection: Selection) -> bool: - selection = ensure_tuple(selection) - return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) - - -def is_basic_selection(selection: Selection) -> bool: - selection = ensure_tuple(selection) - return all(is_integer(s) or is_positive_slice(s) for s in selection) - - -@dataclass(frozen=True) -class BasicIndexer(Indexer): - dim_indexers: list[IntDimIndexer | SliceDimIndexer] - shape: ChunkCoords - drop_axes: None - - def __init__( - self, - selection: Selection, - shape: ChunkCoords, - chunk_grid: ChunkGrid, - ): - chunk_shape = get_chunk_shape(chunk_grid) - # handle ellipsis - selection = replace_ellipsis(selection, shape) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): - if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_slice(dim_sel): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError( - "unsupported selection item for basic indexing; " - f"expected integer or slice, got {type(dim_sel)!r}" - ) - - dim_indexers.append(dim_indexer) - - object.__setattr__(self, "dim_indexers", dim_indexers) - object.__setattr__( - self, - "shape", - tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)), - ) - object.__setattr__(self, "drop_axes", None) - - def __iter__(self) -> Iterator[ChunkProjection]: - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = 
tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -@dataclass(frozen=True) -class BoolArrayDimIndexer: - dim_sel: npt.NDArray[np.bool_] - dim_len: int - dim_chunk_len: int - nchunks: int - - chunk_nitems: npt.NDArray[Any] - chunk_nitems_cumsum: npt.NDArray[Any] - nitems: int - dim_chunk_ixs: int - - def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int): - # check number of dimensions - if not is_bool_array(dim_sel, 1): - raise IndexError("Boolean arrays in an orthogonal selection must be 1-dimensional only") - - # check shape - if dim_sel.shape[0] != dim_len: - raise IndexError( - f"Boolean array has the wrong length for dimension; expected {dim_len}, got {dim_sel.shape[0]}" - ) - - # precompute number of selected items for each chunk - nchunks = ceildiv(dim_len, dim_chunk_len) - chunk_nitems = np.zeros(nchunks, dtype="i8") - for dim_chunk_ix in range(nchunks): - dim_offset = dim_chunk_ix * dim_chunk_len - chunk_nitems[dim_chunk_ix] = np.count_nonzero( - dim_sel[dim_offset : dim_offset + dim_chunk_len] - ) - chunk_nitems_cumsum = np.cumsum(chunk_nitems) - nitems = chunk_nitems_cumsum[-1] - dim_chunk_ixs = np.nonzero(chunk_nitems)[0] - - # store attributes - object.__setattr__(self, "dim_sel", dim_sel) - object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) - object.__setattr__(self, "nchunks", nchunks) - object.__setattr__(self, "chunk_nitems", chunk_nitems) - object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) - object.__setattr__(self, "nitems", nitems) - object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) - - def __iter__(self) -> Iterator[ChunkDimProjection]: - # iterate over chunks with at least one item - for dim_chunk_ix 
in self.dim_chunk_ixs: - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] - - # pad out if final chunk - if dim_chunk_sel.shape[0] < self.dim_chunk_len: - tmp = np.zeros(self.dim_chunk_len, dtype=bool) - tmp[: dim_chunk_sel.shape[0]] = dim_chunk_sel - dim_chunk_sel = tmp - - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - dim_out_sel = slice(start, stop) - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -class Order(Enum): - UNKNOWN = 0 - INCREASING = 1 - DECREASING = 2 - UNORDERED = 3 - - @staticmethod - def check(a: npt.NDArray[Any]) -> Order: - diff = np.diff(a) - diff_positive = diff >= 0 - n_diff_positive = np.count_nonzero(diff_positive) - all_increasing = n_diff_positive == len(diff_positive) - any_increasing = n_diff_positive > 0 - if all_increasing: - order = Order.INCREASING - elif any_increasing: - order = Order.UNORDERED - else: - order = Order.DECREASING - return order - - -def wraparound_indices(x, dim_len): - loc_neg = x < 0 - if np.any(loc_neg): - x[loc_neg] = x[loc_neg] + dim_len - - -def boundscheck_indices(x, dim_len): - if np.any(x < 0) or np.any(x >= dim_len): - raise BoundsCheckError(dim_len) - - -@dataclass(frozen=True) -class IntArrayDimIndexer: - """Integer array selection against a single dimension.""" - - dim_len: int - dim_chunk_len: int - nchunks: int - nitems: int - order: Order - dim_sel: int - dim_out_sel: int - chunk_nitems: int - dim_chunk_ixs: npt.NDArray[np.intp] - chunk_nitems_cumsum: npt.NDArray[np.intp] - - def __init__( - self, - dim_sel: int, - dim_len: int, - dim_chunk_len: int, - wraparound=True, - boundscheck=True, - order=Order.UNKNOWN, - ): - # ensure 1d array - dim_sel = np.asanyarray(dim_sel) - if not is_integer_array(dim_sel, 1): - raise IndexError("integer arrays in an 
orthogonal selection must be 1-dimensional only") - - nitems = len(dim_sel) - nchunks = ceildiv(dim_len, dim_chunk_len) - - # handle wraparound - if wraparound: - wraparound_indices(dim_sel, dim_len) - - # handle out of bounds - if boundscheck: - boundscheck_indices(dim_sel, dim_len) - - # determine which chunk is needed for each selection item - # note: for dense integer selections, the division operation here is the - # bottleneck - dim_sel_chunk = dim_sel // dim_chunk_len - - # determine order of indices - if order == Order.UNKNOWN: - order = Order.check(dim_sel) - order = Order(order) - - if order == Order.INCREASING: - dim_sel = dim_sel - dim_out_sel = None - elif order == Order.DECREASING: - dim_sel = dim_sel[::-1] - # TODO should be possible to do this without creating an arange - dim_out_sel = np.arange(nitems - 1, -1, -1) - else: - # sort indices to group by chunk - dim_out_sel = np.argsort(dim_sel_chunk) - dim_sel = np.take(dim_sel, dim_out_sel) - - # precompute number of selected items for each chunk - chunk_nitems = np.bincount(dim_sel_chunk, minlength=nchunks) - - # find chunks that we need to visit - dim_chunk_ixs = np.nonzero(chunk_nitems)[0] - - # compute offsets into the output array - chunk_nitems_cumsum = np.cumsum(chunk_nitems) - - # store attributes - object.__setattr__(self, "dim_len", dim_len) - object.__setattr__(self, "dim_chunk_len", dim_chunk_len) - object.__setattr__(self, "nchunks", nchunks) - object.__setattr__(self, "nitems", nitems) - object.__setattr__(self, "order", order) - object.__setattr__(self, "dim_sel", dim_sel) - object.__setattr__(self, "dim_out_sel", dim_out_sel) - object.__setattr__(self, "chunk_nitems", chunk_nitems) - object.__setattr__(self, "dim_chunk_ixs", dim_chunk_ixs) - object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) - - def __iter__(self) -> Iterator[ChunkDimProjection]: - for dim_chunk_ix in self.dim_chunk_ixs: - # find region in output - if dim_chunk_ix == 0: - start = 0 - else: - start = 
self.chunk_nitems_cumsum[dim_chunk_ix - 1] - stop = self.chunk_nitems_cumsum[dim_chunk_ix] - if self.order == Order.INCREASING: - dim_out_sel = slice(start, stop) - else: - dim_out_sel = self.dim_out_sel[start:stop] - - # find region in chunk - dim_offset = dim_chunk_ix * self.dim_chunk_len - dim_chunk_sel = self.dim_sel[start:stop] - dim_offset - - yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) - - -def slice_to_range(s: slice, l: int): # noqa: E741 - return range(*s.indices(l)) - - -def ix_(selection: Selection, shape: ChunkCoords): - """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ - but with support for slices and single ints.""" - - # normalisation - selection = replace_ellipsis(selection, shape) - - # replace slice and int as these are not supported by numpy.ix_ - selection = [ - slice_to_range(dim_sel, dim_len) - if isinstance(dim_sel, slice) - else [dim_sel] - if is_integer(dim_sel) - else dim_sel - for dim_sel, dim_len in zip(selection, shape, strict=True) - ] - - # now get numpy to convert to a coordinate selection - selection = np.ix_(*selection) - - return selection - - -def oindex(a: npt.NDArray[Any], selection: Selection): - """Implementation of orthogonal indexing with slices and ints.""" - selection = replace_ellipsis(selection, a.shape) - drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) - selection = ix_(selection, a.shape) - result = a[selection] - if drop_axes: - result = result.squeeze(axis=drop_axes) - return result - - -def oindex_set(a: npt.NDArray[Any], selection: Selection, value): - selection = replace_ellipsis(selection, a.shape) - drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) - selection = ix_(selection, a.shape) - if not np.isscalar(value) and drop_axes: - value = np.asanyarray(value) - value_selection = [slice(None)] * len(a.shape) - for i in drop_axes: - value_selection[i] = np.newaxis - value_selection = tuple(value_selection) - 
value = value[value_selection] - a[selection] = value - - -@dataclass(frozen=True) -class OrthogonalIndexer(Indexer): - dim_indexers: list[IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer] - shape: ChunkCoords - chunk_shape: ChunkCoords - is_advanced: bool - drop_axes: tuple[int, ...] | None - - def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): - chunk_shape = get_chunk_shape(chunk_grid) - - # handle ellipsis - selection = replace_ellipsis(selection, shape) - - # normalize list to array - selection = replace_lists(selection) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): - if is_integer(dim_sel): - dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif isinstance(dim_sel, slice): - dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_integer_array(dim_sel): - dim_indexer = IntArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - elif is_bool_array(dim_sel): - dim_indexer = BoolArrayDimIndexer(dim_sel, dim_len, dim_chunk_len) - - else: - raise IndexError( - "unsupported selection item for orthogonal indexing; " - "expected integer, slice, integer array or Boolean " - f"array, got {type(dim_sel)!r}" - ) - - dim_indexers.append(dim_indexer) - - dim_indexers = dim_indexers - shape = tuple(s.nitems for s in dim_indexers if not isinstance(s, IntDimIndexer)) - chunk_shape = chunk_shape - is_advanced = not is_basic_selection(selection) - if is_advanced: - drop_axes = tuple( - i - for i, dim_indexer in enumerate(dim_indexers) - if isinstance(dim_indexer, IntDimIndexer) - ) - else: - drop_axes = None - - object.__setattr__(self, "dim_indexers", dim_indexers) - object.__setattr__(self, "shape", shape) - object.__setattr__(self, "chunk_shape", chunk_shape) - object.__setattr__(self, "is_advanced", is_advanced) - object.__setattr__(self, "drop_axes", drop_axes) - - def __iter__(self) -> 
Iterator[ChunkProjection]: - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - # handle advanced indexing arrays orthogonally - if self.is_advanced: - # N.B., numpy doesn't support orthogonal indexing directly as yet, - # so need to work around via np.ix_. Also np.ix_ does not support a - # mixture of arrays and slices or integers, so need to convert slices - # and integers into ranges. - chunk_selection = ix_(chunk_selection, self.chunk_shape) - - # special case for non-monotonic indices - if not is_basic_selection(out_selection): - out_selection = ix_(out_selection, self.shape) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -@dataclass(frozen=True) -class OIndex: - array: Array - - def __getitem__(self, selection: Selection) -> NDArrayLike: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_orthogonal_selection(selection, fields=fields) - - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_orthogonal_selection(selection, value, fields=fields) - - -@dataclass(frozen=True) -class BlockIndexer(Indexer): - dim_indexers: list[SliceDimIndexer] - shape: ChunkCoords - drop_axes: None - - def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): - chunk_shape = get_chunk_shape(chunk_grid) - - # handle ellipsis - selection = replace_ellipsis(selection, shape) - - # normalize list to array - selection = replace_lists(selection) - - # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_size in 
zip(selection, shape, chunk_shape, strict=True): - dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) - - if is_integer(dim_sel): - if dim_sel < 0: - dim_sel = dim_numchunks + dim_sel - - start = dim_sel * dim_chunk_size - stop = start + dim_chunk_size - slice_ = slice(start, stop) - - elif is_slice(dim_sel): - start = dim_sel.start if dim_sel.start is not None else 0 - stop = dim_sel.stop if dim_sel.stop is not None else dim_numchunks - - if dim_sel.step not in {1, None}: - raise IndexError( - "unsupported selection item for block indexing; " - f"expected integer or slice with step=1, got {type(dim_sel)!r}" - ) - - # Can't reuse wraparound_indices because it expects a numpy array - # We have integers here. - if start < 0: - start = dim_numchunks + start - if stop < 0: - stop = dim_numchunks + stop - - start = start * dim_chunk_size - stop = stop * dim_chunk_size - slice_ = slice(start, stop) - - else: - raise IndexError( - "unsupported selection item for block indexing; " - f"expected integer or slice, got {type(dim_sel)!r}" - ) - - dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) - dim_indexers.append(dim_indexer) - - if start >= dim_len or start < 0: - raise BoundsCheckError(dim_len) - - dim_indexers = dim_indexers - shape = tuple(s.nitems for s in dim_indexers) - - object.__setattr__(self, "dim_indexers", dim_indexers) - object.__setattr__(self, "shape", shape) - object.__setattr__(self, "drop_axes", None) - - def __iter__(self) -> Iterator[ChunkProjection]: - for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( - p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -@dataclass(frozen=True) -class BlockIndex: - array: Array - - def __getitem__(self, selection: Selection) -> NDArrayLike: 
- fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_block_selection(selection, fields=fields) - - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_block_selection(selection, value, fields=fields) - - -def is_coordinate_selection(selection: Selection, shape: ChunkCoords) -> bool: - return (len(selection) == len(shape)) and all( - is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection - ) - - -def is_mask_selection(selection: Selection, shape: ChunkCoords) -> bool: - return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape - - -@dataclass(frozen=True) -class CoordinateIndexer(Indexer): - sel_shape: ChunkCoords - selection: Selection - sel_sort: npt.NDArray[np.intp] | None - chunk_nitems_cumsum: npt.NDArray[np.intp] - chunk_rixs: npt.NDArray[np.intp] - chunk_mixs: tuple[npt.NDArray[np.intp], ...] 
- shape: ChunkCoords - chunk_shape: ChunkCoords - drop_axes: None - - def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): - chunk_shape = get_chunk_shape(chunk_grid) - - if shape == (): - cdata_shape = (1,) - else: - cdata_shape = tuple(math.ceil(s / c) for s, c in zip(shape, chunk_shape, strict=True)) - nchunks = reduce(operator.mul, cdata_shape, 1) - - # some initial normalization - selection = ensure_tuple(selection) - selection = tuple([i] if is_integer(i) else i for i in selection) - selection = replace_lists(selection) - - # validation - if not is_coordinate_selection(selection, shape): - raise IndexError( - "invalid coordinate selection; expected one integer " - "(coordinate) array per dimension of the target array, " - f"got {selection!r}" - ) - - # handle wraparound, boundscheck - for dim_sel, dim_len in zip(selection, shape, strict=True): - # handle wraparound - wraparound_indices(dim_sel, dim_len) - - # handle out of bounds - boundscheck_indices(dim_sel, dim_len) - - # compute chunk index for each point in the selection - chunks_multi_index = tuple( - dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection, chunk_shape, strict=True) - ) - - # broadcast selection - this will raise error if array dimensions don't match - selection = np.broadcast_arrays(*selection) - chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) - - # remember shape of selection, because we will flatten indices for processing - sel_shape = selection[0].shape if selection[0].shape else (1,) - - # flatten selection - selection = [dim_sel.reshape(-1) for dim_sel in selection] - chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] - - # ravel chunk indices - chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=cdata_shape) - - # group points by chunk - if np.any(np.diff(chunks_raveled_indices) < 0): - # optimisation, only sort if needed - sel_sort = 
np.argsort(chunks_raveled_indices) - selection = tuple(dim_sel[sel_sort] for dim_sel in selection) - else: - sel_sort = None - - shape = selection[0].shape if selection[0].shape else (1,) - - # precompute number of selected items for each chunk - chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) - chunk_nitems_cumsum = np.cumsum(chunk_nitems) - # locate the chunks we need to process - chunk_rixs = np.nonzero(chunk_nitems)[0] - - # unravel chunk indices - chunk_mixs = np.unravel_index(chunk_rixs, cdata_shape) - - object.__setattr__(self, "sel_shape", sel_shape) - object.__setattr__(self, "selection", selection) - object.__setattr__(self, "sel_sort", sel_sort) - object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) - object.__setattr__(self, "chunk_rixs", chunk_rixs) - object.__setattr__(self, "chunk_mixs", chunk_mixs) - object.__setattr__(self, "chunk_shape", chunk_shape) - object.__setattr__(self, "shape", shape) - object.__setattr__(self, "drop_axes", None) - - def __iter__(self) -> Iterator[ChunkProjection]: - # iterate over chunks - for i, chunk_rix in enumerate(self.chunk_rixs): - chunk_coords = tuple(m[i] for m in self.chunk_mixs) - if chunk_rix == 0: - start = 0 - else: - start = self.chunk_nitems_cumsum[chunk_rix - 1] - stop = self.chunk_nitems_cumsum[chunk_rix] - if self.sel_sort is None: - out_selection = slice(start, stop) - else: - out_selection = self.sel_sort[start:stop] - - chunk_offsets = tuple( - dim_chunk_ix * dim_chunk_len - for dim_chunk_ix, dim_chunk_len in zip(chunk_coords, self.chunk_shape, strict=True) - ) - chunk_selection = tuple( - dim_sel[start:stop] - dim_chunk_offset - for (dim_sel, dim_chunk_offset) in zip(self.selection, chunk_offsets, strict=True) - ) - - yield ChunkProjection(chunk_coords, chunk_selection, out_selection) - - -@dataclass(frozen=True) -class MaskIndexer(CoordinateIndexer): - def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): - # some initial normalization - 
selection = ensure_tuple(selection) - selection = replace_lists(selection) - - # validation - if not is_mask_selection(selection, shape): - raise IndexError( - "invalid mask selection; expected one Boolean (mask)" - f"array with the same shape as the target array, got {selection!r}" - ) - - # convert to indices - selection = np.nonzero(selection[0]) - - # delegate the rest to superclass - super().__init__(selection, shape, chunk_grid) - - -@dataclass(frozen=True) -class VIndex: - array: Array - - def __getitem__(self, selection: Selection) -> NDArrayLike: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array.shape): - return self.array.get_coordinate_selection(selection, fields=fields) - elif is_mask_selection(selection, self.array.shape): - return self.array.get_mask_selection(selection, fields=fields) - else: - raise VindexInvalidSelectionError(selection) - - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array.shape): - self.array.set_coordinate_selection(selection, value, fields=fields) - elif is_mask_selection(selection, self.array.shape): - self.array.set_mask_selection(selection, value, fields=fields) - else: - raise VindexInvalidSelectionError(selection) - - -def check_fields(fields: Fields, dtype: npt.DTypeLike) -> npt.DTypeLike: - # early out - if fields is None: - return dtype - # check type - if not isinstance(fields, Fields): - raise IndexError( - f"'fields' argument must be a string or list of strings; found {type(fields)!r}" - ) - if fields: - if dtype.names is None: - raise IndexError("invalid 'fields' argument, array does not have any fields") - try: - if isinstance(fields, str): - # single field selection - out_dtype = dtype[fields] - else: - 
# multiple field selection - out_dtype = np.dtype([(f, dtype[f]) for f in fields]) - except KeyError as e: - raise IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e - else: - return out_dtype - else: - return dtype - - -def check_no_multi_fields(fields: Fields) -> Fields: - if isinstance(fields, list): - if len(fields) == 1: - return fields[0] - elif len(fields) > 1: - raise IndexError("multiple fields are not supported for this operation") - return fields - - -def pop_fields(selection: Selection) -> tuple[Fields | None, Selection]: - if isinstance(selection, str): - # single field selection - fields = selection - selection = () - elif not isinstance(selection, tuple): - # single selection item, no fields - fields = None - # leave selection as-is - else: - # multiple items, split fields from selection items - fields = [f for f in selection if isinstance(f, str)] - fields = fields[0] if len(fields) == 1 else fields - selection = tuple(s for s in selection if not isinstance(s, str)) - selection = selection[0] if len(selection) == 1 else selection - return fields, selection - - -def make_slice_selection(selection: Selection) -> list[int | slice]: - ls = [] - for dim_selection in selection: - if is_integer(dim_selection): - ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) - elif isinstance(dim_selection, np.ndarray): - if len(dim_selection) == 1: - ls.append(slice(int(dim_selection[0]), int(dim_selection[0]) + 1, 1)) - else: - raise ArrayIndexError - else: - ls.append(dim_selection) - return ls diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py index 717fe733ca..467fd46533 100644 --- a/tests/v3/test_indexing.py +++ b/tests/v3/test_indexing.py @@ -12,7 +12,7 @@ import zarr from zarr.abc.store import Store from zarr.common import ChunkCoords -from zarr.indexing2 import ( +from zarr.indexing import ( make_slice_selection, normalize_integer_selection, oindex, From 4073cd3de59dca2f6d2be6e8cd3aa66d35e1d27c Mon Sep 17 
00:00:00 2001 From: Norman Rzepka Date: Fri, 24 May 2024 21:57:23 +0200 Subject: [PATCH 04/13] NDBuffer --- src/zarr/array.py | 27 +++++++++++---------------- src/zarr/indexing.py | 4 ---- tests/v3/test_indexing.py | 23 ++++++++++++----------- 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index e75696c716..9125ec86a3 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -379,9 +379,8 @@ async def _get_selection( self, indexer: Indexer, *, - out: NDArrayLike | NDBuffer | None = None, - create_factory: Factory.Create = NDBuffer.create, - from_factory: Factory.Create = NDBuffer.from_ndarray_like, + out: NDBuffer | None = None, + factory: Factory.Create = NDBuffer.create, fields: Fields | None = None, ) -> NDArrayLike: # check fields are sensible @@ -389,20 +388,16 @@ async def _get_selection( # setup output buffer if out is not None: - if isinstance(out, NDArrayLike): - out_buffer = NDBuffer.from_ndarray_like(out) - elif isinstance(out, NDBuffer): + if isinstance(out, NDBuffer): out_buffer = out else: - raise TypeError( - f"out argument needs to be either an ndarray or an NDBuffer. Got {type(out)!r}" - ) + raise TypeError(f"out argument needs to be an NDBuffer. Got {type(out)!r}") if out_buffer.shape != indexer.shape: raise ValueError( f"shape of out argument doesn't match. 
Expected {indexer.shape}, got {out.shape}" ) else: - out_buffer = create_factory( + out_buffer = factory( shape=indexer.shape, dtype=out_dtype, order=self.order, @@ -433,7 +428,7 @@ async def getitem( shape=self.metadata.shape, chunk_grid=self.metadata.chunk_grid, ) - return await self._get_selection(indexer, create_factory=factory) + return await self._get_selection(indexer, factory=factory) async def _save_metadata(self, metadata: ArrayMetadata) -> None: to_save = metadata.to_buffer_dict() @@ -666,7 +661,7 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: def get_basic_selection( self, selection: Selection = Ellipsis, - out: NDArrayLike | NDBuffer | None = None, + out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: if self.shape == (): @@ -689,7 +684,7 @@ def set_basic_selection( def get_orthogonal_selection( self, selection: Selection, - out: NDArrayLike | NDBuffer | None = None, + out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) @@ -704,7 +699,7 @@ def set_orthogonal_selection( def get_mask_selection( self, mask: npt.NDArray[Any], - out: NDArrayLike | NDBuffer | None = None, + out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) @@ -719,7 +714,7 @@ def set_mask_selection( def get_coordinate_selection( self, selection: CoordinateSelection, - out: NDArrayLike | NDBuffer | None = None, + out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) @@ -752,7 +747,7 @@ def set_coordinate_selection( def get_block_selection( self, selection: Selection, - out: NDArrayLike | NDBuffer | None = None, + out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: indexer = BlockIndexer(selection, self.shape, 
self.metadata.chunk_grid) diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 4408cdd3ae..d51a6e6622 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -204,10 +204,6 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def ceildiv(a: int, b: int) -> int: - return math.ceil(a / b) - - @dataclass(frozen=True) class SliceDimIndexer: dim_len: int diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py index 467fd46533..53dfd81e5c 100644 --- a/tests/v3/test_indexing.py +++ b/tests/v3/test_indexing.py @@ -11,6 +11,7 @@ import zarr from zarr.abc.store import Store +from zarr.buffer import NDBuffer from zarr.common import ChunkCoords from zarr.indexing import ( make_slice_selection, @@ -107,7 +108,7 @@ def test_get_basic_selection_0d(store: StorePath): assert 42 == z[()] # test out param - b = np.zeros_like(a) + b = NDBuffer.from_numpy_array(np.zeros_like(a)) z.get_basic_selection(Ellipsis, out=b) assert_array_equal(a, b) @@ -125,10 +126,10 @@ def test_get_basic_selection_0d(store: StorePath): assert a[["foo", "bar"]] == z.get_basic_selection((), fields=["foo", "bar"]) assert a[["foo", "bar"]] == z["foo", "bar"] # test out param - b = np.zeros_like(a) + b = NDBuffer.from_numpy_array(np.zeros_like(a)) z.get_basic_selection(Ellipsis, out=b) assert_array_equal(a, b) - c = np.zeros_like(a[["foo", "bar"]]) + c = NDBuffer.from_numpy_array(np.zeros_like(a[["foo", "bar"]])) z.get_basic_selection(Ellipsis, out=c, fields=["foo", "bar"]) assert_array_equal(a[["foo", "bar"]], c) @@ -217,9 +218,9 @@ def _test_get_basic_selection(a, z, selection): assert_array_equal(expect, actual) # test out param - b = np.empty(shape=expect.shape, dtype=expect.dtype) + b = NDBuffer.from_numpy_array(np.empty(shape=expect.shape, dtype=expect.dtype)) z.get_basic_selection(selection, out=b) - assert_array_equal(expect, b) + assert_array_equal(expect, b.as_numpy_array()) # noinspection 
PyStatementEffect @@ -1367,9 +1368,9 @@ def test_get_selection_out(store: StorePath): ] for selection in selections: expect = a[selection] - out = np.empty(expect.shape) + out = NDBuffer.from_numpy_array(np.empty(expect.shape)) z.get_basic_selection(selection, out=out) - assert_array_equal(expect, out[:]) + assert_array_equal(expect, out.as_numpy_array()[:]) with pytest.raises(TypeError): z.get_basic_selection(Ellipsis, out=[]) @@ -1397,9 +1398,9 @@ def test_get_selection_out(store: StorePath): ] for selection in selections: expect = oindex(a, selection) - out = np.zeros(expect.shape, dtype=expect.dtype) + out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype)) z.get_orthogonal_selection(selection, out=out) - assert_array_equal(expect, out[:]) + assert_array_equal(expect, out.as_numpy_array()[:]) # coordinate selections a = np.arange(10000, dtype=int).reshape(1000, 10) @@ -1419,9 +1420,9 @@ def test_get_selection_out(store: StorePath): ] for selection in selections: expect = a[selection] - out = np.zeros(expect.shape, dtype=expect.dtype) + out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype)) z.get_coordinate_selection(selection, out=out) - assert_array_equal(expect, out[:]) + assert_array_equal(expect, out.as_numpy_array()[:]) @pytest.mark.xfail(reason="fields are not supported in v3") From f2515d11b9edf643b8b3d4d4e6506977774d967f Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 27 May 2024 14:39:45 +0200 Subject: [PATCH 05/13] typing --- src/zarr/buffer.py | 6 ++++-- src/zarr/indexing.py | 14 +++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py index a6dc7f8fe4..8d76a51f04 100644 --- a/src/zarr/buffer.py +++ b/src/zarr/buffer.py @@ -63,7 +63,9 @@ def __getitem__(self, key: slice) -> Self: ... def __setitem__(self, key: slice, value: Any) -> None: ... - def reshape(self, shape: ChunkCoords, *, order: Literal["A", "C", "F"] = ...) -> Self: ... 
+ def reshape( + self, shape: ChunkCoords | Literal[-1], *, order: Literal["A", "C", "F"] = ... + ) -> Self: ... def view(self, dtype: npt.DTypeLike) -> Self: ... @@ -418,7 +420,7 @@ def byteorder(self) -> Endian: else: return Endian(sys.byteorder) - def reshape(self, newshape: ChunkCoords) -> Self: + def reshape(self, newshape: ChunkCoords | Literal[-1]) -> Self: return self.__class__(self._data.reshape(newshape)) def squeeze(self, axis: tuple[int, ...]) -> Self: diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index d51a6e6622..79ca3e58c1 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from enum import Enum from functools import reduce -from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, TypeGuard, runtime_checkable import numpy as np import numpy.typing as npt @@ -44,7 +44,7 @@ def ceildiv(a: float, b: float) -> int: return math.ceil(a / b) -def is_integer(x: Selector) -> bool: +def is_integer(x: Selector) -> TypeGuard[int]: """True if x is an integer (both pure Python or NumPy). Note that Python's bool is considered an integer too. @@ -52,7 +52,7 @@ def is_integer(x: Selector) -> bool: return isinstance(x, numbers.Integral) -def is_integer_list(x: Selector) -> bool: +def is_integer_list(x: Selector) -> TypeGuard[list[int]]: """True if x is a list of integers. 
This function assumes ie *does not check* that all elements of the list @@ -62,17 +62,17 @@ def is_integer_list(x: Selector) -> bool: return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) -def is_integer_array(x: Selector, ndim: int | None = None) -> bool: +def is_integer_array(x: Selector, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.intp]]: t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" if ndim is not None: - t = t and len(x.shape) == ndim + t = t and hasattr(x, "shape") and len(x.shape) == ndim return t -def is_bool_array(x: Selector, ndim: int | None = None) -> bool: +def is_bool_array(x: Selector, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.bool_]]: t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool if ndim is not None: - t = t and len(x.shape) == ndim + t = t and hasattr(x, "shape") and len(x.shape) == ndim return t From 6e35e7e53a5a3c42298ab944ed0f1574f2d3ca00 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 29 May 2024 17:31:45 +0100 Subject: [PATCH 06/13] progress on typing --- src/zarr/abc/codec.py | 23 +-- src/zarr/array.py | 47 +++--- src/zarr/codecs/pipeline.py | 20 +-- src/zarr/codecs/sharding.py | 16 +- src/zarr/indexing.py | 311 +++++++++++++++++++++++------------- 5 files changed, 260 insertions(+), 157 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 9dfb939cca..682f445547 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -13,7 +13,8 @@ if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import ArraySpec, SliceSelection + from zarr.common import ArraySpec + from zarr.indexing import SelectorTuple from zarr.metadata import ArrayMetadata @@ -155,13 +156,13 @@ class ArrayBytesCodecPartialDecodeMixin: """Mixin for array-to-bytes codecs that implement partial decoding.""" async def _decode_partial_single( - self, byte_getter: ByteGetter, selection: SliceSelection, chunk_spec: ArraySpec + 
self, byte_getter: ByteGetter, selection: SelectorTuple, chunk_spec: ArraySpec ) -> NDBuffer | None: raise NotImplementedError async def decode_partial( self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]], ) -> Iterable[NDBuffer | None]: """Partially decodes a batch of chunks. This method determines parts of a chunk from the slice selection, @@ -169,7 +170,7 @@ async def decode_partial( Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]] + batch_info : Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]] Ordered set of information about slices of encoded chunks. The slice selection determines which parts of the chunk will be fetched. The ByteGetter is used to fetch the necessary bytes. @@ -196,14 +197,14 @@ async def _encode_partial_single( self, byte_setter: ByteSetter, chunk_array: NDBuffer, - selection: SliceSelection, + selection: SelectorTuple, chunk_spec: ArraySpec, ) -> None: raise NotImplementedError async def encode_partial( self, - batch_info: Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]], ) -> None: """Partially encodes a batch of chunks. This method determines parts of a chunk from the slice selection, encodes them and @@ -213,7 +214,7 @@ async def encode_partial( Parameters ---------- - batch_info : Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]] + batch_info : Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]] Ordered set of information about slices of to-be-encoded chunks. The slice selection determines which parts of the chunk will be encoded. The ByteSetter is used to write the necessary bytes and fetch bytes for existing chunk data. 
@@ -342,7 +343,7 @@ async def encode( @abstractmethod async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, drop_axes: tuple[int, ...] | None = None, ) -> None: @@ -351,7 +352,7 @@ async def read( Parameters ---------- - batch_info : Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]] + batch_info : Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be fetched. The second slice selection determines where in the output array the chunk data will be written. @@ -364,7 +365,7 @@ async def read( @abstractmethod async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, drop_axes: tuple[int, ...] | None = None, ) -> None: @@ -374,7 +375,7 @@ async def write( Parameters ---------- - batch_info : Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]] + batch_info : Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]] Ordered set of information about the chunks. The first slice selection determines which parts of the chunk will be encoded. The second slice selection determines where in the value array the chunk data is located. 
diff --git a/src/zarr/array.py b/src/zarr/array.py index 9125ec86a3..260fb45f6b 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -12,7 +12,7 @@ from asyncio import gather from collections.abc import Iterable from dataclasses import dataclass, replace -from typing import Any, Literal +from typing import Any, Literal, cast import numpy as np import numpy.typing as npt @@ -38,14 +38,19 @@ from zarr.config import config from zarr.indexing import ( BasicIndexer, + BasicSelection, BlockIndex, BlockIndexer, + BlockSelection, CoordinateIndexer, + CoordinateSelection, Fields, Indexer, MaskIndexer, + MaskSelection, OIndex, OrthogonalIndexer, + OrthogonalSelection, VIndex, check_no_multi_fields, is_pure_fancy_indexing, @@ -58,8 +63,6 @@ from zarr.sync import sync from zarr.v2.indexing import check_fields -CoordinateSelection = Iterable[int | Iterable[int]] - def parse_array_metadata(data: Any) -> ArrayMetadata: if isinstance(data, ArrayMetadata): @@ -642,25 +645,29 @@ def order(self) -> Literal["C", "F"]: def __getitem__(self, selection: Selection) -> NDArrayLike: fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): - result = self.vindex[selection] + result = self.vindex[cast(CoordinateSelection | MaskSelection, selection)] elif is_pure_orthogonal_indexing(pure_selection, self.ndim): - result = self.get_orthogonal_selection(pure_selection, fields=fields) + result = self.get_orthogonal_selection( + cast(OrthogonalSelection, pure_selection), fields=fields + ) else: - result = self.get_basic_selection(pure_selection, fields=fields) + result = self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) return result def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): - self.vindex[selection] = value + self.vindex[cast(CoordinateSelection | MaskSelection, selection)] = value elif 
is_pure_orthogonal_indexing(pure_selection, self.ndim): - self.set_orthogonal_selection(pure_selection, value, fields=fields) + self.set_orthogonal_selection( + cast(OrthogonalSelection, pure_selection), value, fields=fields + ) else: - self.set_basic_selection(pure_selection, value, fields=fields) + self.set_basic_selection(cast(BasicSelection, pure_selection), value, fields=fields) def get_basic_selection( self, - selection: Selection = Ellipsis, + selection: BasicSelection = Ellipsis, out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: @@ -676,14 +683,14 @@ def get_basic_selection( ) def set_basic_selection( - self, selection: Selection, value: NDArrayLike, fields: Fields | None = None + self, selection: BasicSelection, value: NDArrayLike, fields: Fields | None = None ) -> None: indexer = BasicIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_orthogonal_selection( self, - selection: Selection, + selection: OrthogonalSelection, out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: @@ -691,22 +698,19 @@ def get_orthogonal_selection( return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) def set_orthogonal_selection( - self, selection: Selection, value: NDArrayLike, fields: Fields | None = None + self, selection: OrthogonalSelection, value: NDArrayLike, fields: Fields | None = None ) -> None: indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) return sync(self._async_array._set_selection(indexer, value, fields=fields)) def get_mask_selection( - self, - mask: npt.NDArray[Any], - out: NDBuffer | None = None, - fields: Fields | None = None, + self, mask: MaskSelection, out: NDBuffer | None = None, fields: Fields | None = None ) -> NDArrayLike: indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) return sync(self._async_array._get_selection(indexer=indexer, out=out, 
fields=fields)) def set_mask_selection( - self, mask: npt.NDArray[Any], value: NDArrayLike, fields: Fields | None = None + self, mask: MaskSelection, value: NDArrayLike, fields: Fields | None = None ) -> None: indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields)) @@ -746,7 +750,7 @@ def set_coordinate_selection( def get_block_selection( self, - selection: Selection, + selection: BlockSelection, out: NDBuffer | None = None, fields: Fields | None = None, ) -> NDArrayLike: @@ -754,7 +758,10 @@ def get_block_selection( return sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) def set_block_selection( - self, selection: Selection, value: npt.NDArray[Any], fields: Fields | None = None + self, + selection: BlockSelection, + value: NDArrayLike, + fields: Fields | None = None, ) -> None: indexer = BlockIndexer(selection, self.shape, self.metadata.chunk_grid) sync(self._async_array._set_selection(indexer, value, fields=fields)) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index bdab8a6ac0..33ef4ad8d0 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -23,13 +23,13 @@ from zarr.codecs.registry import get_codec_class from zarr.common import JSON, concurrent_map, parse_named_configuration from zarr.config import config -from zarr.indexing import is_scalar, is_total_slice +from zarr.indexing import SelectorTuple, is_scalar, is_total_slice from zarr.metadata import ArrayMetadata if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import ArraySpec, SliceSelection + from zarr.common import ArraySpec T = TypeVar("T") U = TypeVar("U") @@ -250,7 +250,7 @@ async def decode_batch( async def decode_partial_batch( self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteGetter, SelectorTuple, ArraySpec]], ) -> Iterable[NDBuffer | None]: assert 
self.supports_partial_decode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) @@ -285,7 +285,7 @@ async def encode_batch( async def encode_partial_batch( self, - batch_info: Iterable[tuple[ByteSetter, NDBuffer, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteSetter, NDBuffer, SelectorTuple, ArraySpec]], ) -> None: assert self.supports_partial_encode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) @@ -293,7 +293,7 @@ async def encode_partial_batch( async def read_batch( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, drop_axes: tuple[int, ...] | None = None, ) -> None: @@ -338,7 +338,7 @@ async def read_batch( async def write_batch( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, drop_axes: tuple[int, ...] | None = None, ) -> None: @@ -378,9 +378,9 @@ async def _read_key(byte_setter: ByteSetter | None) -> Buffer | None: def _merge_chunk_array( existing_chunk_array: NDBuffer | None, value: NDBuffer, - out_selection: SliceSelection, + out_selection: SelectorTuple, chunk_spec: ArraySpec, - chunk_selection: SliceSelection, + chunk_selection: SelectorTuple, ) -> NDBuffer: if ( is_total_slice(chunk_selection, chunk_spec.shape) @@ -474,7 +474,7 @@ async def encode( async def read( self, - batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, drop_axes: tuple[int, ...] 
| None = None, ) -> None: @@ -489,7 +489,7 @@ async def read( async def write( self, - batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, drop_axes: tuple[int, ...] | None = None, ) -> None: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8fbe4f6654..0a50e3a021 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -34,7 +34,7 @@ parse_shapelike, product, ) -from zarr.indexing import BasicIndexer, c_order_iter, morton_order_iter +from zarr.indexing import BasicIndexer, SelectorTuple, c_order_iter, get_indexer, morton_order_iter from zarr.metadata import ArrayMetadata, parse_codecs if TYPE_CHECKING: @@ -42,7 +42,7 @@ from typing_extensions import Self - from zarr.common import JSON, SliceSelection + from zarr.common import JSON MAX_UINT_64 = 2**64 - 1 ShardMapping = Mapping[ChunkCoords, Buffer] @@ -420,7 +420,7 @@ async def _decode_single( async def _decode_partial_single( self, byte_getter: ByteGetter, - selection: SliceSelection, + selection: SelectorTuple, shard_spec: ArraySpec, ) -> NDBuffer | None: shard_shape = shard_spec.shape @@ -428,7 +428,7 @@ async def _decode_partial_single( chunks_per_shard = self._get_chunks_per_shard(shard_spec) chunk_spec = self._get_chunk_spec(shard_spec) - indexer = BasicIndexer( + indexer = get_indexer( selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), @@ -517,7 +517,7 @@ async def _encode_partial_single( self, byte_setter: ByteSetter, shard_array: NDBuffer, - selection: SliceSelection, + selection: SelectorTuple, shard_spec: ArraySpec, ) -> None: shard_shape = shard_spec.shape @@ -532,10 +532,8 @@ async def _encode_partial_single( ) indexer = list( - BasicIndexer( - selection, - shape=shard_shape, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + get_indexer( + selection, shape=shard_shape, 
chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape) ) ) diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 79ca3e58c1..a188b0902b 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -8,28 +8,81 @@ from dataclasses import dataclass from enum import Enum from functools import reduce -from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, TypeGuard, runtime_checkable +from types import EllipsisType +from typing import ( + TYPE_CHECKING, + Any, + NamedTuple, + Protocol, + TypeGuard, + TypeVar, + cast, + runtime_checkable, +) import numpy as np import numpy.typing as npt -from zarr.common import ChunkCoords, SliceSelection, product -from zarr.v2.errors import ( - ArrayIndexError, - BoundsCheckError, - NegativeStepError, - VindexInvalidSelectionError, - err_too_many_indices, -) +from zarr.common import ChunkCoords, product if TYPE_CHECKING: from zarr.array import Array from zarr.buffer import NDArrayLike from zarr.chunk_grids import ChunkGrid -Selector = int | slice | npt.NDArray[np.bool_] -Selection = tuple[Selector, ...] -Fields = str | list | tuple +BasicSelector = int | slice | EllipsisType +BasicSelectorTuple = tuple[BasicSelector, ...] +BasicSelection = BasicSelector | BasicSelectorTuple +BasicSelectionNormalized = tuple[int | slice, ...] +CoordinateSelection = npt.NDArray[np.intp] +BlockSelector = int | slice +BlockSelection = BlockSelector | tuple[BlockSelector, ...] +BlockSelectionNormalized = tuple[BlockSelector, ...] +MaskSelection = npt.NDArray[np.bool_] +OrthogonalSelector = int | slice | npt.NDArray[np.intp | np.bool_] +OrthogonalSelection = OrthogonalSelector | tuple[OrthogonalSelector, ...] +OrthogonalSelectionNormalized = tuple[OrthogonalSelector, ...] 
+ +Selection = ( + BasicSelection | CoordinateSelection | BlockSelection | MaskSelection | OrthogonalSelection +) +SelectionNormalized = ( + BasicSelectionNormalized + | CoordinateSelection + | BlockSelectionNormalized + | MaskSelection + | OrthogonalSelectionNormalized +) +Selector = int | slice | npt.NDArray[np.intp | np.bool_] +SelectorTuple = tuple[Selector, ...] +Fields = str | list[str] | tuple[str, ...] + + +class ArrayIndexError(IndexError): + pass + + +class BoundsCheckError(IndexError): + _msg = "" + + def __init__(self, dim_len: int): + self._msg = f"index out of bounds for dimension with length {dim_len}" + + +class NegativeStepError(IndexError): + _msg = "only slices with step >= 1 are supported" + + +class VindexInvalidSelectionError(IndexError): + _msg = ( + "unsupported selection type for vectorized indexing; only " + "coordinate selection (tuple of integer arrays) and mask selection " + "(single Boolean array) are supported; got {0!r}" + ) + + +def err_too_many_indices(selection: Any, shape: ChunkCoords) -> None: + raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") @runtime_checkable @@ -44,7 +97,7 @@ def ceildiv(a: float, b: float) -> int: return math.ceil(a / b) -def is_integer(x: Selector) -> TypeGuard[int]: +def is_integer(x: Any) -> TypeGuard[int]: """True if x is an integer (both pure Python or NumPy). Note that Python's bool is considered an integer too. @@ -52,7 +105,7 @@ def is_integer(x: Selector) -> TypeGuard[int]: return isinstance(x, numbers.Integral) -def is_integer_list(x: Selector) -> TypeGuard[list[int]]: +def is_integer_list(x: Any) -> TypeGuard[list[int]]: """True if x is a list of integers. 
This function assumes ie *does not check* that all elements of the list @@ -62,21 +115,21 @@ def is_integer_list(x: Selector) -> TypeGuard[list[int]]: return isinstance(x, list) and len(x) > 0 and is_integer(x[0]) -def is_integer_array(x: Selector, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.intp]]: +def is_integer_array(x: Any, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.intp]]: t = not np.isscalar(x) and hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype.kind in "ui" if ndim is not None: t = t and hasattr(x, "shape") and len(x.shape) == ndim return t -def is_bool_array(x: Selector, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.bool_]]: +def is_bool_array(x: Any, ndim: int | None = None) -> TypeGuard[npt.NDArray[np.bool_]]: t = hasattr(x, "shape") and hasattr(x, "dtype") and x.dtype == bool if ndim is not None: t = t and hasattr(x, "shape") and len(x.shape) == ndim return t -def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: +def is_scalar(value: Any, dtype: np.dtype[Any]) -> bool: if np.isscalar(value): return True if hasattr(value, "shape") and value.shape == (): @@ -86,7 +139,7 @@ def is_scalar(value: Selector, dtype: npt.DTypeLike) -> bool: return False -def is_pure_fancy_indexing(selection: Selection, ndim: int) -> bool: +def is_pure_fancy_indexing(selection: Any, ndim: int) -> bool: """Check whether a selection contains only scalars or integer array-likes. 
Parameters @@ -120,7 +173,7 @@ def is_pure_fancy_indexing(selection: Selection, ndim: int) -> bool: ) -def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> bool: +def is_pure_orthogonal_indexing(selection: Selection, ndim: int) -> TypeGuard[OrthogonalSelection]: if not ndim: return False @@ -180,8 +233,8 @@ class ChunkDimProjection(NamedTuple): """ dim_chunk_ix: int - dim_chunk_sel: slice - dim_out_sel: slice + dim_chunk_sel: Selector + dim_out_sel: Selector | None @dataclass(frozen=True) @@ -278,12 +331,12 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def check_selection_length(selection: Selection, shape: ChunkCoords): +def check_selection_length(selection: SelectionNormalized, shape: ChunkCoords) -> None: if len(selection) > len(shape): err_too_many_indices(selection, shape) -def replace_ellipsis(selection: Selection, shape: ChunkCoords): +def replace_ellipsis(selection: Any, shape: ChunkCoords) -> SelectionNormalized: selection = ensure_tuple(selection) # count number of ellipsis present @@ -320,13 +373,16 @@ def replace_ellipsis(selection: Selection, shape: ChunkCoords): return selection -def replace_lists(selection: Selection) -> Selection: +def replace_lists(selection: SelectionNormalized) -> SelectionNormalized: return tuple( np.asarray(dim_sel) if isinstance(dim_sel, list) else dim_sel for dim_sel in selection ) -def ensure_tuple(v: Selector | tuple[Selector]) -> tuple[Selector, ...]: +T = TypeVar("T") + + +def ensure_tuple(v: Any) -> SelectionNormalized: if not isinstance(v, tuple): v = (v,) return v @@ -349,28 +405,28 @@ class ChunkProjection(NamedTuple): """ chunk_coords: ChunkCoords - chunk_selection: SliceSelection - out_selection: SliceSelection + chunk_selection: tuple[Selector, ...] | npt.NDArray[np.intp] + out_selection: tuple[Selector, ...] 
| npt.NDArray[np.intp] -def is_slice(s: slice) -> bool: +def is_slice(s: Any) -> TypeGuard[slice]: return isinstance(s, slice) -def is_contiguous_slice(s: slice) -> bool: +def is_contiguous_slice(s: Any) -> TypeGuard[slice]: return is_slice(s) and (s.step is None or s.step == 1) -def is_positive_slice(s: slice) -> bool: +def is_positive_slice(s: Any) -> TypeGuard[slice]: return is_slice(s) and (s.step is None or s.step >= 1) -def is_contiguous_selection(selection: Selection) -> bool: +def is_contiguous_selection(selection: Any) -> TypeGuard[slice]: selection = ensure_tuple(selection) return all((is_integer_array(s) or is_contiguous_slice(s) or s == Ellipsis) for s in selection) -def is_basic_selection(selection: Selection) -> bool: +def is_basic_selection(selection: Any) -> TypeGuard[BasicSelection]: selection = ensure_tuple(selection) return all(is_integer(s) or is_positive_slice(s) for s in selection) @@ -383,17 +439,20 @@ class BasicIndexer(Indexer): def __init__( self, - selection: Selection, + selection: BasicSelection, shape: ChunkCoords, chunk_grid: ChunkGrid, ): chunk_shape = get_chunk_shape(chunk_grid) # handle ellipsis - selection = replace_ellipsis(selection, shape) + selection_normalized = replace_ellipsis(selection, shape) # setup per-dimension indexers - dim_indexers = [] - for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + dim_indexers: list[IntDimIndexer | SliceDimIndexer] = [] + for dim_sel, dim_len, dim_chunk_len in zip( + selection_normalized, shape, chunk_shape, strict=True + ): + dim_indexer: IntDimIndexer | SliceDimIndexer if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -437,7 +496,7 @@ class BoolArrayDimIndexer: chunk_nitems: npt.NDArray[Any] chunk_nitems_cumsum: npt.NDArray[Any] nitems: int - dim_chunk_ixs: int + dim_chunk_ixs: npt.NDArray[np.intp] def __init__(self, dim_sel: npt.NDArray[np.bool_], dim_len: int, dim_chunk_len: int): # check number of dimensions @@ 
-518,13 +577,13 @@ def check(a: npt.NDArray[Any]) -> Order: return order -def wraparound_indices(x, dim_len): +def wraparound_indices(x: npt.NDArray[Any], dim_len: int) -> None: loc_neg = x < 0 if np.any(loc_neg): x[loc_neg] = x[loc_neg] + dim_len -def boundscheck_indices(x, dim_len): +def boundscheck_indices(x: npt.NDArray[Any], dim_len: int) -> None: if np.any(x < 0) or np.any(x >= dim_len): raise BoundsCheckError(dim_len) @@ -538,20 +597,20 @@ class IntArrayDimIndexer: nchunks: int nitems: int order: Order - dim_sel: int - dim_out_sel: int + dim_sel: npt.NDArray[np.intp] + dim_out_sel: npt.NDArray[np.intp] chunk_nitems: int dim_chunk_ixs: npt.NDArray[np.intp] chunk_nitems_cumsum: npt.NDArray[np.intp] def __init__( self, - dim_sel: int, + dim_sel: npt.NDArray[np.intp], dim_len: int, dim_chunk_len: int, - wraparound=True, - boundscheck=True, - order=Order.UNKNOWN, + wraparound: bool = True, + boundscheck: bool = True, + order: Order = Order.UNKNOWN, ): # ensure 1d array dim_sel = np.asanyarray(dim_sel) @@ -614,6 +673,7 @@ def __init__( def __iter__(self) -> Iterator[ChunkDimProjection]: for dim_chunk_ix in self.dim_chunk_ixs: + dim_out_sel: slice | npt.NDArray[np.intp] # find region in output if dim_chunk_ix == 0: start = 0 @@ -632,11 +692,11 @@ def __iter__(self) -> Iterator[ChunkDimProjection]: yield ChunkDimProjection(dim_chunk_ix, dim_chunk_sel, dim_out_sel) -def slice_to_range(s: slice, l: int): # noqa: E741 - return range(*s.indices(l)) +def slice_to_range(s: slice, length: int) -> range: + return range(*s.indices(length)) -def ix_(selection: Selection, shape: ChunkCoords): +def ix_(selection: Any, shape: ChunkCoords) -> npt.NDArray[np.intp]: """Convert an orthogonal selection to a numpy advanced (fancy) selection, like numpy.ix_ but with support for slices and single ints.""" @@ -659,7 +719,7 @@ def ix_(selection: Selection, shape: ChunkCoords): return selection -def oindex(a: npt.NDArray[Any], selection: Selection): +def oindex(a: npt.NDArray[Any], 
selection: Selection) -> npt.NDArray[Any]: """Implementation of orthogonal indexing with slices and ints.""" selection = replace_ellipsis(selection, a.shape) drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) @@ -670,17 +730,16 @@ def oindex(a: npt.NDArray[Any], selection: Selection): return result -def oindex_set(a: npt.NDArray[Any], selection: Selection, value): +def oindex_set(a: npt.NDArray[Any], selection: Selection, value: Any) -> None: selection = replace_ellipsis(selection, a.shape) drop_axes = tuple(i for i, s in enumerate(selection) if is_integer(s)) selection = ix_(selection, a.shape) if not np.isscalar(value) and drop_axes: value = np.asanyarray(value) - value_selection = [slice(None)] * len(a.shape) + value_selection: list[Selector | None] = [slice(None)] * len(a.shape) for i in drop_axes: value_selection[i] = np.newaxis - value_selection = tuple(value_selection) - value = value[value_selection] + value = value[tuple(value_selection)] a[selection] = value @@ -702,8 +761,11 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr selection = replace_lists(selection) # setup per-dimension indexers - dim_indexers = [] + dim_indexers: list[ + IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer + ] = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, shape, chunk_shape, strict=True): + dim_indexer: IntDimIndexer | SliceDimIndexer | IntArrayDimIndexer | BoolArrayDimIndexer if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -747,8 +809,10 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) - chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) - out_selection = tuple( + chunk_selection: tuple[Selector, ...] 
| npt.NDArray[Any] = tuple( + p.dim_chunk_sel for p in dim_projections + ) + out_selection: tuple[Selector, ...] | npt.NDArray[Any] = tuple( p.dim_out_sel for p in dim_projections if p.dim_out_sel is not None ) @@ -771,17 +835,21 @@ def __iter__(self) -> Iterator[ChunkProjection]: class OIndex: array: Array - def __getitem__(self, selection: Selection) -> NDArrayLike: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_orthogonal_selection(selection, fields=fields) + def __getitem__(self, selection: OrthogonalSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.get_orthogonal_selection( + cast(OrthogonalSelection, new_selection), fields=fields + ) - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_orthogonal_selection(selection, value, fields=fields) + def __setitem__(self, selection: OrthogonalSelection, value: NDArrayLike) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.set_orthogonal_selection( + cast(OrthogonalSelection, new_selection), value, fields=fields + ) @dataclass(frozen=True) @@ -790,18 +858,20 @@ class BlockIndexer(Indexer): shape: ChunkCoords drop_axes: None - def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + def __init__(self, selection: BlockSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) # handle ellipsis - selection = replace_ellipsis(selection, shape) + selection_normalized = replace_ellipsis(selection, shape) # normalize list to array - selection = 
replace_lists(selection) + selection_normalized = replace_lists(selection_normalized) # setup per-dimension indexers dim_indexers = [] - for dim_sel, dim_len, dim_chunk_size in zip(selection, shape, chunk_shape, strict=True): + for dim_sel, dim_len, dim_chunk_size in zip( + selection_normalized, shape, chunk_shape, strict=True + ): dim_numchunks = int(np.ceil(dim_len / dim_chunk_size)) if is_integer(dim_sel): @@ -867,26 +937,30 @@ def __iter__(self) -> Iterator[ChunkProjection]: class BlockIndex: array: Array - def __getitem__(self, selection: Selection) -> NDArrayLike: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.get_block_selection(selection, fields=fields) - - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - return self.array.set_block_selection(selection, value, fields=fields) + def __getitem__(self, selection: BlockSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.get_block_selection(cast(BlockSelection, new_selection), fields=fields) + + def __setitem__(self, selection: BlockSelection, value: NDArrayLike) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + return self.array.set_block_selection( + cast(BlockSelection, new_selection), value, fields=fields + ) -def is_coordinate_selection(selection: Selection, shape: ChunkCoords) -> bool: +def is_coordinate_selection( + selection: Selection, shape: ChunkCoords +) -> TypeGuard[CoordinateSelection]: return (len(selection) == len(shape)) and all( is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection ) -def 
is_mask_selection(selection: Selection, shape: ChunkCoords) -> bool: +def is_mask_selection(selection: Selection, shape: ChunkCoords) -> TypeGuard[MaskSelection]: return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape @@ -902,7 +976,7 @@ class CoordinateIndexer(Indexer): chunk_shape: ChunkCoords drop_axes: None - def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) if shape == (): @@ -1009,7 +1083,7 @@ def __iter__(self) -> Iterator[ChunkProjection]: @dataclass(frozen=True) class MaskIndexer(CoordinateIndexer): - def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): + def __init__(self, selection: MaskSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): # some initial normalization selection = ensure_tuple(selection) selection = replace_lists(selection) @@ -1032,30 +1106,36 @@ def __init__(self, selection, shape: ChunkCoords, chunk_grid: ChunkGrid): class VIndex: array: Array - def __getitem__(self, selection: Selection) -> NDArrayLike: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array.shape): - return self.array.get_coordinate_selection(selection, fields=fields) - elif is_mask_selection(selection, self.array.shape): - return self.array.get_mask_selection(selection, fields=fields) + def __getitem__(self, selection: CoordinateSelection | MaskSelection) -> NDArrayLike: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, self.array.shape): + return self.array.get_coordinate_selection( + cast(CoordinateSelection, new_selection), fields=fields + ) + elif is_mask_selection(new_selection, 
self.array.shape): + return self.array.get_mask_selection(cast(MaskSelection, new_selection), fields=fields) else: - raise VindexInvalidSelectionError(selection) - - def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: - fields, selection = pop_fields(selection) - selection = ensure_tuple(selection) - selection = replace_lists(selection) - if is_coordinate_selection(selection, self.array.shape): - self.array.set_coordinate_selection(selection, value, fields=fields) - elif is_mask_selection(selection, self.array.shape): - self.array.set_mask_selection(selection, value, fields=fields) + raise VindexInvalidSelectionError(new_selection) + + def __setitem__( + self, selection: CoordinateSelection | MaskSelection, value: NDArrayLike + ) -> None: + fields, new_selection = pop_fields(selection) + new_selection = ensure_tuple(new_selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, self.array.shape): + self.array.set_coordinate_selection( + cast(CoordinateSelection, new_selection), value, fields=fields + ) + elif is_mask_selection(new_selection, self.array.shape): + self.array.set_mask_selection(cast(MaskSelection, new_selection), value, fields=fields) else: - raise VindexInvalidSelectionError(selection) + raise VindexInvalidSelectionError(new_selection) -def check_fields(fields: Fields, dtype: npt.DTypeLike) -> npt.DTypeLike: +def check_fields(fields: Fields, dtype: np.dtype[Any]) -> np.dtype[Any]: # early out if fields is None: return dtype @@ -1109,8 +1189,8 @@ def pop_fields(selection: Selection) -> tuple[Fields | None, Selection]: return fields, selection -def make_slice_selection(selection: Selection) -> list[int | slice]: - ls = [] +def make_slice_selection(selection: Any) -> list[int | slice]: + ls: list[int | slice] = [] for dim_selection in selection: if is_integer(dim_selection): ls.append(slice(int(dim_selection), int(dim_selection) + 1, 1)) @@ -1173,3 +1253,20 @@ def is_total_slice(item: 
Selection, shape: ChunkCoords) -> bool: ) else: raise TypeError(f"expected slice or tuple of slices, found {item!r}") + + +def get_indexer(selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid) -> Indexer: + fields, pure_selection = pop_fields(selection) + if is_pure_fancy_indexing(pure_selection, len(shape)): + new_selection = ensure_tuple(selection) + new_selection = replace_lists(new_selection) + if is_coordinate_selection(new_selection, shape): + return CoordinateIndexer(cast(CoordinateSelection, selection), shape, chunk_grid) + elif is_mask_selection(new_selection, shape): + return MaskIndexer(cast(MaskSelection, selection), shape, chunk_grid) + else: + raise VindexInvalidSelectionError(new_selection) + elif is_pure_orthogonal_indexing(pure_selection, len(shape)): + return OrthogonalIndexer(cast(OrthogonalSelection, selection), shape, chunk_grid) + else: + return BasicIndexer(cast(BasicSelection, selection), shape, chunk_grid) From e25a2954fd019e2fdb4d3862ea55c6457820bc48 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Sat, 1 Jun 2024 13:43:55 +0200 Subject: [PATCH 07/13] narrow the type of drop_axes to just tuple[int, ...] (#1938) --- src/zarr/abc/codec.py | 4 ++-- src/zarr/codecs/pipeline.py | 24 +++++++++++++----------- src/zarr/indexing.py | 18 +++++++++--------- src/zarr/v2/indexing.py | 8 ++++---- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 682f445547..0836d878ae 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -345,7 +345,7 @@ async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] = (), ) -> None: """Reads chunk data from the store, decodes it and writes it into an output array. Partial decoding may be utilized if the codecs and stores support it. 
@@ -367,7 +367,7 @@ async def write( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] = (), ) -> None: """Encodes chunk data and writes it to the store. Merges with existing chunk data by reading first, if necessary. diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 33ef4ad8d0..65702ee232 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -295,7 +295,7 @@ async def read_batch( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] = (), ) -> None: if self.supports_partial_decode: chunk_array_batch = await self.decode_partial_batch( @@ -340,7 +340,7 @@ async def write_batch( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] 
= (), ) -> None: if self.supports_partial_encode: await self.encode_partial_batch( @@ -381,6 +381,7 @@ def _merge_chunk_array( out_selection: SelectorTuple, chunk_spec: ArraySpec, chunk_selection: SelectorTuple, + drop_axes: tuple[int, ...], ) -> NDBuffer: if ( is_total_slice(chunk_selection, chunk_spec.shape) @@ -403,17 +404,18 @@ def _merge_chunk_array( else: chunk_value = value[out_selection] # handle missing singleton dimensions - if drop_axes: - item = [slice(None)] * chunk_spec.ndim - for a in drop_axes: - item[a] = newaxis # TODO replace with agnostic newaxis - item = tuple(item) - chunk_value = chunk_value[item] + item = tuple( + newaxis if idx in drop_axes else slice(None) + for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] chunk_array[chunk_selection] = chunk_value return chunk_array chunk_array_batch = [ - _merge_chunk_array(chunk_array, value, out_selection, chunk_spec, chunk_selection) + _merge_chunk_array( + chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes + ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( chunk_array_batch, batch_info, strict=False ) @@ -476,7 +478,7 @@ async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple]], out: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] = (), ) -> None: await concurrent_map( [ @@ -491,7 +493,7 @@ async def write( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], value: NDBuffer, - drop_axes: tuple[int, ...] | None = None, + drop_axes: tuple[int, ...] 
= (), ) -> None: await concurrent_map( [ diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index a188b0902b..9d8b6813d6 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -88,7 +88,7 @@ def err_too_many_indices(selection: Any, shape: ChunkCoords) -> None: @runtime_checkable class Indexer(Protocol): shape: ChunkCoords - drop_axes: ChunkCoords | None + drop_axes: ChunkCoords def __iter__(self) -> Iterator[ChunkProjection]: ... @@ -435,7 +435,7 @@ def is_basic_selection(selection: Any) -> TypeGuard[BasicSelection]: class BasicIndexer(Indexer): dim_indexers: list[IntDimIndexer | SliceDimIndexer] shape: ChunkCoords - drop_axes: None + drop_axes: ChunkCoords def __init__( self, @@ -473,7 +473,7 @@ def __init__( "shape", tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)), ) - object.__setattr__(self, "drop_axes", None) + object.__setattr__(self, "drop_axes", ()) def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): @@ -749,7 +749,7 @@ class OrthogonalIndexer(Indexer): shape: ChunkCoords chunk_shape: ChunkCoords is_advanced: bool - drop_axes: tuple[int, ...] | None + drop_axes: tuple[int, ...] 
def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) @@ -798,7 +798,7 @@ def __init__(self, selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGr if isinstance(dim_indexer, IntDimIndexer) ) else: - drop_axes = None + drop_axes = () object.__setattr__(self, "dim_indexers", dim_indexers) object.__setattr__(self, "shape", shape) @@ -856,7 +856,7 @@ def __setitem__(self, selection: OrthogonalSelection, value: NDArrayLike) -> Non class BlockIndexer(Indexer): dim_indexers: list[SliceDimIndexer] shape: ChunkCoords - drop_axes: None + drop_axes: ChunkCoords def __init__(self, selection: BlockSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) @@ -920,7 +920,7 @@ def __init__(self, selection: BlockSelection, shape: ChunkCoords, chunk_grid: Ch object.__setattr__(self, "dim_indexers", dim_indexers) object.__setattr__(self, "shape", shape) - object.__setattr__(self, "drop_axes", None) + object.__setattr__(self, "drop_axes", ()) def __iter__(self) -> Iterator[ChunkProjection]: for dim_projections in itertools.product(*self.dim_indexers): @@ -974,7 +974,7 @@ class CoordinateIndexer(Indexer): chunk_mixs: tuple[npt.NDArray[np.intp], ...] 
shape: ChunkCoords chunk_shape: ChunkCoords - drop_axes: None + drop_axes: ChunkCoords def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) @@ -1053,7 +1053,7 @@ def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_gri object.__setattr__(self, "chunk_mixs", chunk_mixs) object.__setattr__(self, "chunk_shape", chunk_shape) object.__setattr__(self, "shape", shape) - object.__setattr__(self, "drop_axes", None) + object.__setattr__(self, "drop_axes", ()) def __iter__(self) -> Iterator[ChunkProjection]: # iterate over chunks diff --git a/src/zarr/v2/indexing.py b/src/zarr/v2/indexing.py index 0e266ad908..242e9ae849 100644 --- a/src/zarr/v2/indexing.py +++ b/src/zarr/v2/indexing.py @@ -346,7 +346,7 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers if not isinstance(s, IntDimIndexer)) - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -625,7 +625,7 @@ def __init__(self, selection, array): if isinstance(dim_indexer, IntDimIndexer) ) else: - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -724,7 +724,7 @@ def __init__(self, selection, array): self.dim_indexers = dim_indexers self.shape = tuple(s.nitems for s in self.dim_indexers) - self.drop_axes = None + self.drop_axes = () def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): @@ -823,7 +823,7 @@ def __init__(self, selection, array): self.selection = selection self.sel_sort = sel_sort self.shape = selection[0].shape if selection[0].shape else (1,) - self.drop_axes = None + self.drop_axes = () self.array = array # precompute number of selected items for each chunk From 7d05acecb589f8d582cf3c5173af214927af0d13 Mon Sep 17 00:00:00 2001 From: Norman Rzepka 
Date: Sat, 1 Jun 2024 20:26:38 +0200 Subject: [PATCH 08/13] merge --- src/zarr/array.py | 14 +++++--------- src/zarr/codecs/pipeline.py | 6 +++--- src/zarr/indexing.py | 16 ++++++---------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 260fb45f6b..ad30839e27 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -647,9 +647,7 @@ def __getitem__(self, selection: Selection) -> NDArrayLike: if is_pure_fancy_indexing(pure_selection, self.ndim): result = self.vindex[cast(CoordinateSelection | MaskSelection, selection)] elif is_pure_orthogonal_indexing(pure_selection, self.ndim): - result = self.get_orthogonal_selection( - cast(OrthogonalSelection, pure_selection), fields=fields - ) + result = self.get_orthogonal_selection(pure_selection, fields=fields) else: result = self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) return result @@ -659,9 +657,7 @@ def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: if is_pure_fancy_indexing(pure_selection, self.ndim): self.vindex[cast(CoordinateSelection | MaskSelection, selection)] = value elif is_pure_orthogonal_indexing(pure_selection, self.ndim): - self.set_orthogonal_selection( - cast(OrthogonalSelection, pure_selection), value, fields=fields - ) + self.set_orthogonal_selection(pure_selection, value, fields=fields) else: self.set_basic_selection(cast(BasicSelection, pure_selection), value, fields=fields) @@ -722,11 +718,11 @@ def get_coordinate_selection( fields: Fields | None = None, ) -> NDArrayLike: indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out = sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) + out_array = sync(self._async_array._get_selection(indexer=indexer, out=out, fields=fields)) # restore shape - out = out.reshape(indexer.sel_shape) - return out + out_array = out_array.reshape(indexer.sel_shape) + return out_array def 
set_coordinate_selection( self, selection: CoordinateSelection, value: NDArrayLike, fields: Fields | None = None diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 65702ee232..a45116d19c 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -6,8 +6,6 @@ from typing import TYPE_CHECKING, TypeVar from warnings import warn -from numpy import newaxis - from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -405,7 +403,9 @@ def _merge_chunk_array( chunk_value = value[out_selection] # handle missing singleton dimensions item = tuple( - newaxis if idx in drop_axes else slice(None) + None # equivalent to np.newaxis + if idx in drop_axes + else slice(None) for idx in range(chunk_spec.ndim) ) chunk_value = chunk_value[item] diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 9d8b6813d6..a5d5f73a2d 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -54,7 +54,7 @@ | OrthogonalSelectionNormalized ) Selector = int | slice | npt.NDArray[np.intp | np.bool_] -SelectorTuple = tuple[Selector, ...] +SelectorTuple = tuple[Selector, ...] | npt.NDArray[np.intp] Fields = str | list[str] | tuple[str, ...] 
@@ -1111,11 +1111,9 @@ def __getitem__(self, selection: CoordinateSelection | MaskSelection) -> NDArray new_selection = ensure_tuple(new_selection) new_selection = replace_lists(new_selection) if is_coordinate_selection(new_selection, self.array.shape): - return self.array.get_coordinate_selection( - cast(CoordinateSelection, new_selection), fields=fields - ) + return self.array.get_coordinate_selection(new_selection, fields=fields) elif is_mask_selection(new_selection, self.array.shape): - return self.array.get_mask_selection(cast(MaskSelection, new_selection), fields=fields) + return self.array.get_mask_selection(new_selection, fields=fields) else: raise VindexInvalidSelectionError(new_selection) @@ -1126,11 +1124,9 @@ def __setitem__( new_selection = ensure_tuple(new_selection) new_selection = replace_lists(new_selection) if is_coordinate_selection(new_selection, self.array.shape): - self.array.set_coordinate_selection( - cast(CoordinateSelection, new_selection), value, fields=fields - ) + self.array.set_coordinate_selection(new_selection, value, fields=fields) elif is_mask_selection(new_selection, self.array.shape): - self.array.set_mask_selection(cast(MaskSelection, new_selection), value, fields=fields) + self.array.set_mask_selection(new_selection, value, fields=fields) else: raise VindexInvalidSelectionError(new_selection) @@ -1162,7 +1158,7 @@ def check_fields(fields: Fields, dtype: np.dtype[Any]) -> np.dtype[Any]: return dtype -def check_no_multi_fields(fields: Fields) -> Fields: +def check_no_multi_fields(fields: Fields | None) -> Fields | None: if isinstance(fields, list): if len(fields) == 1: return fields[0] From 5255dab69c1fe8b230d7cce7569d93ff34c5e762 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 1 Jun 2024 20:41:15 +0200 Subject: [PATCH 09/13] fix --- src/zarr/codecs/pipeline.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 
a45116d19c..5d3139df3f 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -328,7 +328,7 @@ async def read_batch( ): if chunk_array is not None: tmp = chunk_array[chunk_selection] - if drop_axes: + if drop_axes != (): tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: @@ -402,13 +402,14 @@ def _merge_chunk_array( else: chunk_value = value[out_selection] # handle missing singleton dimensions - item = tuple( - None # equivalent to np.newaxis - if idx in drop_axes - else slice(None) - for idx in range(chunk_spec.ndim) - ) - chunk_value = chunk_value[item] + if drop_axes != (): + item = tuple( + None # equivalent to np.newaxis + if idx in drop_axes + else slice(None) + for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] chunk_array[chunk_selection] = chunk_value return chunk_array From e2a4aff1fc73abd7f206396e8aa25ff4351a1053 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 1 Jun 2024 20:59:20 +0200 Subject: [PATCH 10/13] pr feedback --- src/zarr/codecs/pipeline.py | 80 ++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index f736fc0741..ada4ae23f9 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -333,6 +333,44 @@ async def read_batch( else: out[out_selection] = chunk_spec.fill_value + def _merge_chunk_array( + self, + existing_chunk_array: NDBuffer | None, + value: NDBuffer, + out_selection: SelectorTuple, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + drop_axes: tuple[int, ...], + ) -> NDBuffer: + if is_total_slice(chunk_selection, chunk_spec.shape) and value.shape == chunk_spec.shape: + return value + if existing_chunk_array is None: + chunk_array = NDBuffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype, + order=chunk_spec.order, + fill_value=chunk_spec.fill_value, + ) + else: + chunk_array = existing_chunk_array.copy() # make a writable 
copy + if chunk_selection == (): + chunk_value = value + elif is_scalar(value.as_ndarray_like(), chunk_spec.dtype): + chunk_value = value + else: + chunk_value = value[out_selection] + # handle missing singleton dimensions + if drop_axes != (): + item = tuple( + None # equivalent to np.newaxis + if idx in drop_axes + else slice(None) + for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value + return chunk_array + async def write_batch( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple]], @@ -372,48 +410,8 @@ async def _read_key(byte_setter: ByteSetter | None) -> Buffer | None: ], ) - def _merge_chunk_array( - existing_chunk_array: NDBuffer | None, - value: NDBuffer, - out_selection: SelectorTuple, - chunk_spec: ArraySpec, - chunk_selection: SelectorTuple, - drop_axes: tuple[int, ...], - ) -> NDBuffer: - if ( - is_total_slice(chunk_selection, chunk_spec.shape) - and value.shape == chunk_spec.shape - ): - return value - if existing_chunk_array is None: - chunk_array = NDBuffer.create( - shape=chunk_spec.shape, - dtype=chunk_spec.dtype, - order=chunk_spec.order, - fill_value=chunk_spec.fill_value, - ) - else: - chunk_array = existing_chunk_array.copy() # make a writable copy - if chunk_selection == (): - chunk_value = value - elif is_scalar(value.as_ndarray_like(), chunk_spec.dtype): - chunk_value = value - else: - chunk_value = value[out_selection] - # handle missing singleton dimensions - if drop_axes != (): - item = tuple( - None # equivalent to np.newaxis - if idx in drop_axes - else slice(None) - for idx in range(chunk_spec.ndim) - ) - chunk_value = chunk_value[item] - chunk_array[chunk_selection] = chunk_value - return chunk_array - chunk_array_batch = [ - _merge_chunk_array( + self._merge_chunk_array( chunk_array, value, out_selection, chunk_spec, chunk_selection, drop_axes ) for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( From 
f124017524b861c7fa875f6f7a2b6cc67c6afdcf Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 1 Jun 2024 22:22:21 +0200 Subject: [PATCH 11/13] fixes all types --- src/zarr/array.py | 13 +++-- src/zarr/indexing.py | 121 +++++++++++++++++++++++++------------------ 2 files changed, 78 insertions(+), 56 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 32cb129f79..59f31e5d0b 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -645,12 +645,11 @@ def order(self) -> Literal["C", "F"]: def __getitem__(self, selection: Selection) -> NDArrayLike: fields, pure_selection = pop_fields(selection) if is_pure_fancy_indexing(pure_selection, self.ndim): - result = self.vindex[cast(CoordinateSelection | MaskSelection, selection)] + return self.vindex[cast(CoordinateSelection | MaskSelection, selection)] elif is_pure_orthogonal_indexing(pure_selection, self.ndim): - result = self.get_orthogonal_selection(pure_selection, fields=fields) + return self.get_orthogonal_selection(pure_selection, fields=fields) else: - result = self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) - return result + return self.get_basic_selection(cast(BasicSelection, pure_selection), fields=fields) def __setitem__(self, selection: Selection, value: NDArrayLike) -> None: fields, pure_selection = pop_fields(selection) @@ -763,15 +762,15 @@ def set_block_selection( sync(self._async_array._set_selection(indexer, value, fields=fields)) @property - def vindex(self) -> Any: + def vindex(self) -> VIndex: return VIndex(self) @property - def oindex(self) -> Any: + def oindex(self) -> OIndex: return OIndex(self) @property - def blocks(self) -> Any: + def blocks(self) -> BlockIndex: return BlockIndex(self) def resize(self, new_shape: ChunkCoords) -> Array: diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index be4a911f50..98130fe0cd 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -4,7 +4,7 @@ import math import numbers import operator -from 
collections.abc import Iterator +from collections.abc import Iterator, Sequence from dataclasses import dataclass from enum import Enum from functools import reduce @@ -34,12 +34,14 @@ BasicSelectorTuple = tuple[BasicSelector, ...] BasicSelection = BasicSelector | BasicSelectorTuple BasicSelectionNormalized = tuple[int | slice, ...] -CoordinateSelection = npt.NDArray[np.intp] +CoordinateSelector = list[int] | npt.NDArray[np.intp] +CoordinateSelection = CoordinateSelector | tuple[CoordinateSelector, ...] +CoordinateSelectionNormalized = tuple[npt.NDArray[np.intp], ...] BlockSelector = int | slice BlockSelection = BlockSelector | tuple[BlockSelector, ...] BlockSelectionNormalized = tuple[BlockSelector, ...] MaskSelection = npt.NDArray[np.bool_] -OrthogonalSelector = int | slice | npt.NDArray[np.intp | np.bool_] +OrthogonalSelector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] OrthogonalSelection = OrthogonalSelector | tuple[OrthogonalSelector, ...] OrthogonalSelectionNormalized = tuple[OrthogonalSelector, ...] @@ -48,13 +50,14 @@ ) SelectionNormalized = ( BasicSelectionNormalized - | CoordinateSelection + | CoordinateSelectionNormalized | BlockSelectionNormalized | MaskSelection | OrthogonalSelectionNormalized ) -Selector = int | slice | npt.NDArray[np.intp | np.bool_] -SelectorTuple = tuple[Selector, ...] | npt.NDArray[np.intp] +Selector = int | slice | npt.NDArray[np.intp] | npt.NDArray[np.bool_] +SelectionWithFields = Selection | str | Sequence[str] +SelectorTuple = tuple[Selector, ...] | npt.NDArray[np.intp] | slice Fields = str | list[str] | tuple[str, ...] 
@@ -370,7 +373,7 @@ def replace_ellipsis(selection: Any, shape: ChunkCoords) -> SelectionNormalized: # check selection not too long check_selection_length(selection, shape) - return selection + return cast(SelectionNormalized, selection) def replace_lists(selection: SelectionNormalized) -> SelectionNormalized: @@ -385,7 +388,7 @@ def replace_lists(selection: SelectionNormalized) -> SelectionNormalized: def ensure_tuple(v: Any) -> SelectionNormalized: if not isinstance(v, tuple): v = (v,) - return v + return cast(SelectionNormalized, v) class ChunkProjection(NamedTuple): @@ -406,7 +409,7 @@ class ChunkProjection(NamedTuple): chunk_coords: ChunkCoords chunk_selection: tuple[Selector, ...] | npt.NDArray[np.intp] - out_selection: tuple[Selector, ...] | npt.NDArray[np.intp] + out_selection: tuple[Selector, ...] | npt.NDArray[np.intp] | slice def is_slice(s: Any) -> TypeGuard[slice]: @@ -716,7 +719,7 @@ def ix_(selection: Any, shape: ChunkCoords) -> npt.NDArray[np.intp]: # now get numpy to convert to a coordinate selection selection = np.ix_(*selection) - return selection + return cast(npt.NDArray[np.intp], selection) def oindex(a: npt.NDArray[Any], selection: Selection) -> npt.NDArray[Any]: @@ -953,21 +956,28 @@ def __setitem__(self, selection: BlockSelection, value: NDArrayLike) -> None: def is_coordinate_selection( - selection: Selection, shape: ChunkCoords -) -> TypeGuard[CoordinateSelection]: - return (len(selection) == len(shape)) and all( - is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection + selection: SelectionNormalized, shape: ChunkCoords +) -> TypeGuard[CoordinateSelectionNormalized]: + return ( + isinstance(selection, tuple) + and len(selection) == len(shape) + and all(is_integer(dim_sel) or is_integer_array(dim_sel) for dim_sel in selection) ) def is_mask_selection(selection: Selection, shape: ChunkCoords) -> TypeGuard[MaskSelection]: - return len(selection) == 1 and is_bool_array(selection[0]) and selection[0].shape == shape + return 
( + isinstance(selection, tuple) + and len(selection) == 1 + and is_bool_array(selection[0]) + and selection[0].shape == shape + ) @dataclass(frozen=True) class CoordinateIndexer(Indexer): sel_shape: ChunkCoords - selection: Selection + selection: CoordinateSelectionNormalized sel_sort: npt.NDArray[np.intp] | None chunk_nitems_cumsum: npt.NDArray[np.intp] chunk_rixs: npt.NDArray[np.intp] @@ -979,6 +989,7 @@ class CoordinateIndexer(Indexer): def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): chunk_shape = get_chunk_shape(chunk_grid) + cdata_shape: ChunkCoords if shape == (): cdata_shape = (1,) else: @@ -986,12 +997,16 @@ def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_gri nchunks = reduce(operator.mul, cdata_shape, 1) # some initial normalization - selection = ensure_tuple(selection) - selection = tuple([i] if is_integer(i) else i for i in selection) - selection = replace_lists(selection) + selection_normalized = cast(CoordinateSelectionNormalized, ensure_tuple(selection)) + selection_normalized = tuple( + np.asarray([i]) if is_integer(i) else i for i in selection_normalized + ) + selection_normalized = cast( + CoordinateSelectionNormalized, replace_lists(selection_normalized) + ) # validation - if not is_coordinate_selection(selection, shape): + if not is_coordinate_selection(selection_normalized, shape): raise IndexError( "invalid coordinate selection; expected one integer " "(coordinate) array per dimension of the target array, " @@ -999,7 +1014,7 @@ def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_gri ) # handle wraparound, boundscheck - for dim_sel, dim_len in zip(selection, shape, strict=True): + for dim_sel, dim_len in zip(selection_normalized, shape, strict=True): # handle wraparound wraparound_indices(dim_sel, dim_len) @@ -1009,32 +1024,36 @@ def __init__(self, selection: CoordinateSelection, shape: ChunkCoords, chunk_gri # compute chunk index for each point 
in the selection chunks_multi_index = tuple( dim_sel // dim_chunk_len - for (dim_sel, dim_chunk_len) in zip(selection, chunk_shape, strict=True) + for (dim_sel, dim_chunk_len) in zip(selection_normalized, chunk_shape, strict=True) ) # broadcast selection - this will raise error if array dimensions don't match - selection = np.broadcast_arrays(*selection) - chunks_multi_index = np.broadcast_arrays(*chunks_multi_index) + selection_broadcast = tuple(np.broadcast_arrays(*selection_normalized)) + chunks_multi_index_broadcast = np.broadcast_arrays(*chunks_multi_index) # remember shape of selection, because we will flatten indices for processing - sel_shape = selection[0].shape if selection[0].shape else (1,) + sel_shape = selection_broadcast[0].shape if selection_broadcast[0].shape else (1,) # flatten selection - selection = [dim_sel.reshape(-1) for dim_sel in selection] - chunks_multi_index = [dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index] + selection_broadcast = tuple(dim_sel.reshape(-1) for dim_sel in selection_broadcast) + chunks_multi_index_broadcast = [ + dim_chunks.reshape(-1) for dim_chunks in chunks_multi_index_broadcast + ] # ravel chunk indices - chunks_raveled_indices = np.ravel_multi_index(chunks_multi_index, dims=cdata_shape) + chunks_raveled_indices = np.ravel_multi_index( + chunks_multi_index_broadcast, dims=cdata_shape + ) # group points by chunk if np.any(np.diff(chunks_raveled_indices) < 0): # optimisation, only sort if needed sel_sort = np.argsort(chunks_raveled_indices) - selection = tuple(dim_sel[sel_sort] for dim_sel in selection) + selection_broadcast = tuple(dim_sel[sel_sort] for dim_sel in selection_broadcast) else: sel_sort = None - shape = selection[0].shape if selection[0].shape else (1,) + shape = selection_broadcast[0].shape if selection_broadcast[0].shape else (1,) # precompute number of selected items for each chunk chunk_nitems = np.bincount(chunks_raveled_indices, minlength=nchunks) @@ -1046,7 +1065,7 @@ def __init__(self, 
selection: CoordinateSelection, shape: ChunkCoords, chunk_gri chunk_mixs = np.unravel_index(chunk_rixs, cdata_shape) object.__setattr__(self, "sel_shape", sel_shape) - object.__setattr__(self, "selection", selection) + object.__setattr__(self, "selection", selection_broadcast) object.__setattr__(self, "sel_sort", sel_sort) object.__setattr__(self, "chunk_nitems_cumsum", chunk_nitems_cumsum) object.__setattr__(self, "chunk_rixs", chunk_rixs) @@ -1064,6 +1083,7 @@ def __iter__(self) -> Iterator[ChunkProjection]: else: start = self.chunk_nitems_cumsum[chunk_rix - 1] stop = self.chunk_nitems_cumsum[chunk_rix] + out_selection: slice | npt.NDArray[np.intp] if self.sel_sort is None: out_selection = slice(start, stop) else: @@ -1085,21 +1105,21 @@ def __iter__(self) -> Iterator[ChunkProjection]: class MaskIndexer(CoordinateIndexer): def __init__(self, selection: MaskSelection, shape: ChunkCoords, chunk_grid: ChunkGrid): # some initial normalization - selection = ensure_tuple(selection) - selection = replace_lists(selection) + selection_normalized = cast(tuple[MaskSelection], ensure_tuple(selection)) + selection_normalized = cast(tuple[MaskSelection], replace_lists(selection_normalized)) # validation - if not is_mask_selection(selection, shape): + if not is_mask_selection(selection_normalized, shape): raise IndexError( "invalid mask selection; expected one Boolean (mask)" - f"array with the same shape as the target array, got {selection!r}" + f"array with the same shape as the target array, got {selection_normalized!r}" ) # convert to indices - selection = np.nonzero(selection[0]) + selection_indices = np.nonzero(selection_normalized[0]) # delegate the rest to superclass - super().__init__(selection, shape, chunk_grid) + super().__init__(selection_indices, shape, chunk_grid) @dataclass(frozen=True) @@ -1131,12 +1151,12 @@ def __setitem__( raise VindexInvalidSelectionError(new_selection) -def check_fields(fields: Fields, dtype: np.dtype[Any]) -> np.dtype[Any]: +def 
check_fields(fields: Fields | None, dtype: np.dtype[Any]) -> np.dtype[Any]: # early out if fields is None: return dtype # check type - if not isinstance(fields, Fields): + if not isinstance(fields, str | list | tuple): raise IndexError( f"'fields' argument must be a string or list of strings; found {type(fields)!r}" ) @@ -1167,22 +1187,23 @@ def check_no_multi_fields(fields: Fields | None) -> Fields | None: return fields -def pop_fields(selection: Selection) -> tuple[Fields | None, Selection]: +def pop_fields(selection: SelectionWithFields) -> tuple[Fields | None, Selection]: if isinstance(selection, str): # single field selection - fields = selection - selection = () + return selection, () elif not isinstance(selection, tuple): # single selection item, no fields - fields = None # leave selection as-is + return None, cast(Selection, selection) else: # multiple items, split fields from selection items - fields = [f for f in selection if isinstance(f, str)] + fields: Fields = [f for f in selection if isinstance(f, str)] fields = fields[0] if len(fields) == 1 else fields - selection = tuple(s for s in selection if not isinstance(s, str)) - selection = selection[0] if len(selection) == 1 else selection - return fields, selection + selection_tuple = tuple(s for s in selection if not isinstance(s, str)) + selection = cast( + Selection, selection_tuple[0] if len(selection_tuple) == 1 else selection_tuple + ) + return fields, selection def make_slice_selection(selection: Any) -> list[int | slice]: @@ -1249,8 +1270,10 @@ def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: raise TypeError(f"expected slice or tuple of slices, found {item!r}") -def get_indexer(selection: Selection, shape: ChunkCoords, chunk_grid: ChunkGrid) -> Indexer: - fields, pure_selection = pop_fields(selection) +def get_indexer( + selection: SelectionWithFields, shape: ChunkCoords, chunk_grid: ChunkGrid +) -> Indexer: + _, pure_selection = pop_fields(selection) if 
is_pure_fancy_indexing(pure_selection, len(shape)): new_selection = ensure_tuple(selection) new_selection = replace_lists(new_selection) From 332d5afb73c0a9e43afbd6e49d901a4f5a99d440 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sat, 1 Jun 2024 22:26:17 +0200 Subject: [PATCH 12/13] fix import --- src/zarr/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 59f31e5d0b..48ac27c469 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -52,6 +52,7 @@ OrthogonalIndexer, OrthogonalSelection, VIndex, + check_fields, check_no_multi_fields, is_pure_fancy_indexing, is_pure_orthogonal_indexing, @@ -61,7 +62,6 @@ from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import sync -from zarr.v2.indexing import check_fields def parse_array_metadata(data: Any) -> ArrayMetadata: From 1b55d960c0a1668d0c3cb3fd101faef16216a11c Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 2 Jun 2024 08:25:48 +0200 Subject: [PATCH 13/13] remove util.py --- tests/v3/test_indexing.py | 19 ++++++++++-- tests/v3/util.py | 64 --------------------------------------- 2 files changed, 17 insertions(+), 66 deletions(-) delete mode 100644 tests/v3/util.py diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py index 1143eda478..9ce485945b 100644 --- a/tests/v3/test_indexing.py +++ b/tests/v3/test_indexing.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections import Counter from collections.abc import Iterator from typing import Any from uuid import uuid4 @@ -23,8 +24,6 @@ from zarr.store.core import StorePath from zarr.store.memory import MemoryStore -from .util import CountingDict - @pytest.fixture def store() -> Iterator[Store]: @@ -47,6 +46,22 @@ def zarr_array_from_numpy_array( return z +class CountingDict(MemoryStore): + def __init__(self): + super().__init__(mode="w") + self.counter = Counter() + + 
async def get(self, key, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__getitem__", key_suffix] += 1 + return await super().get(key, byte_range) + + async def set(self, key, value, byte_range=None): + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__setitem__", key_suffix] += 1 + return await super().set(key, value, byte_range) + + def test_normalize_integer_selection(): assert 1 == normalize_integer_selection(1, 100) assert 99 == normalize_integer_selection(-1, 100) diff --git a/tests/v3/util.py b/tests/v3/util.py deleted file mode 100644 index d1838804f3..0000000000 --- a/tests/v3/util.py +++ /dev/null @@ -1,64 +0,0 @@ -import collections -import os -import tempfile - -import pytest - -from zarr.store.memory import MemoryStore - - -class CountingDict(MemoryStore): - def __init__(self): - super().__init__(mode="w") - self.counter = collections.Counter() - - async def get(self, key, byte_range=None): - key_suffix = "/".join(key.split("/")[1:]) - self.counter["__getitem__", key_suffix] += 1 - return await super().get(key, byte_range) - - async def set(self, key, value, byte_range=None): - key_suffix = "/".join(key.split("/")[1:]) - self.counter["__setitem__", key_suffix] += 1 - return await super().set(key, value, byte_range) - - -def skip_test_env_var(name): - """Checks for environment variables indicating whether tests requiring services should be run""" - value = os.environ.get(name, "0") - return pytest.mark.skipif(value == "0", reason="Tests not enabled via environment variable") - - -try: - import fsspec # noqa: F401 - - have_fsspec = True -except ImportError: # pragma: no cover - have_fsspec = False - - -def abs_container(): - import azure.storage.blob as asb - from azure.core.exceptions import ResourceExistsError - - URL = "http://127.0.0.1:10000" - ACCOUNT_NAME = "devstoreaccount1" - KEY = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" - CONN_STR = ( - 
f"DefaultEndpointsProtocol=http;AccountName={ACCOUNT_NAME};" - f"AccountKey={KEY};BlobEndpoint={URL}/{ACCOUNT_NAME};" - ) - - blob_service_client = asb.BlobServiceClient.from_connection_string(CONN_STR) - try: - container_client = blob_service_client.create_container("test") - except ResourceExistsError: - container_client = blob_service_client.get_container_client("test") - - return container_client - - -def mktemp(**kwargs): - f = tempfile.NamedTemporaryFile(**kwargs) - f.close() - return f.name