From 9405fda3c086679b02cd990f99cc7a359ebbb2cf Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 20 Feb 2024 11:25:39 +0100 Subject: [PATCH 01/21] merge --- src/zarr/abc/codec.py | 136 ++++++++++++- src/zarr/array.py | 166 ++++------------ src/zarr/codecs/batched_pipeline.py | 283 ++++++++++++++++++++++++++++ src/zarr/codecs/pipeline.py | 49 ++--- src/zarr/metadata.py | 11 +- 5 files changed, 483 insertions(+), 162 deletions(-) create mode 100644 src/zarr/codecs/batched_pipeline.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 1abc21b30b..366d388495 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,12 +1,12 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Awaitable, Callable, Iterable, Optional, Tuple, TypeVar import numpy as np from zarr.abc.metadata import Metadata -from zarr.common import ArraySpec +from zarr.common import ArraySpec, concurrent_map from zarr.store import StorePath @@ -16,6 +16,22 @@ from zarr.metadata import ArrayMetadata from zarr.config import RuntimeConfiguration +T = TypeVar("T") +U = TypeVar("U") + + +def noop_for_none( + func: Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[U]], +) -> Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[U]]: + async def wrap( + chunk: Optional[T], chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration + ) -> U: + if chunk is None: + return None + return await func(chunk, chunk_spec, runtime_configuration) + + return wrap + class Codec(Metadata): is_fixed_size: bool @@ -44,6 +60,20 @@ async def decode( ) -> np.ndarray: pass + async def decode_batch( + self, + chunk_arrays_and_specs: Iterable[Tuple[np.ndarray, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[np.ndarray]: + return await concurrent_map( + [ + (chunk_array, chunk_spec, runtime_configuration) + for chunk_array, chunk_spec in 
chunk_arrays_and_specs + ], + noop_for_none(self.decode), + runtime_configuration.concurrency, + ) + @abstractmethod async def encode( self, @@ -53,17 +83,45 @@ async def encode( ) -> Optional[np.ndarray]: pass + async def encode_batch( + self, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + return await concurrent_map( + [ + (chunk_array, chunk_spec, runtime_configuration) + for chunk_array, chunk_spec in chunk_arrays_and_specs + ], + noop_for_none(self.encode), + runtime_configuration.concurrency, + ) + class ArrayBytesCodec(Codec): @abstractmethod async def decode( self, - chunk_array: BytesLike, + chunk_bytes: BytesLike, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: pass + async def decode_batch( + self, + chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[np.ndarray]: + return await concurrent_map( + [ + (chunk_bytes, chunk_spec, runtime_configuration) + for chunk_bytes, chunk_spec in chunk_bytes_and_specs + ], + noop_for_none(self.decode), + runtime_configuration.concurrency, + ) + @abstractmethod async def encode( self, @@ -73,6 +131,20 @@ async def encode( ) -> Optional[BytesLike]: pass + async def encode_batch( + self, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[BytesLike]]: + return await concurrent_map( + [ + (chunk_array, chunk_spec, runtime_configuration) + for chunk_array, chunk_spec in chunk_arrays_and_specs + ], + noop_for_none(self.encode), + runtime_configuration.concurrency, + ) + class ArrayBytesCodecPartialDecodeMixin: @abstractmethod @@ -85,6 +157,20 @@ async def decode_partial( ) -> Optional[np.ndarray]: pass + async def decode_partial_batched( + self, + batch_info: Iterable[Tuple[StorePath, SliceSelection, 
ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + return await concurrent_map( + [ + (store_path, selection, chunk_spec, runtime_configuration) + for store_path, selection, chunk_spec in batch_info + ], + self.decode_partial, + runtime_configuration.concurrency, + ) + class ArrayBytesCodecPartialEncodeMixin: @abstractmethod @@ -98,17 +184,45 @@ async def encode_partial( ) -> None: pass + async def encode_partial_batched( + self, + batch_info: Iterable[Tuple[StorePath, np.ndarray, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + (store_path, chunk_array, selection, chunk_spec, runtime_configuration) + for store_path, chunk_array, selection, chunk_spec in batch_info + ], + self.encode_partial, + runtime_configuration.concurrency, + ) + class BytesBytesCodec(Codec): @abstractmethod async def decode( self, - chunk_array: BytesLike, + chunk_bytes: BytesLike, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> BytesLike: pass + async def decode_batch( + self, + chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[BytesLike]: + return await concurrent_map( + [ + (chunk_bytes, chunk_spec, runtime_configuration) + for chunk_bytes, chunk_spec in chunk_bytes_and_specs + ], + noop_for_none(self.decode), + runtime_configuration.concurrency, + ) + @abstractmethod async def encode( self, @@ -117,3 +231,17 @@ async def encode( runtime_configuration: RuntimeConfiguration, ) -> Optional[BytesLike]: pass + + async def encode_batch( + self, + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[BytesLike]]: + return await concurrent_map( + [ + (chunk_bytes, chunk_spec, runtime_configuration) + for chunk_bytes, chunk_spec in chunk_bytes_and_specs + ], + noop_for_none(self.encode), + 
runtime_configuration.concurrency, + ) diff --git a/src/zarr/array.py b/src/zarr/array.py index c1263230c0..ea50e3713f 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -1,3 +1,5 @@ +from __future__ import annotations + # Notes on what I've changed here: # 1. Split Array into AsyncArray and Array # 3. Added .size and .attrs methods @@ -8,7 +10,6 @@ # 1. Was splitting the array into two classes really necessary? # 2. Do we really need runtime_configuration? Specifically, the asyncio_loop seems problematic -from __future__ import annotations from dataclasses import dataclass, replace @@ -23,15 +24,13 @@ from zarr.codecs import BytesCodec from zarr.common import ( ZARR_JSON, - ArraySpec, ChunkCoords, Selection, - SliceSelection, concurrent_map, ) from zarr.config import RuntimeConfiguration -from zarr.indexing import BasicIndexer, all_chunk_coords, is_total_slice +from zarr.indexing import BasicIndexer, all_chunk_coords from zarr.chunk_grids import RegularChunkGrid from zarr.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding from zarr.metadata import ArrayMetadata @@ -208,13 +207,19 @@ async def getitem(self, selection: Selection): ) # reading chunks and decoding them - await concurrent_map( + await self.codecs.read_batched( [ - (chunk_coords, chunk_selection, out_selection, out) + ( + self.store_path + / self.metadata.chunk_key_encoding.encode_chunk_key(chunk_coords), + self.metadata.get_chunk_spec(chunk_coords), + chunk_selection, + out_selection, + ) for chunk_coords, chunk_selection, out_selection in indexer ], - self._read_chunk, - self.runtime_configuration.concurrency, + out, + self.runtime_configuration, ) if out.shape: @@ -225,37 +230,6 @@ async def getitem(self, selection: Selection): async def _save_metadata(self) -> None: await (self.store_path / ZARR_JSON).set(self.metadata.to_bytes()) - async def _read_chunk( - self, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - out: 
np.ndarray, - ): - chunk_spec = self.metadata.get_chunk_spec(chunk_coords) - chunk_key_encoding = self.metadata.chunk_key_encoding - chunk_key = chunk_key_encoding.encode_chunk_key(chunk_coords) - store_path = self.store_path / chunk_key - - if self.codecs.supports_partial_decode: - chunk_array = await self.codecs.decode_partial( - store_path, chunk_selection, chunk_spec, self.runtime_configuration - ) - if chunk_array is not None: - out[out_selection] = chunk_array - else: - out[out_selection] = self.metadata.fill_value - else: - chunk_bytes = await store_path.get() - if chunk_bytes is not None: - chunk_array = await self.codecs.decode( - chunk_bytes, chunk_spec, self.runtime_configuration - ) - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = self.metadata.fill_value - async def setitem(self, selection: Selection, value: np.ndarray) -> None: assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) chunk_shape = self.metadata.chunk_grid.chunk_shape @@ -279,97 +253,25 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: value = value.astype(self.metadata.dtype, order="A") # merging with existing data and encoding chunks - await concurrent_map( + await self.codecs.write_batched( [ ( - value, - chunk_shape, - chunk_coords, + self.store_path + / self.metadata.chunk_key_encoding.encode_chunk_key(chunk_coords), + self.metadata.get_chunk_spec(chunk_coords), chunk_selection, out_selection, ) for chunk_coords, chunk_selection, out_selection in indexer ], - self._write_chunk, - self.runtime_configuration.concurrency, + value, + self.runtime_configuration, ) - async def _write_chunk( - self, - value: np.ndarray, - chunk_shape: ChunkCoords, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - ): - chunk_spec = self.metadata.get_chunk_spec(chunk_coords) - chunk_key_encoding = self.metadata.chunk_key_encoding - chunk_key = 
chunk_key_encoding.encode_chunk_key(chunk_coords) - store_path = self.store_path / chunk_key - - if is_total_slice(chunk_selection, chunk_shape): - # write entire chunks - if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - ) - chunk_array.fill(value) - else: - chunk_array = value[out_selection] - await self._write_chunk_to_store(store_path, chunk_array, chunk_spec) - - elif self.codecs.supports_partial_encode: - # print("encode_partial", chunk_coords, chunk_selection, repr(self)) - await self.codecs.encode_partial( - store_path, - value[out_selection], - chunk_selection, - chunk_spec, - self.runtime_configuration, - ) - else: - # writing partial chunks - # read chunk first - chunk_bytes = await store_path.get() - - # merge new value - if chunk_bytes is None: - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - ) - chunk_array.fill(self.metadata.fill_value) - else: - chunk_array = ( - await self.codecs.decode(chunk_bytes, chunk_spec, self.runtime_configuration) - ).copy() # make a writable copy - chunk_array[chunk_selection] = value[out_selection] - - await self._write_chunk_to_store(store_path, chunk_array, chunk_spec) - - async def _write_chunk_to_store( - self, store_path: StorePath, chunk_array: np.ndarray, chunk_spec: ArraySpec - ): - if np.all(chunk_array == self.metadata.fill_value): - # chunks that only contain fill_value will be removed - await store_path.delete() - else: - chunk_bytes = await self.codecs.encode( - chunk_array, chunk_spec, self.runtime_configuration - ) - if chunk_bytes is None: - await store_path.delete() - else: - await store_path.set(chunk_bytes) - - async def resize(self, new_shape: ChunkCoords) -> AsyncArray: - if len(new_shape) != len(self.metadata.shape): - raise ValueError( - "The new shape must have the same number of dimensions " - + f"(={len(self.metadata.shape)})." 
- ) - + async def resize( + self, new_shape: ChunkCoords, delete_outside_chunks: bool = True + ) -> AsyncArray: + assert len(new_shape) == len(self.metadata.shape) new_metadata = replace(self.metadata, shape=new_shape) # Remove all chunks outside of the new shape @@ -379,17 +281,19 @@ async def resize(self, new_shape: ChunkCoords) -> AsyncArray: old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape)) new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape)) - async def _delete_key(key: str) -> None: - await (self.store_path / key).delete() + if delete_outside_chunks: - await concurrent_map( - [ - (chunk_key_encoding.encode_chunk_key(chunk_coords),) - for chunk_coords in old_chunk_coords.difference(new_chunk_coords) - ], - _delete_key, - self.runtime_configuration.concurrency, - ) + async def _delete_key(key: str) -> None: + await (self.store_path / key).delete() + + await concurrent_map( + [ + (chunk_key_encoding.encode_chunk_key(chunk_coords),) + for chunk_coords in old_chunk_coords.difference(new_chunk_coords) + ], + _delete_key, + self.runtime_configuration.concurrency, + ) # Write new metadata await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes()) diff --git a/src/zarr/codecs/batched_pipeline.py b/src/zarr/codecs/batched_pipeline.py new file mode 100644 index 0000000000..27974d0647 --- /dev/null +++ b/src/zarr/codecs/batched_pipeline.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable, TypeVar +import numpy as np +from dataclasses import dataclass + +from zarr.abc.codec import ( + Codec, + ArrayArrayCodec, + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, + BytesBytesCodec, +) +from zarr.codecs.pipeline import CodecPipeline +from zarr.common import concurrent_map +from zarr.indexing import is_total_slice + +if TYPE_CHECKING: + from typing import List, Optional, Tuple + from zarr.store import StorePath + from zarr.metadata import 
RuntimeConfiguration + from zarr.common import ArraySpec, BytesLike, SliceSelection + +T = TypeVar("T") +U = TypeVar("U") + + +def unzip2(iterable: Iterable[Tuple[T, U]]) -> Tuple[List[T], List[U]]: + out0: List[T] = [] + out1: List[U] = [] + for item0, item1 in iterable: + out0.append(item0) + out1.append(item1) + return (out0, out1) + + +def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ArraySpec]: + return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + +@dataclass(frozen=True) +class BatchedCodecPipeline(CodecPipeline): + def _codecs_with_resolved_metadata_batched( + self, chunk_specs: Iterable[ArraySpec] + ) -> Tuple[ + List[Tuple[ArrayArrayCodec, List[ArraySpec]]], + Tuple[ArrayBytesCodec, List[ArraySpec]], + List[Tuple[BytesBytesCodec, List[ArraySpec]]], + ]: + aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, List[ArraySpec]]] = [] + for aa_codec in self.array_array_codecs: + aa_codecs_with_spec.append((aa_codec, chunk_specs)) + chunk_specs = [aa_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + ab_codec_with_spec = (self.array_bytes_codec, chunk_specs) + chunk_specs = [ + self.array_bytes_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs + ] + + bb_codecs_with_spec: List[Tuple[BytesBytesCodec, List[ArraySpec]]] = [] + for bb_codec in self.bytes_bytes_codecs: + bb_codecs_with_spec.append((bb_codec, chunk_specs)) + chunk_specs = [bb_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) + + async def read_batched( + self, + batch_info: Iterable[Tuple[StorePath, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + if self.supports_partial_decode: + chunk_array_batch = await self.decode_partial_batched( + [ + (store_path, chunk_selection, chunk_spec) + for store_path, chunk_spec, chunk_selection, _ in batch_info + ], + 
runtime_configuration, + ) + for chunk_array, (_, chunk_spec, _, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = chunk_spec.fill_value + else: + chunk_bytes_batch = await concurrent_map( + [(store_path,) for store_path, _, _, _ in batch_info], + lambda store_path: store_path.get(), + runtime_configuration.concurrency, + ) + chunk_array_batch = await self.decode_batched( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) + ], + runtime_configuration, + ) + for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = chunk_spec.fill_value + + async def decode_batched( + self, + chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + chunk_bytes_batch, chunk_specs = unzip2(chunk_bytes_and_specs) + + ( + aa_codecs_with_spec, + ab_codec_with_spec, + bb_codecs_with_spec, + ) = self._codecs_with_resolved_metadata_batched(chunk_specs) + + for bb_codec, chunk_spec_batch in bb_codecs_with_spec[::-1]: + chunk_bytes_batch = await bb_codec.decode_batch( + zip(chunk_bytes_batch, chunk_spec_batch), runtime_configuration + ) + + ab_codec, chunk_spec_batch = ab_codec_with_spec + chunk_array_batch = await ab_codec.decode_batch( + zip(chunk_bytes_batch, chunk_spec_batch), runtime_configuration + ) + + for aa_codec, chunk_spec_batch in aa_codecs_with_spec[::-1]: + chunk_array_batch = await aa_codec.decode_batch( + zip(chunk_array_batch, chunk_spec_batch), runtime_configuration + ) + + return chunk_array_batch + + async def decode_partial_batched( + self, + batch_info: Iterable[Tuple[StorePath, SliceSelection, ArraySpec]], + runtime_configuration: 
RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + assert self.supports_partial_decode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) + return await self.array_bytes_codec.decode_partial_batched( + batch_info, runtime_configuration + ) + + async def encode_batched( + self, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[BytesLike]]: + chunk_array_batch, chunk_specs = unzip2(chunk_arrays_and_specs) + + for aa_codec in self.array_array_codecs: + chunk_array_batch = await aa_codec.encode_batch( + zip(chunk_array_batch, chunk_specs), runtime_configuration + ) + chunk_specs = resolve_batched(aa_codec, chunk_specs) + + chunk_bytes_batch = await self.array_bytes_codec.encode_batch( + zip(chunk_array_batch, chunk_specs), runtime_configuration + ) + chunk_specs = resolve_batched(self.array_bytes_codec, chunk_specs) + + for bb_codec in self.bytes_bytes_codecs: + chunk_bytes_batch = await bb_codec.encode_batch( + zip(chunk_bytes_batch, chunk_specs), runtime_configuration + ) + chunk_specs = resolve_batched(bb_codec, chunk_specs) + + return chunk_bytes_batch + + async def encode_partial_batched( + self, + batch_info: Iterable[Tuple[StorePath, np.ndarray, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> None: + assert self.supports_partial_encode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) + await self.array_bytes_codec.encode_partial_batched(batch_info, runtime_configuration) + + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + for codec in self: + byte_length = codec.compute_encoded_size(byte_length, array_spec) + array_spec = codec.resolve_metadata(array_spec) + return byte_length + + async def write_batched( + self, + batch_info: Iterable[Tuple[StorePath, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + 
runtime_configuration: RuntimeConfiguration, + ) -> None: + if self.supports_partial_encode: + await self.encode_partial_batched( + [ + (store_path, value[out_selection], chunk_selection, chunk_spec) + for store_path, chunk_spec, chunk_selection, out_selection in batch_info + ], + runtime_configuration, + ) + + else: + # Read existing bytes if not total slice + async def _read_key(store_path: Optional[StorePath]) -> Optional[BytesLike]: + if store_path is None: + return None + return await store_path.get() + + chunk_bytes_batch = await concurrent_map( + [ + (None if is_total_slice(chunk_selection, chunk_spec.shape) else store_path,) + for store_path, chunk_spec, chunk_selection, _ in batch_info + ], + _read_key, + runtime_configuration.concurrency, + ) + chunk_array_batch = await self.decode_batched( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) + ], + runtime_configuration, + ) + + def _merge_chunk_array( + existing_chunk_array: Optional[np.ndarray], + new_chunk_array_slice: np.ndarray, + chunk_spec: ArraySpec, + chunk_selection: SliceSelection, + ) -> np.ndarray: + if is_total_slice(chunk_selection, chunk_spec.shape): + return new_chunk_array_slice + if existing_chunk_array is None: + chunk_array = np.empty( + chunk_spec.shape, + dtype=chunk_spec.dtype, + ) + chunk_array.fill(chunk_spec.fill_value) + else: + chunk_array = existing_chunk_array.copy() # make a writable copy + chunk_array[chunk_selection] = new_chunk_array_slice + return chunk_array + + chunk_array_batch = [ + _merge_chunk_array(chunk_array, value[out_selection], chunk_spec, chunk_selection) + for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + chunk_array_batch, batch_info + ) + ] + + chunk_array_batch = [ + None if np.all(chunk_array == chunk_spec.fill_value) else chunk_array + for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) + ] + + chunk_bytes_batch = await self.encode_batched( + [ 
+ (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) + ], + runtime_configuration, + ) + + async def _write_key(store_path: StorePath, chunk_bytes: Optional[BytesLike]) -> None: + if chunk_bytes is None: + await store_path.delete() + else: + await store_path.set(chunk_bytes) + + await concurrent_map( + [ + (store_path, chunk_bytes) + for chunk_bytes, (store_path, _, _, _) in zip(chunk_bytes_batch, batch_info) + ], + _write_key, + runtime_configuration.concurrency, + ) diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 4908ee8057..d4246c847a 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from typing import Iterator, List, Optional, Tuple, Union + from typing_extensions import Self from zarr.store import StorePath from zarr.metadata import ArrayMetadata from zarr.config import RuntimeConfiguration @@ -43,16 +44,16 @@ def from_dict(cls, data: Iterable[Union[JSON, Codec]]) -> CodecPipeline: else: name_parsed, _ = parse_named_configuration(c, require_configuration=False) out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type] - return CodecPipeline.from_list(out) + return cls.from_list(out) def to_dict(self) -> JSON: return [c.to_dict() for c in self] - def evolve(self, array_spec: ArraySpec) -> CodecPipeline: - return CodecPipeline.from_list([c.evolve(array_spec) for c in self]) + def evolve(self, array_spec: ArraySpec) -> Self: + return type(self).from_list([c.evolve(array_spec) for c in self]) @classmethod - def from_list(cls, codecs: List[Codec]) -> CodecPipeline: + def from_list(cls, codecs: List[Codec]) -> Self: from zarr.codecs.sharding import ShardingCodec if not any(isinstance(codec, ArrayBytesCodec) for codec in codecs): @@ -90,7 +91,7 @@ def from_list(cls, codecs: List[Codec]) -> CodecPipeline: + "writes, which may lead to inefficient performance." 
) - return CodecPipeline( + return cls( array_array_codecs=tuple( codec for codec in codecs if isinstance(codec, ArrayArrayCodec) ), @@ -126,7 +127,7 @@ def validate(self, array_metadata: ArrayMetadata) -> None: codec.validate(array_metadata) def _codecs_with_resolved_metadata( - self, array_spec: ArraySpec + self, chunk_spec: ArraySpec ) -> Tuple[ List[Tuple[ArrayArrayCodec, ArraySpec]], Tuple[ArrayBytesCodec, ArraySpec], @@ -134,39 +135,39 @@ def _codecs_with_resolved_metadata( ]: aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, ArraySpec]] = [] for aa_codec in self.array_array_codecs: - aa_codecs_with_spec.append((aa_codec, array_spec)) - array_spec = aa_codec.resolve_metadata(array_spec) + aa_codecs_with_spec.append((aa_codec, chunk_spec)) + chunk_spec = aa_codec.resolve_metadata(chunk_spec) - ab_codec_with_spec = (self.array_bytes_codec, array_spec) - array_spec = self.array_bytes_codec.resolve_metadata(array_spec) + ab_codec_with_spec = (self.array_bytes_codec, chunk_spec) + chunk_spec = self.array_bytes_codec.resolve_metadata(chunk_spec) bb_codecs_with_spec: List[Tuple[BytesBytesCodec, ArraySpec]] = [] for bb_codec in self.bytes_bytes_codecs: - bb_codecs_with_spec.append((bb_codec, array_spec)) - array_spec = bb_codec.resolve_metadata(array_spec) + bb_codecs_with_spec.append((bb_codec, chunk_spec)) + chunk_spec = bb_codec.resolve_metadata(chunk_spec) return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) async def decode( self, chunk_bytes: BytesLike, - array_spec: ArraySpec, + chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: ( aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(array_spec) + ) = self._codecs_with_resolved_metadata(chunk_spec) - for bb_codec, array_spec in bb_codecs_with_spec[::-1]: - chunk_bytes = await bb_codec.decode(chunk_bytes, array_spec, runtime_configuration) + for bb_codec, chunk_spec in bb_codecs_with_spec[::-1]: + chunk_bytes = 
await bb_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - ab_codec, array_spec = ab_codec_with_spec - chunk_array = await ab_codec.decode(chunk_bytes, array_spec, runtime_configuration) + ab_codec, chunk_spec = ab_codec_with_spec + chunk_array = await ab_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - for aa_codec, array_spec in aa_codecs_with_spec[::-1]: - chunk_array = await aa_codec.decode(chunk_array, array_spec, runtime_configuration) + for aa_codec, chunk_spec in aa_codecs_with_spec[::-1]: + chunk_array = await aa_codec.decode(chunk_array, chunk_spec, runtime_configuration) return chunk_array @@ -186,18 +187,18 @@ async def decode_partial( async def encode( self, chunk_array: np.ndarray, - array_spec: ArraySpec, + chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> Optional[BytesLike]: ( aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(array_spec) + ) = self._codecs_with_resolved_metadata(chunk_spec) - for aa_codec, array_spec in aa_codecs_with_spec: + for aa_codec, chunk_spec in aa_codecs_with_spec: chunk_array_maybe = await aa_codec.encode( - chunk_array, array_spec, runtime_configuration + chunk_array, chunk_spec, runtime_configuration ) if chunk_array_maybe is None: return None diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 8eba9a0b5a..5a65be7f62 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from typing import Literal, Union, List, Optional, Tuple - from zarr.codecs.pipeline import CodecPipeline + from zarr.codecs.batched_pipeline import BatchedCodecPipeline from zarr.abc.codec import Codec @@ -116,7 +116,7 @@ class ArrayMetadata(Metadata): chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any - codecs: CodecPipeline + codecs: BatchedCodecPipeline attributes: Dict[str, Any] = field(default_factory=dict) dimension_names: Optional[Tuple[str, ...]] = None zarr_format: 
Literal[3] = field(default=3, init=False) @@ -369,9 +369,14 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data +<<<<<<< HEAD:src/zarr/metadata.py def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: from zarr.codecs.pipeline import CodecPipeline +======= +def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> BatchedCodecPipeline: + from zarr.v3.codecs.batched_pipeline import BatchedCodecPipeline +>>>>>>> 450bcc64 (merge):src/zarr/v3/metadata.py if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") - return CodecPipeline.from_dict(data) + return BatchedCodecPipeline.from_dict(data) From 019ecc8c451f27ca54c021a2a397edf1856d45f9 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 19 Apr 2024 17:27:19 +0200 Subject: [PATCH 02/21] refactors CodecPipelines --- src/zarr/abc/codec.py | 70 +++- src/zarr/array.py | 4 +- src/zarr/codecs/__init__.py | 1 + src/zarr/codecs/pipeline/__init__.py | 5 + .../batched.py} | 156 ++++---- .../codecs/{pipeline.py => pipeline/core.py} | 140 +++---- src/zarr/codecs/pipeline/interleaved.py | 329 +++++++++++++++++ src/zarr/codecs/sharding.py | 343 +++++++++--------- src/zarr/metadata.py | 11 +- src/zarr/v3/sync.py | 131 ------- 10 files changed, 688 insertions(+), 502 deletions(-) create mode 100644 src/zarr/codecs/pipeline/__init__.py rename src/zarr/codecs/{batched_pipeline.py => pipeline/batched.py} (77%) rename src/zarr/codecs/{pipeline.py => pipeline/core.py} (59%) create mode 100644 src/zarr/codecs/pipeline/interleaved.py delete mode 100644 src/zarr/v3/sync.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 366d388495..b221444534 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,13 +1,22 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Awaitable, Callable, Iterable, Optional, Tuple, TypeVar +from typing import ( + TYPE_CHECKING, + Awaitable, + Callable, + 
Iterable, + Optional, + Protocol, + Tuple, + TypeVar, + runtime_checkable, +) import numpy as np from zarr.abc.metadata import Metadata from zarr.common import ArraySpec, concurrent_map -from zarr.store import StorePath if TYPE_CHECKING: @@ -21,11 +30,16 @@ def noop_for_none( +<<<<<<< HEAD:src/zarr/abc/codec.py func: Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[U]], ) -> Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[U]]: +======= + func: Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]], +) -> Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]]: +>>>>>>> 51d3c921 (refactors CodecPipelines):src/zarr/v3/abc/codec.py async def wrap( chunk: Optional[T], chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration - ) -> U: + ) -> Optional[U]: if chunk is None: return None return await func(chunk, chunk_spec, runtime_configuration) @@ -33,6 +47,28 @@ async def wrap( return wrap +@runtime_checkable +class ByteGetter(Protocol): + async def get( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + ... + + +@runtime_checkable +class ByteSetter(Protocol): + async def get( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + ... + + async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: + ... + + async def delete(self) -> None: + ... 
+ + class Codec(Metadata): is_fixed_size: bool @@ -62,9 +98,9 @@ async def decode( async def decode_batch( self, - chunk_arrays_and_specs: Iterable[Tuple[np.ndarray, ArraySpec]], + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[np.ndarray]: + ) -> Iterable[Optional[np.ndarray]]: return await concurrent_map( [ (chunk_array, chunk_spec, runtime_configuration) @@ -110,9 +146,9 @@ async def decode( async def decode_batch( self, - chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[np.ndarray]: + ) -> Iterable[Optional[np.ndarray]]: return await concurrent_map( [ (chunk_bytes, chunk_spec, runtime_configuration) @@ -150,7 +186,7 @@ class ArrayBytesCodecPartialDecodeMixin: @abstractmethod async def decode_partial( self, - store_path: StorePath, + byte_getter: ByteGetter, selection: SliceSelection, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, @@ -159,13 +195,13 @@ async def decode_partial( async def decode_partial_batched( self, - batch_info: Iterable[Tuple[StorePath, SliceSelection, ArraySpec]], + batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[Optional[np.ndarray]]: return await concurrent_map( [ - (store_path, selection, chunk_spec, runtime_configuration) - for store_path, selection, chunk_spec in batch_info + (byte_getter, selection, chunk_spec, runtime_configuration) + for byte_getter, selection, chunk_spec in batch_info ], self.decode_partial, runtime_configuration.concurrency, @@ -176,7 +212,7 @@ class ArrayBytesCodecPartialEncodeMixin: @abstractmethod async def encode_partial( self, - store_path: StorePath, + byte_setter: ByteSetter, chunk_array: np.ndarray, selection: SliceSelection, chunk_spec: ArraySpec, @@ -186,13 +222,13 @@ async 
def encode_partial( async def encode_partial_batched( self, - batch_info: Iterable[Tuple[StorePath, np.ndarray, SliceSelection, ArraySpec]], + batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> None: await concurrent_map( [ - (store_path, chunk_array, selection, chunk_spec, runtime_configuration) - for store_path, chunk_array, selection, chunk_spec in batch_info + (byte_setter, chunk_array, selection, chunk_spec, runtime_configuration) + for byte_setter, chunk_array, selection, chunk_spec in batch_info ], self.encode_partial, runtime_configuration.concurrency, @@ -211,9 +247,9 @@ async def decode( async def decode_batch( self, - chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[BytesLike]: + ) -> Iterable[Optional[BytesLike]]: return await concurrent_map( [ (chunk_bytes, chunk_spec, runtime_configuration) diff --git a/src/zarr/array.py b/src/zarr/array.py index ea50e3713f..b57c8c7677 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -207,7 +207,7 @@ async def getitem(self, selection: Selection): ) # reading chunks and decoding them - await self.codecs.read_batched( + await self.codecs.read_batch( [ ( self.store_path @@ -253,7 +253,7 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: value = value.astype(self.metadata.dtype, order="A") # merging with existing data and encoding chunks - await self.codecs.write_batched( + await self.codecs.write_batch( [ ( self.store_path diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 8fa0c9f7b0..3068adde42 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -7,3 +7,4 @@ from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from 
zarr.codecs.zstd import ZstdCodec # noqa: F401 +from zarr.codecs.pipeline import CodecPipeline, BatchedCodecPipeline, InterleavedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/pipeline/__init__.py b/src/zarr/codecs/pipeline/__init__.py new file mode 100644 index 0000000000..4b1e955994 --- /dev/null +++ b/src/zarr/codecs/pipeline/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from zarr.codecs.pipeline.core import CodecPipeline # noqa: F401 +from zarr.codecs.pipeline.batched import BatchedCodecPipeline # noqa: F401 +from zarr.codecs.pipeline.interleaved import InterleavedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/batched_pipeline.py b/src/zarr/codecs/pipeline/batched.py similarity index 77% rename from src/zarr/codecs/batched_pipeline.py rename to src/zarr/codecs/pipeline/batched.py index 27974d0647..5c72785a70 100644 --- a/src/zarr/codecs/batched_pipeline.py +++ b/src/zarr/codecs/pipeline/batched.py @@ -1,10 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterable, TypeVar +from typing import TYPE_CHECKING, TypeVar, Iterable import numpy as np from dataclasses import dataclass from zarr.abc.codec import ( + ByteGetter, + ByteSetter, Codec, ArrayArrayCodec, ArrayBytesCodec, @@ -12,13 +14,12 @@ ArrayBytesCodecPartialEncodeMixin, BytesBytesCodec, ) -from zarr.codecs.pipeline import CodecPipeline +from zarr.codecs.pipeline.core import CodecPipeline from zarr.common import concurrent_map from zarr.indexing import is_total_slice if TYPE_CHECKING: from typing import List, Optional, Tuple - from zarr.store import StorePath from zarr.metadata import RuntimeConfiguration from zarr.common import ArraySpec, BytesLike, SliceSelection @@ -26,7 +27,7 @@ U = TypeVar("U") -def unzip2(iterable: Iterable[Tuple[T, U]]) -> Tuple[List[T], List[U]]: +def _unzip2(iterable: Iterable[Tuple[T, U]]) -> Tuple[List[T], List[U]]: out0: List[T] = [] out1: List[U] = [] for item0, item1 in iterable: @@ -49,6 +50,7 @@ def 
_codecs_with_resolved_metadata_batched( List[Tuple[BytesBytesCodec, List[ArraySpec]]], ]: aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, List[ArraySpec]]] = [] + chunk_specs = list(chunk_specs) for aa_codec in self.array_array_codecs: aa_codecs_with_spec.append((aa_codec, chunk_specs)) chunk_specs = [aa_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] @@ -65,55 +67,13 @@ def _codecs_with_resolved_metadata_batched( return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) - async def read_batched( + async def decode( self, - batch_info: Iterable[Tuple[StorePath, ArraySpec, SliceSelection, SliceSelection]], - out: np.ndarray, - runtime_configuration: RuntimeConfiguration, - ) -> None: - if self.supports_partial_decode: - chunk_array_batch = await self.decode_partial_batched( - [ - (store_path, chunk_selection, chunk_spec) - for store_path, chunk_spec, chunk_selection, _ in batch_info - ], - runtime_configuration, - ) - for chunk_array, (_, chunk_spec, _, out_selection) in zip( - chunk_array_batch, batch_info - ): - if chunk_array is not None: - out[out_selection] = chunk_array - else: - out[out_selection] = chunk_spec.fill_value - else: - chunk_bytes_batch = await concurrent_map( - [(store_path,) for store_path, _, _, _ in batch_info], - lambda store_path: store_path.get(), - runtime_configuration.concurrency, - ) - chunk_array_batch = await self.decode_batched( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) - ], - runtime_configuration, - ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info - ): - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = chunk_spec.fill_value - - async def decode_batched( - self, - chunk_bytes_and_specs: Iterable[Tuple[BytesLike, ArraySpec]], + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], 
runtime_configuration: RuntimeConfiguration, ) -> Iterable[Optional[np.ndarray]]: - chunk_bytes_batch, chunk_specs = unzip2(chunk_bytes_and_specs) + chunk_bytes_batch: Iterable[Optional[BytesLike]] + chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) ( aa_codecs_with_spec, @@ -138,9 +98,9 @@ async def decode_batched( return chunk_array_batch - async def decode_partial_batched( + async def decode_partial( self, - batch_info: Iterable[Tuple[StorePath, SliceSelection, ArraySpec]], + batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[Optional[np.ndarray]]: assert self.supports_partial_decode @@ -149,12 +109,14 @@ async def decode_partial_batched( batch_info, runtime_configuration ) - async def encode_batched( + async def encode( self, chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[Optional[BytesLike]]: - chunk_array_batch, chunk_specs = unzip2(chunk_arrays_and_specs) + chunk_array_batch: Iterable[Optional[np.ndarray]] + chunk_specs: Iterable[ArraySpec] + chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) for aa_codec in self.array_array_codecs: chunk_array_batch = await aa_codec.encode_batch( @@ -175,52 +137,90 @@ async def encode_batched( return chunk_bytes_batch - async def encode_partial_batched( + async def encode_partial( self, - batch_info: Iterable[Tuple[StorePath, np.ndarray, SliceSelection, ArraySpec]], + batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> None: assert self.supports_partial_encode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) await self.array_bytes_codec.encode_partial_batched(batch_info, runtime_configuration) - def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: - for codec in self: - byte_length = 
codec.compute_encoded_size(byte_length, array_spec) - array_spec = codec.resolve_metadata(array_spec) - return byte_length + async def read_batch( + self, + batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + if self.supports_partial_decode: + chunk_array_batch = await self.decode_partial( + [ + (byte_getter, chunk_selection, chunk_spec) + for byte_getter, chunk_spec, chunk_selection, _ in batch_info + ], + runtime_configuration, + ) + for chunk_array, (_, chunk_spec, _, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = chunk_spec.fill_value + else: + chunk_bytes_batch = await concurrent_map( + [(byte_getter,) for byte_getter, _, _, _ in batch_info], + lambda byte_getter: byte_getter.get(), + runtime_configuration.concurrency, + ) + chunk_array_batch = await self.decode( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) + ], + runtime_configuration, + ) + for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = chunk_spec.fill_value - async def write_batched( + async def write_batch( self, - batch_info: Iterable[Tuple[StorePath, ArraySpec, SliceSelection, SliceSelection]], + batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], value: np.ndarray, runtime_configuration: RuntimeConfiguration, ) -> None: if self.supports_partial_encode: - await self.encode_partial_batched( + await self.encode_partial( [ - (store_path, value[out_selection], chunk_selection, chunk_spec) - for store_path, chunk_spec, chunk_selection, out_selection in batch_info + (byte_setter, value[out_selection], 
chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info ], runtime_configuration, ) else: # Read existing bytes if not total slice - async def _read_key(store_path: Optional[StorePath]) -> Optional[BytesLike]: - if store_path is None: + async def _read_key(byte_setter: Optional[ByteSetter]) -> Optional[BytesLike]: + if byte_setter is None: return None - return await store_path.get() + return await byte_setter.get() + chunk_bytes_batch: Iterable[Optional[BytesLike]] chunk_bytes_batch = await concurrent_map( [ - (None if is_total_slice(chunk_selection, chunk_spec.shape) else store_path,) - for store_path, chunk_spec, chunk_selection, _ in batch_info + (None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,) + for byte_setter, chunk_spec, chunk_selection, _ in batch_info ], _read_key, runtime_configuration.concurrency, ) - chunk_array_batch = await self.decode_batched( + chunk_array_batch = await self.decode( [ (chunk_bytes, chunk_spec) for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) @@ -259,7 +259,7 @@ def _merge_chunk_array( for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) ] - chunk_bytes_batch = await self.encode_batched( + chunk_bytes_batch = await self.encode( [ (chunk_array, chunk_spec) for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) @@ -267,16 +267,16 @@ def _merge_chunk_array( runtime_configuration, ) - async def _write_key(store_path: StorePath, chunk_bytes: Optional[BytesLike]) -> None: + async def _write_key(byte_setter: ByteSetter, chunk_bytes: Optional[BytesLike]) -> None: if chunk_bytes is None: - await store_path.delete() + await byte_setter.delete() else: - await store_path.set(chunk_bytes) + await byte_setter.set(chunk_bytes) await concurrent_map( [ - (store_path, chunk_bytes) - for chunk_bytes, (store_path, _, _, _) in zip(chunk_bytes_batch, batch_info) + (byte_setter, chunk_bytes) + for 
chunk_bytes, (byte_setter, _, _, _) in zip(chunk_bytes_batch, batch_info) ], _write_key, runtime_configuration.concurrency, diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline/core.py similarity index 59% rename from src/zarr/codecs/pipeline.py rename to src/zarr/codecs/pipeline/core.py index d4246c847a..564cf3769f 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline/core.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Iterable import numpy as np from dataclasses import dataclass @@ -10,6 +11,8 @@ ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, + ByteGetter, + ByteSetter, BytesBytesCodec, Codec, ) @@ -20,20 +23,19 @@ if TYPE_CHECKING: from typing import Iterator, List, Optional, Tuple, Union from typing_extensions import Self - from zarr.store import StorePath from zarr.metadata import ArrayMetadata from zarr.config import RuntimeConfiguration from zarr.common import JSON, ArraySpec, BytesLike, SliceSelection @dataclass(frozen=True) -class CodecPipeline(Metadata): +class CodecPipeline(Metadata, ABC): array_array_codecs: Tuple[ArrayArrayCodec, ...] array_bytes_codec: ArrayBytesCodec bytes_bytes_codecs: Tuple[BytesBytesCodec, ...] 
@classmethod - def from_dict(cls, data: Iterable[Union[JSON, Codec]]) -> CodecPipeline: + def from_dict(cls, data: Iterable[Union[JSON, Codec]]) -> Self: out: List[Codec] = [] if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") @@ -126,116 +128,58 @@ def validate(self, array_metadata: ArrayMetadata) -> None: for codec in self: codec.validate(array_metadata) - def _codecs_with_resolved_metadata( - self, chunk_spec: ArraySpec - ) -> Tuple[ - List[Tuple[ArrayArrayCodec, ArraySpec]], - Tuple[ArrayBytesCodec, ArraySpec], - List[Tuple[BytesBytesCodec, ArraySpec]], - ]: - aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, ArraySpec]] = [] - for aa_codec in self.array_array_codecs: - aa_codecs_with_spec.append((aa_codec, chunk_spec)) - chunk_spec = aa_codec.resolve_metadata(chunk_spec) - - ab_codec_with_spec = (self.array_bytes_codec, chunk_spec) - chunk_spec = self.array_bytes_codec.resolve_metadata(chunk_spec) - - bb_codecs_with_spec: List[Tuple[BytesBytesCodec, ArraySpec]] = [] - for bb_codec in self.bytes_bytes_codecs: - bb_codecs_with_spec.append((bb_codec, chunk_spec)) - chunk_spec = bb_codec.resolve_metadata(chunk_spec) - - return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + for codec in self: + byte_length = codec.compute_encoded_size(byte_length, array_spec) + array_spec = codec.resolve_metadata(array_spec) + return byte_length + @abstractmethod async def decode( self, - chunk_bytes: BytesLike, - chunk_spec: ArraySpec, + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: - ( - aa_codecs_with_spec, - ab_codec_with_spec, - bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(chunk_spec) - - for bb_codec, chunk_spec in bb_codecs_with_spec[::-1]: - chunk_bytes = await bb_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - - 
ab_codec, chunk_spec = ab_codec_with_spec - chunk_array = await ab_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - - for aa_codec, chunk_spec in aa_codecs_with_spec[::-1]: - chunk_array = await aa_codec.decode(chunk_array, chunk_spec, runtime_configuration) - - return chunk_array + ) -> Iterable[Optional[np.ndarray]]: + pass + @abstractmethod async def decode_partial( self, - store_path: StorePath, - selection: SliceSelection, - chunk_spec: ArraySpec, + batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: - assert self.supports_partial_decode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) - return await self.array_bytes_codec.decode_partial( - store_path, selection, chunk_spec, runtime_configuration - ) + ) -> Iterable[Optional[np.ndarray]]: + pass + @abstractmethod async def encode( self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - ( - aa_codecs_with_spec, - ab_codec_with_spec, - bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(chunk_spec) - - for aa_codec, chunk_spec in aa_codecs_with_spec: - chunk_array_maybe = await aa_codec.encode( - chunk_array, chunk_spec, runtime_configuration - ) - if chunk_array_maybe is None: - return None - chunk_array = chunk_array_maybe - - ab_codec, array_spec = ab_codec_with_spec - chunk_bytes_maybe = await ab_codec.encode(chunk_array, array_spec, runtime_configuration) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - for bb_codec, array_spec in bb_codecs_with_spec: - chunk_bytes_maybe = await bb_codec.encode( - chunk_bytes, array_spec, runtime_configuration - ) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - return chunk_bytes + ) -> 
Iterable[Optional[BytesLike]]: + pass + @abstractmethod async def encode_partial( self, - store_path: StorePath, - chunk_array: np.ndarray, - selection: SliceSelection, - chunk_spec: ArraySpec, + batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> None: - assert self.supports_partial_encode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) - await self.array_bytes_codec.encode_partial( - store_path, chunk_array, selection, chunk_spec, runtime_configuration - ) + pass - def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: - for codec in self: - byte_length = codec.compute_encoded_size(byte_length, array_spec) - array_spec = codec.resolve_metadata(array_spec) - return byte_length + @abstractmethod + async def read_batch( + self, + batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + pass + + @abstractmethod + async def write_batch( + self, + batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + pass diff --git a/src/zarr/codecs/pipeline/interleaved.py b/src/zarr/codecs/pipeline/interleaved.py new file mode 100644 index 0000000000..10e1de9e92 --- /dev/null +++ b/src/zarr/codecs/pipeline/interleaved.py @@ -0,0 +1,329 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +import numpy as np +from dataclasses import dataclass + +from zarr.abc.codec import ( + ByteGetter, + ByteSetter, + ArrayArrayCodec, + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, + BytesBytesCodec, +) +from zarr.codecs.pipeline.core import CodecPipeline +from zarr.common import concurrent_map +from zarr.indexing import is_total_slice + +if TYPE_CHECKING: + from typing import List, 
Optional, Tuple, Iterable + from zarr.config import RuntimeConfiguration + from zarr.common import ArraySpec, BytesLike, SliceSelection + + +@dataclass(frozen=True) +class InterleavedCodecPipeline(CodecPipeline): + def _codecs_with_resolved_metadata( + self, chunk_spec: ArraySpec + ) -> Tuple[ + List[Tuple[ArrayArrayCodec, ArraySpec]], + Tuple[ArrayBytesCodec, ArraySpec], + List[Tuple[BytesBytesCodec, ArraySpec]], + ]: + aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, ArraySpec]] = [] + for aa_codec in self.array_array_codecs: + aa_codecs_with_spec.append((aa_codec, chunk_spec)) + chunk_spec = aa_codec.resolve_metadata(chunk_spec) + + ab_codec_with_spec = (self.array_bytes_codec, chunk_spec) + chunk_spec = self.array_bytes_codec.resolve_metadata(chunk_spec) + + bb_codecs_with_spec: List[Tuple[BytesBytesCodec, ArraySpec]] = [] + for bb_codec in self.bytes_bytes_codecs: + bb_codecs_with_spec.append((bb_codec, chunk_spec)) + chunk_spec = bb_codec.resolve_metadata(chunk_spec) + + return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) + + async def decode( + self, + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + return await concurrent_map( + [ + (chunk_bytes, chunk_spec, runtime_configuration) + for chunk_bytes, chunk_spec in chunk_bytes_and_specs + ], + self.decode_single, + runtime_configuration.concurrency, + ) + + async def decode_single( + self, + chunk_bytes: Optional[BytesLike], + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> Optional[np.ndarray]: + if chunk_bytes is None: + return None + + ( + aa_codecs_with_spec, + ab_codec_with_spec, + bb_codecs_with_spec, + ) = self._codecs_with_resolved_metadata(chunk_spec) + + for bb_codec, chunk_spec in bb_codecs_with_spec[::-1]: + chunk_bytes = await bb_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) + + ab_codec, chunk_spec = ab_codec_with_spec + 
chunk_array = await ab_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) + + for aa_codec, chunk_spec in aa_codecs_with_spec[::-1]: + chunk_array = await aa_codec.decode(chunk_array, chunk_spec, runtime_configuration) + + return chunk_array + + async def decode_partial( + self, + batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + return await concurrent_map( + [ + (byte_getter, selection, chunk_spec, runtime_configuration) + for byte_getter, selection, chunk_spec in batch_info + ], + self.decode_partial_single, + runtime_configuration.concurrency, + ) + + async def decode_partial_single( + self, + byte_getter: ByteGetter, + selection: SliceSelection, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> Optional[np.ndarray]: + assert self.supports_partial_decode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) + return await self.array_bytes_codec.decode_partial( + byte_getter, selection, chunk_spec, runtime_configuration + ) + + async def encode( + self, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[BytesLike]]: + return await concurrent_map( + [ + (chunk_array, chunk_spec, runtime_configuration) + for chunk_array, chunk_spec in chunk_arrays_and_specs + ], + self.encode_single, + runtime_configuration.concurrency, + ) + + async def encode_single( + self, + chunk_array: Optional[np.ndarray], + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> Optional[BytesLike]: + if chunk_array is None: + return None + + ( + aa_codecs_with_spec, + ab_codec_with_spec, + bb_codecs_with_spec, + ) = self._codecs_with_resolved_metadata(chunk_spec) + + for aa_codec, chunk_spec in aa_codecs_with_spec: + chunk_array_maybe = await aa_codec.encode( + chunk_array, chunk_spec, 
runtime_configuration + ) + if chunk_array_maybe is None: + return None + chunk_array = chunk_array_maybe + + ab_codec, array_spec = ab_codec_with_spec + chunk_bytes_maybe = await ab_codec.encode(chunk_array, array_spec, runtime_configuration) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + for bb_codec, array_spec in bb_codecs_with_spec: + chunk_bytes_maybe = await bb_codec.encode( + chunk_bytes, array_spec, runtime_configuration + ) + if chunk_bytes_maybe is None: + return None + chunk_bytes = chunk_bytes_maybe + + return chunk_bytes + + async def encode_partial( + self, + batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + (byte_setter, chunk_array, selection, chunk_spec, runtime_configuration) + for byte_setter, chunk_array, selection, chunk_spec in batch_info + ], + self.encode_partial_single, + runtime_configuration.concurrency, + ) + + async def encode_partial_single( + self, + byte_setter: ByteSetter, + chunk_array: np.ndarray, + selection: SliceSelection, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> None: + assert self.supports_partial_encode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) + await self.array_bytes_codec.encode_partial( + byte_setter, chunk_array, selection, chunk_spec, runtime_configuration + ) + + async def read_single( + self, + byte_getter: ByteGetter, + chunk_spec: ArraySpec, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + if self.supports_partial_decode: + chunk_array = await self.decode_partial_single( + byte_getter, chunk_selection, chunk_spec, runtime_configuration + ) + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = chunk_spec.fill_value + else: + chunk_bytes = await 
byte_getter.get() + chunk_array = await self.decode_single(chunk_bytes, chunk_spec, runtime_configuration) + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = chunk_spec.fill_value + + async def read_batch( + self, + batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + ( + byte_getter, + chunk_spec, + chunk_selection, + out_selection, + out, + runtime_configuration, + ) + for byte_getter, chunk_spec, chunk_selection, out_selection in batch_info + ], + self.read_single, + runtime_configuration.concurrency, + ) + + async def write_single( + self, + byte_setter: ByteSetter, + chunk_spec: ArraySpec, + chunk_selection: SliceSelection, + out_selection: SliceSelection, + value: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + async def _write_chunk_to_store(chunk_array: np.ndarray): + if np.all(chunk_array == chunk_spec.fill_value): + # chunks that only contain fill_value will be removed + await byte_setter.delete() + else: + chunk_bytes = await self.encode_single( + chunk_array, chunk_spec, runtime_configuration + ) + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + if is_total_slice(chunk_selection, chunk_spec.shape): + # write entire chunks + if np.isscalar(value): + chunk_array = np.empty( + chunk_spec.shape, + dtype=chunk_spec.dtype, + ) + chunk_array.fill(value) + else: + chunk_array = value[out_selection] + await _write_chunk_to_store(chunk_array) + + elif self.supports_partial_encode: + await self.encode_partial_single( + byte_setter, + value[out_selection], + chunk_selection, + chunk_spec, + runtime_configuration, + ) + else: + # writing partial chunks + # read chunk first + chunk_bytes = await byte_setter.get() + + # merge new value + chunk_array_maybe = await 
self.decode_single( + chunk_bytes, chunk_spec, runtime_configuration + ) + if chunk_array_maybe is None: + chunk_array = np.empty( + chunk_spec.shape, + dtype=chunk_spec.dtype, + ) + chunk_array.fill(chunk_spec.fill_value) + else: + chunk_array = chunk_array_maybe.copy() # make a writable copy + chunk_array[chunk_selection] = value[out_selection] + + await _write_chunk_to_store(chunk_array) + + async def write_batch( + self, + batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + ( + byte_setter, + chunk_spec, + chunk_selection, + out_selection, + value, + runtime_configuration, + ) + for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + ], + self.write_single, + runtime_configuration.concurrency, + ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 948e46f132..adbef995f0 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,12 +1,14 @@ from __future__ import annotations from enum import Enum -from typing import TYPE_CHECKING, Iterable, Mapping, NamedTuple, Union -from dataclasses import dataclass, replace +from typing import TYPE_CHECKING, Iterable, Mapping, MutableMapping, NamedTuple, Tuple, Union +from dataclasses import dataclass, field, replace from functools import lru_cache import numpy as np from zarr.abc.codec import ( + ByteGetter, + ByteSetter, Codec, ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, @@ -14,12 +16,11 @@ ) from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.codecs.pipeline import CodecPipeline +from zarr.codecs.pipeline import CodecPipeline, InterleavedCodecPipeline from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, ChunkCoordsLike, - concurrent_map, parse_enum, parse_named_configuration, parse_shapelike, @@ -29,7 +30,6 @@ from zarr.indexing 
import ( BasicIndexer, c_order_iter, - is_total_slice, morton_order_iter, ) from zarr.metadata import ( @@ -39,10 +39,9 @@ ) if TYPE_CHECKING: - from typing import Awaitable, Callable, Dict, Iterator, List, Optional, Set, Tuple + from typing import Awaitable, Callable, Dict, Iterator, Optional, Set from typing_extensions import Self - from zarr.store import StorePath from zarr.common import ( JSON, ChunkCoords, @@ -63,6 +62,30 @@ def parse_index_location(data: JSON) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +@dataclass(frozen=True) +class _ShardingByteGetter(ByteGetter): + shard_dict: Mapping[ChunkCoords, BytesLike] + chunk_coords: ChunkCoords + + async def get( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: + assert byte_range is None, "byte_range is not supported within shards" + return self.shard_dict.get(self.chunk_coords) + + +@dataclass(frozen=True) +class _ShardingByteSetter(_ShardingByteGetter, ByteSetter): + shard_dict: MutableMapping[ChunkCoords, BytesLike] + + async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: + assert byte_range is None, "byte_range is not supported within shards" + self.shard_dict[self.chunk_coords] = value + + async def delete(self) -> None: + del self.shard_dict[self.chunk_coords] + + class _ShardIndex(NamedTuple): # dtype uint64, shape (chunks_per_shard_0, chunks_per_shard_1, ..., 2) offsets_and_lengths: np.ndarray @@ -80,6 +103,9 @@ def _localize_chunk(self, chunk_coords: ChunkCoords) -> ChunkCoords: def is_all_empty(self) -> bool: return bool(np.array_equiv(self.offsets_and_lengths, MAX_UINT_64)) + def get_full_chunk_map(self) -> np.ndarray: + return self.offsets_and_lengths[..., 0] != MAX_UINT_64 + def get_chunk_slice(self, chunk_coords: ChunkCoords) -> Optional[Tuple[int, int]]: localized_chunk = self._localize_chunk(chunk_coords) chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk] @@ 
-126,14 +152,14 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: return cls(offsets_and_lengths) -class _ShardProxy(Mapping): - index: _ShardIndex +class _ShardReader(Mapping): buf: BytesLike + index: _ShardIndex @classmethod async def from_bytes( cls, buf: BytesLike, codec: ShardingCodec, chunks_per_shard: ChunkCoords - ) -> _ShardProxy: + ) -> _ShardReader: shard_index_size = codec._shard_index_size(chunks_per_shard) obj = cls() obj.buf = memoryview(buf) @@ -146,7 +172,7 @@ async def from_bytes( return obj @classmethod - def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardProxy: + def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardReader: index = _ShardIndex.create_empty(chunks_per_shard) obj = cls() obj.buf = memoryview(b"") @@ -165,8 +191,11 @@ def __len__(self) -> int: def __iter__(self) -> Iterator[ChunkCoords]: return c_order_iter(self.index.offsets_and_lengths.shape[:-1]) + def is_empty(self) -> bool: + return self.index.is_all_empty() -class _ShardBuilder(_ShardProxy): + +class _ShardBuilder(_ShardReader, MutableMapping): buf: bytearray index: _ShardIndex @@ -184,7 +213,7 @@ def merge_with_morton_order( for shard_dict in shard_dicts: maybe_value = shard_dict.get(chunk_coords, None) if maybe_value is not None: - obj.append(chunk_coords, maybe_value) + obj[chunk_coords] = maybe_value break return obj @@ -195,12 +224,15 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder: obj.index = _ShardIndex.create_empty(chunks_per_shard) return obj - def append(self, chunk_coords: ChunkCoords, value: BytesLike): + def __setitem__(self, chunk_coords: ChunkCoords, value: BytesLike) -> None: chunk_start = len(self.buf) chunk_length = len(value) self.buf.extend(value) self.index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) + def __delitem__(self, chunk_coords: ChunkCoords) -> None: + raise NotImplementedError + async def finalize( self, index_location: 
ShardingCodecIndexLocation, @@ -218,6 +250,53 @@ async def finalize( return out_buf +@dataclass(frozen=True) +class _MergingShardBuilder(MutableMapping): + old_dict: _ShardReader + new_dict: _ShardBuilder + tombstones: Set[ChunkCoords] = field(default_factory=set) + + def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[BytesLike]: + chunk_bytes_maybe = self.new_dict.get(chunk_coords) + if chunk_bytes_maybe is not None: + return chunk_bytes_maybe + return self.old_dict.get(chunk_coords) + + def __setitem__(self, chunk_coords: ChunkCoords, value: BytesLike) -> None: + self.new_dict[chunk_coords] = value + + def __delitem__(self, chunk_coords: ChunkCoords) -> None: + self.tombstones.add(chunk_coords) + + def __len__(self) -> int: + return self.old_dict.__len__() + + def __iter__(self) -> Iterator[ChunkCoords]: + return self.old_dict.__iter__() + + def is_empty(self) -> bool: + full_chunk_coords_map = self.old_dict.index.get_full_chunk_map() + full_chunk_coords_map = np.logical_or( + full_chunk_coords_map, self.new_dict.index.get_full_chunk_map() + ) + for tombstone in self.tombstones: + full_chunk_coords_map[tombstone] = False + return bool(np.array_equiv(full_chunk_coords_map, False)) + + async def finalize( + self, + index_location: ShardingCodecIndexLocation, + index_encoder: Callable[[_ShardIndex], Awaitable[BytesLike]], + ) -> BytesLike: + shard_builder = _ShardBuilder.merge_with_morton_order( + self.new_dict.index.chunks_per_shard, + self.tombstones, + self.new_dict, + self.old_dict, + ) + return await shard_builder.finalize(index_location, index_encoder) + + @dataclass(frozen=True) class ShardingCodec( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin @@ -237,12 +316,14 @@ def __init__( ) -> None: chunk_shape_parsed = parse_shapelike(chunk_shape) codecs_parsed = ( - parse_codecs(codecs) if codecs is not None else CodecPipeline.from_list([BytesCodec()]) + parse_codecs(codecs) + if codecs is not None + else 
InterleavedCodecPipeline.from_list([BytesCodec()]) ) index_codecs_parsed = ( parse_codecs(index_codecs) if index_codecs is not None - else CodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) + else InterleavedCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) ) index_location_parsed = ( parse_index_location(index_location) @@ -304,10 +385,10 @@ async def decode( shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: - # print("decode") shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) indexer = BasicIndexer( tuple(slice(0, s) for s in shard_shape), @@ -321,35 +402,32 @@ async def decode( dtype=shard_spec.dtype, order=runtime_configuration.order, ) - shard_dict = await _ShardProxy.from_bytes(shard_bytes, self, chunks_per_shard) + shard_dict = await _ShardReader.from_bytes(shard_bytes, self, chunks_per_shard) if shard_dict.index.is_all_empty(): out.fill(shard_spec.fill_value) return out # decoding chunks and writing them into the output buffer - await concurrent_map( + await self.codecs.read_batch( [ ( - shard_dict, - chunk_coords, + _ShardingByteGetter(shard_dict, chunk_coords), + chunk_spec, chunk_selection, out_selection, - shard_spec, - runtime_configuration, - out, ) for chunk_coords, chunk_selection, out_selection in indexer ], - self._read_chunk, - runtime_configuration.concurrency, + out, + runtime_configuration, ) return out async def decode_partial( self, - store_path: StorePath, + byte_getter: ByteGetter, selection: SliceSelection, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, @@ -357,6 +435,7 @@ async def decode_partial( shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) indexer = BasicIndexer( selection, @@ -378,62 +457,40 @@ async def decode_partial( shard_dict: 
Mapping[ChunkCoords, BytesLike] = {} if self._is_total_shard(all_chunk_coords, chunks_per_shard): # read entire shard - shard_dict_maybe = await self._load_full_shard_maybe(store_path, chunks_per_shard) + shard_dict_maybe = await self._load_full_shard_maybe(byte_getter, chunks_per_shard) if shard_dict_maybe is None: return None shard_dict = shard_dict_maybe else: # read some chunks within the shard - shard_index = await self._load_shard_index_maybe(store_path, chunks_per_shard) + shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) if shard_index is None: return None shard_dict = {} for chunk_coords in all_chunk_coords: chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) if chunk_byte_slice: - chunk_bytes = await store_path.get(chunk_byte_slice) + chunk_bytes = await byte_getter.get(chunk_byte_slice) if chunk_bytes: shard_dict[chunk_coords] = chunk_bytes # decoding chunks and writing them into the output buffer - await concurrent_map( + await self.codecs.read_batch( [ ( - shard_dict, - chunk_coords, + _ShardingByteGetter(shard_dict, chunk_coords), + chunk_spec, chunk_selection, out_selection, - shard_spec, - runtime_configuration, - out, ) - for chunk_coords, chunk_selection, out_selection in indexed_chunks + for chunk_coords, chunk_selection, out_selection in indexer ], - self._read_chunk, - runtime_configuration.concurrency, + out, + runtime_configuration, ) return out - async def _read_chunk( - self, - shard_dict: Mapping[ChunkCoords, Optional[BytesLike]], - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - shard_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - out: np.ndarray, - ): - chunk_spec = self._get_chunk_spec(shard_spec) - chunk_bytes = shard_dict.get(chunk_coords, None) - if chunk_bytes is not None: - chunk_array = await self.codecs.decode(chunk_bytes, chunk_spec, runtime_configuration) - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - 
else: - out[out_selection] = chunk_spec.fill_value - async def encode( self, shard_array: np.ndarray, @@ -443,6 +500,7 @@ async def encode( shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) indexer = list( BasicIndexer( @@ -452,68 +510,42 @@ async def encode( ) ) - async def _write_chunk( - shard_array: np.ndarray, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - ) -> Tuple[ChunkCoords, Optional[BytesLike]]: - if is_total_slice(chunk_selection, chunk_shape): - chunk_array = shard_array[out_selection] - else: - # handling writing partial chunks - chunk_array = np.empty( - chunk_shape, - dtype=shard_spec.dtype, - ) - chunk_array.fill(shard_spec.fill_value) - chunk_array[chunk_selection] = shard_array[out_selection] - if not np.array_equiv(chunk_array, shard_spec.fill_value): - chunk_spec = self._get_chunk_spec(shard_spec) - return ( - chunk_coords, - await self.codecs.encode(chunk_array, chunk_spec, runtime_configuration), - ) - return (chunk_coords, None) + shard_builder = _ShardBuilder.create_empty(chunks_per_shard) - # assembling and encoding chunks within the shard - encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( + await self.codecs.write_batch( [ - (shard_array, chunk_coords, chunk_selection, out_selection) + ( + _ShardingByteSetter(shard_builder, chunk_coords), + chunk_spec, + chunk_selection, + out_selection, + ) for chunk_coords, chunk_selection, out_selection in indexer ], - _write_chunk, - runtime_configuration.concurrency, + shard_array, + runtime_configuration, ) - if len(encoded_chunks) == 0: - return None - - shard_builder = _ShardBuilder.create_empty(chunks_per_shard) - for chunk_coords, chunk_bytes in encoded_chunks: - if chunk_bytes is not None: - shard_builder.append(chunk_coords, chunk_bytes) return await 
shard_builder.finalize(self.index_location, self._encode_shard_index) async def encode_partial( self, - store_path: StorePath, + byte_setter: ByteSetter, shard_array: np.ndarray, selection: SliceSelection, shard_spec: ArraySpec, runtime_configuration: RuntimeConfiguration, ) -> None: - # print("encode_partial") shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) chunk_spec = self._get_chunk_spec(shard_spec) - old_shard_dict = ( - await self._load_full_shard_maybe(store_path, chunks_per_shard) - ) or _ShardProxy.create_empty(chunks_per_shard) - new_shard_builder = _ShardBuilder.create_empty(chunks_per_shard) - tombstones: Set[ChunkCoords] = set() + shard_dict = _MergingShardBuilder( + await self._load_full_shard_maybe(byte_setter, chunks_per_shard) + or _ShardReader.create_empty(chunks_per_shard), + _ShardBuilder.create_empty(chunks_per_shard), + ) indexer = list( BasicIndexer( @@ -523,71 +555,25 @@ async def encode_partial( ) ) - async def _write_chunk( - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - ) -> Tuple[ChunkCoords, Optional[BytesLike]]: - chunk_array = None - if is_total_slice(chunk_selection, self.chunk_shape): - chunk_array = shard_array[out_selection] - else: - # handling writing partial chunks - # read chunk first - chunk_bytes = old_shard_dict.get(chunk_coords, None) - - # merge new value - if chunk_bytes is None: - chunk_array = np.empty( - self.chunk_shape, - dtype=shard_spec.dtype, - ) - chunk_array.fill(shard_spec.fill_value) - else: - chunk_array = ( - await self.codecs.decode(chunk_bytes, chunk_spec, runtime_configuration) - ).copy() # make a writable copy - chunk_array[chunk_selection] = shard_array[out_selection] - - if not np.array_equiv(chunk_array, shard_spec.fill_value): - return ( - chunk_coords, - await self.codecs.encode(chunk_array, chunk_spec, runtime_configuration), - ) - else: - return (chunk_coords, None) - - 
encoded_chunks: List[Tuple[ChunkCoords, Optional[BytesLike]]] = await concurrent_map( + await self.codecs.write_batch( [ ( - chunk_coords, + _ShardingByteSetter(shard_dict, chunk_coords), + chunk_spec, chunk_selection, out_selection, ) for chunk_coords, chunk_selection, out_selection in indexer ], - _write_chunk, - runtime_configuration.concurrency, - ) - - for chunk_coords, chunk_bytes in encoded_chunks: - if chunk_bytes is not None: - new_shard_builder.append(chunk_coords, chunk_bytes) - else: - tombstones.add(chunk_coords) - - shard_builder = _ShardBuilder.merge_with_morton_order( - chunks_per_shard, - tombstones, - new_shard_builder, - old_shard_dict, + shard_array, + runtime_configuration, ) - if shard_builder.index.is_all_empty(): - await store_path.delete() + if shard_dict.is_empty(): + await byte_setter.delete() else: - await store_path.set( - await shard_builder.finalize( + await byte_setter.set( + await shard_dict.finalize( self.index_location, self._encode_shard_index, ) @@ -603,19 +589,30 @@ def _is_total_shard( async def _decode_shard_index( self, index_bytes: BytesLike, chunks_per_shard: ChunkCoords ) -> _ShardIndex: - return _ShardIndex( - await self.index_codecs.decode( - index_bytes, - self._get_index_chunk_spec(chunks_per_shard), - make_runtime_configuration("C"), + index_array = next( + iter( + await self.index_codecs.decode( + [(index_bytes, self._get_index_chunk_spec(chunks_per_shard))], + make_runtime_configuration("C"), + ) ) ) + assert index_array is not None + return _ShardIndex(index_array) async def _encode_shard_index(self, index: _ShardIndex) -> BytesLike: - index_bytes = await self.index_codecs.encode( - index.offsets_and_lengths, - self._get_index_chunk_spec(index.chunks_per_shard), - make_runtime_configuration("C"), + index_bytes = next( + iter( + await self.index_codecs.encode( + [ + ( + index.offsets_and_lengths, + self._get_index_chunk_spec(index.chunks_per_shard), + ) + ], + make_runtime_configuration("C"), + ) + ) ) assert 
index_bytes is not None return index_bytes @@ -652,31 +649,31 @@ def _get_chunks_per_shard(self, shard_spec: ArraySpec) -> ChunkCoords: ) async def _load_shard_index_maybe( - self, store_path: StorePath, chunks_per_shard: ChunkCoords + self, byte_getter: ByteGetter, chunks_per_shard: ChunkCoords ) -> Optional[_ShardIndex]: shard_index_size = self._shard_index_size(chunks_per_shard) if self.index_location == ShardingCodecIndexLocation.start: - index_bytes = await store_path.get((0, shard_index_size)) + index_bytes = await byte_getter.get((0, shard_index_size)) else: - index_bytes = await store_path.get((-shard_index_size, None)) + index_bytes = await byte_getter.get((-shard_index_size, None)) if index_bytes is not None: return await self._decode_shard_index(index_bytes, chunks_per_shard) return None async def _load_shard_index( - self, store_path: StorePath, chunks_per_shard: ChunkCoords + self, byte_getter: ByteGetter, chunks_per_shard: ChunkCoords ) -> _ShardIndex: return ( - await self._load_shard_index_maybe(store_path, chunks_per_shard) + await self._load_shard_index_maybe(byte_getter, chunks_per_shard) ) or _ShardIndex.create_empty(chunks_per_shard) async def _load_full_shard_maybe( - self, store_path: StorePath, chunks_per_shard: ChunkCoords - ) -> Optional[_ShardProxy]: - shard_bytes = await store_path.get() + self, byte_getter: ByteGetter, chunks_per_shard: ChunkCoords + ) -> Optional[_ShardReader]: + shard_bytes = await byte_getter.get() return ( - await _ShardProxy.from_bytes(shard_bytes, self, chunks_per_shard) + await _ShardReader.from_bytes(shard_bytes, self, chunks_per_shard) if shard_bytes else None ) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 5a65be7f62..9dc49c0301 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -11,8 +11,8 @@ if TYPE_CHECKING: - from typing import Literal, Union, List, Optional, Tuple - from zarr.codecs.batched_pipeline import BatchedCodecPipeline + from typing import Any, Literal, Union, List, 
Optional, Tuple + from zarr.codecs import CodecPipeline from zarr.abc.codec import Codec @@ -116,7 +116,7 @@ class ArrayMetadata(Metadata): chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any - codecs: BatchedCodecPipeline + codecs: CodecPipeline attributes: Dict[str, Any] = field(default_factory=dict) dimension_names: Optional[Tuple[str, ...]] = None zarr_format: Literal[3] = field(default=3, init=False) @@ -369,6 +369,7 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data +<<<<<<< HEAD:src/zarr/metadata.py <<<<<<< HEAD:src/zarr/metadata.py def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: from zarr.codecs.pipeline import CodecPipeline @@ -376,6 +377,10 @@ def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> BatchedCodecPipeline: from zarr.v3.codecs.batched_pipeline import BatchedCodecPipeline >>>>>>> 450bcc64 (merge):src/zarr/v3/metadata.py +======= +def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: + from zarr.v3.codecs.pipeline.batched import BatchedCodecPipeline +>>>>>>> 51d3c921 (refactors CodecPipelines):src/zarr/v3/metadata.py if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") diff --git a/src/zarr/v3/sync.py b/src/zarr/v3/sync.py deleted file mode 100644 index 2838f68172..0000000000 --- a/src/zarr/v3/sync.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar - -if TYPE_CHECKING: - from typing import Any, AsyncIterator, Coroutine - -import asyncio -from concurrent.futures import wait -import threading - -from typing_extensions import ParamSpec - -from zarr.v3.config import SyncConfiguration - -P = ParamSpec("P") -T = TypeVar("T") - -# From https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py - -iothread: list[threading.Thread | None] = [None] # dedicated IO thread -loop: 
list[asyncio.AbstractEventLoop | None] = [ - None -] # global event loop for any non-async instance -_lock: threading.Lock | None = None # global lock placeholder -get_running_loop = asyncio.get_running_loop - - -class SyncError(Exception): - pass - - -def _get_lock() -> threading.Lock: - """Allocate or return a threading lock. - - The lock is allocated on first use to allow setting one lock per forked process. - """ - global _lock - if not _lock: - _lock = threading.Lock() - return _lock - - -async def _runner(coro: Coroutine[Any, Any, T]) -> T | BaseException: - """ - Await a coroutine and return the result of running it. If awaiting the coroutine raises an - exception, the exception will be returned. - """ - try: - return await coro - except Exception as ex: - return ex - - -def sync( - coro: Coroutine[Any, Any, T], - loop: asyncio.AbstractEventLoop | None = None, - timeout: float | None = None, -) -> T: - """ - Make loop run coroutine until it returns. Runs in other thread - - Examples - -------- - >>> sync(async_function(), existing_loop) - """ - if loop is None: - # NB: if the loop is not running *yet*, it is OK to submit work - # and we will wait for it - loop = _get_loop() - if not isinstance(loop, asyncio.AbstractEventLoop): - raise TypeError(f"loop cannot be of type {type(loop)}") - if loop.is_closed(): - raise RuntimeError("Loop is not running") - try: - loop0 = asyncio.events.get_running_loop() - if loop0 is loop: - raise SyncError("Calling sync() from within a running loop") - except RuntimeError: - pass - - future = asyncio.run_coroutine_threadsafe(_runner(coro), loop) - - finished, unfinished = wait([future], return_when=asyncio.ALL_COMPLETED, timeout=timeout) - if len(unfinished) > 0: - raise asyncio.TimeoutError(f"Coroutine {coro} failed to finish in within {timeout}s") - assert len(finished) == 1 - return_result = list(finished)[0].result() - - if isinstance(return_result, BaseException): - raise return_result - else: - return return_result - - 
-def _get_loop() -> asyncio.AbstractEventLoop: - """Create or return the default fsspec IO loop - - The loop will be running on a separate thread. - """ - if loop[0] is None: - with _get_lock(): - # repeat the check just in case the loop got filled between the - # previous two calls from another thread - if loop[0] is None: - new_loop = asyncio.new_event_loop() - loop[0] = new_loop - th = threading.Thread(target=new_loop.run_forever, name="zarrIO") - th.daemon = True - th.start() - iothread[0] = th - assert loop[0] is not None - return loop[0] - - -class SyncMixin: - _sync_configuration: SyncConfiguration - - def _sync(self, coroutine: Coroutine[Any, Any, T]) -> T: - # TODO: refactor this to to take *args and **kwargs and pass those to the method - # this should allow us to better type the sync wrapper - return sync( - coroutine, - loop=self._sync_configuration.asyncio_loop, - timeout=self._sync_configuration.timeout, - ) - - def _sync_iter(self, async_iterator: AsyncIterator[T]) -> list[T]: - async def iter_to_list() -> list[T]: - return [item async for item in async_iterator] - - return self._sync(iter_to_list()) From bd2160db4518d6605b8c48da2ad58682bf3e006c Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 30 Apr 2024 17:30:29 +0200 Subject: [PATCH 03/21] fixes --- src/zarr/abc/codec.py | 5 ----- src/zarr/metadata.py | 12 +----------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index b221444534..6fed21ec7a 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -30,13 +30,8 @@ def noop_for_none( -<<<<<<< HEAD:src/zarr/abc/codec.py - func: Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[U]], -) -> Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[U]]: -======= func: Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]], ) -> Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]]: ->>>>>>> 51d3c921 (refactors 
CodecPipelines):src/zarr/v3/abc/codec.py async def wrap( chunk: Optional[T], chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration ) -> Optional[U]: diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 9dc49c0301..0f3d5957e3 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -369,18 +369,8 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -<<<<<<< HEAD:src/zarr/metadata.py -<<<<<<< HEAD:src/zarr/metadata.py def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: - from zarr.codecs.pipeline import CodecPipeline -======= -def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> BatchedCodecPipeline: - from zarr.v3.codecs.batched_pipeline import BatchedCodecPipeline ->>>>>>> 450bcc64 (merge):src/zarr/v3/metadata.py -======= -def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: - from zarr.v3.codecs.pipeline.batched import BatchedCodecPipeline ->>>>>>> 51d3c921 (refactors CodecPipelines):src/zarr/v3/metadata.py + from zarr.codecs.pipeline.batched import BatchedCodecPipeline if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") From 4887c29463830b100b967bfeab58f9e086e363e6 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 30 Apr 2024 18:14:04 +0200 Subject: [PATCH 04/21] adds HybridCodecPipeline --- pyproject.toml | 2 +- src/zarr/codecs/__init__.py | 7 +- src/zarr/codecs/pipeline/__init__.py | 1 + src/zarr/codecs/pipeline/core.py | 26 ++++-- src/zarr/codecs/pipeline/hybrid.py | 135 +++++++++++++++++++++++++++ src/zarr/metadata.py | 4 +- 6 files changed, 162 insertions(+), 13 deletions(-) create mode 100644 src/zarr/codecs/pipeline/hybrid.py diff --git a/pyproject.toml b/pyproject.toml index 3dcda98980..8d7dd41a68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -185,7 +185,7 @@ check_untyped_defs = false module = [ "zarr.abc.codec", "zarr.codecs.bytes", - "zarr.codecs.pipeline", + "zarr.codecs.pipeline.*", 
"zarr.codecs.sharding", "zarr.codecs.transpose", "zarr.array_v2", diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 3068adde42..770fd9fb5a 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -7,4 +7,9 @@ from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 -from zarr.codecs.pipeline import CodecPipeline, BatchedCodecPipeline, InterleavedCodecPipeline # noqa: F401 +from zarr.codecs.pipeline import ( + CodecPipeline, + BatchedCodecPipeline, + InterleavedCodecPipeline, + HybridCodecPipeline, +) # noqa: F401 diff --git a/src/zarr/codecs/pipeline/__init__.py b/src/zarr/codecs/pipeline/__init__.py index 4b1e955994..2a83e36805 100644 --- a/src/zarr/codecs/pipeline/__init__.py +++ b/src/zarr/codecs/pipeline/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations from zarr.codecs.pipeline.core import CodecPipeline # noqa: F401 +from zarr.codecs.pipeline.hybrid import HybridCodecPipeline # noqa: F401 from zarr.codecs.pipeline.batched import BatchedCodecPipeline # noqa: F401 from zarr.codecs.pipeline.interleaved import InterleavedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/pipeline/core.py b/src/zarr/codecs/pipeline/core.py index 564cf3769f..caaebcbf97 100644 --- a/src/zarr/codecs/pipeline/core.py +++ b/src/zarr/codecs/pipeline/core.py @@ -54,8 +54,10 @@ def to_dict(self) -> JSON: def evolve(self, array_spec: ArraySpec) -> Self: return type(self).from_list([c.evolve(array_spec) for c in self]) - @classmethod - def from_list(cls, codecs: List[Codec]) -> Self: + @staticmethod + def codecs_from_list( + codecs: List[Codec], + ) -> Tuple[Tuple[ArrayArrayCodec, ...], ArrayBytesCodec, Tuple[BytesBytesCodec, ...]]: from zarr.codecs.sharding import ShardingCodec if not any(isinstance(codec, ArrayBytesCodec) for codec in codecs): @@ -93,14 +95,20 @@ def from_list(cls, 
codecs: List[Codec]) -> Self: + "writes, which may lead to inefficient performance." ) + return ( + tuple(codec for codec in codecs if isinstance(codec, ArrayArrayCodec)), + [codec for codec in codecs if isinstance(codec, ArrayBytesCodec)][0], + tuple(codec for codec in codecs if isinstance(codec, BytesBytesCodec)), + ) + + @classmethod + def from_list(cls, codecs: List[Codec]) -> Self: + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) + return cls( - array_array_codecs=tuple( - codec for codec in codecs if isinstance(codec, ArrayArrayCodec) - ), - array_bytes_codec=[codec for codec in codecs if isinstance(codec, ArrayBytesCodec)][0], - bytes_bytes_codecs=tuple( - codec for codec in codecs if isinstance(codec, BytesBytesCodec) - ), + array_array_codecs=array_array_codecs, + array_bytes_codec=array_bytes_codec, + bytes_bytes_codecs=bytes_bytes_codecs, ) @property diff --git a/src/zarr/codecs/pipeline/hybrid.py b/src/zarr/codecs/pipeline/hybrid.py new file mode 100644 index 0000000000..5d798469b3 --- /dev/null +++ b/src/zarr/codecs/pipeline/hybrid.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from itertools import islice +from typing import TYPE_CHECKING +import numpy as np +from dataclasses import dataclass + +from zarr.abc.codec import ( + ByteGetter, + ByteSetter, + Codec, +) +from zarr.codecs.pipeline.batched import BatchedCodecPipeline +from zarr.codecs.pipeline.core import CodecPipeline +from zarr.common import concurrent_map + +if TYPE_CHECKING: + from typing import List, Optional, Tuple, Iterable + from typing_extensions import Self + from zarr.config import RuntimeConfiguration + from zarr.common import ArraySpec, BytesLike, SliceSelection + +DEFAULT_BATCH_SIZE = 1000 + + +def batched(iterable, n): + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + + +@dataclass(frozen=True) +class HybridCodecPipeline(CodecPipeline): + 
batch_size: int # TODO: There needs to be a way of specifying this from the user code + batched_codec_pipeline: BatchedCodecPipeline + + @classmethod + def from_list(cls, codecs: List[Codec], *, batch_size: Optional[int] = None) -> Self: + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) + + return cls( + array_array_codecs=array_array_codecs, + array_bytes_codec=array_bytes_codec, + bytes_bytes_codecs=bytes_bytes_codecs, + batch_size=batch_size or DEFAULT_BATCH_SIZE, + batched_codec_pipeline=BatchedCodecPipeline( + array_array_codecs=array_array_codecs, + array_bytes_codec=array_bytes_codec, + bytes_bytes_codecs=bytes_bytes_codecs, + ), + ) + + async def decode( + self, + chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + output: list[Optional[np.ndarray]] = [] + for batch_info in batched(chunk_bytes_and_specs, self.batch_size): + output.extend( + await self.batched_codec_pipeline.decode(batch_info, runtime_configuration) + ) + return output + + async def decode_partial( + self, + batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[np.ndarray]]: + output: list[Optional[np.ndarray]] = [] + for single_batch_info in batched(batch_info, self.batch_size): + output.extend( + await self.batched_codec_pipeline.decode_partial( + single_batch_info, runtime_configuration + ) + ) + return output + + async def encode( + self, + chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[Optional[BytesLike]]: + output: list[Optional[BytesLike]] = [] + for single_batch_info in batched(chunk_arrays_and_specs, self.batch_size): + output.extend( + await self.batched_codec_pipeline.encode(single_batch_info, runtime_configuration) + ) + return output + + async def 
encode_partial( + self, + batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> None: + for single_batch_info in batched(batch_info, self.batch_size): + await self.batched_codec_pipeline.encode_partial( + single_batch_info, runtime_configuration + ) + + async def read_batch( + self, + batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + (single_batch_info, out, runtime_configuration) + for single_batch_info in batched(batch_info, self.batch_size) + ], + self.batched_codec_pipeline.read_batch, + runtime_configuration.concurrency, + ) + + async def write_batch( + self, + batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + ( + single_batch_info, + value, + runtime_configuration, + ) + for single_batch_info in batched(batch_info, self.batch_size) + ], + self.batched_codec_pipeline.write_batch, + runtime_configuration.concurrency, + ) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 0f3d5957e3..300b3637ed 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -370,8 +370,8 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: - from zarr.codecs.pipeline.batched import BatchedCodecPipeline + from zarr.codecs.pipeline.hybrid import HybridCodecPipeline if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") - return BatchedCodecPipeline.from_dict(data) + return HybridCodecPipeline.from_dict(data) From c3e3504ee85fe3482148ede696aeecee19aad398 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 30 Apr 2024 18:16:44 +0200 Subject: [PATCH 05/21] fixes --- src/zarr/abc/codec.py 
| 12 ++++-------- src/zarr/codecs/__init__.py | 10 +++++----- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 6fed21ec7a..8f2ce9bcae 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -46,22 +46,18 @@ async def wrap( class ByteGetter(Protocol): async def get( self, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: - ... + ) -> Optional[BytesLike]: ... @runtime_checkable class ByteSetter(Protocol): async def get( self, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: - ... + ) -> Optional[BytesLike]: ... - async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: - ... + async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: ... - async def delete(self) -> None: - ... + async def delete(self) -> None: ... class Codec(Metadata): diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 770fd9fb5a..60c52d5929 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -8,8 +8,8 @@ from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 from zarr.codecs.pipeline import ( - CodecPipeline, - BatchedCodecPipeline, - InterleavedCodecPipeline, - HybridCodecPipeline, -) # noqa: F401 + CodecPipeline, # noqa: F401 + BatchedCodecPipeline, # noqa: F401 + InterleavedCodecPipeline, # noqa: F401 + HybridCodecPipeline, # noqa: F401 +) From a578d95db12b6769f23e11aafb563e7cdf8ef8fb Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 8 May 2024 20:34:47 +0200 Subject: [PATCH 06/21] typing --- pyproject.toml | 4 ---- src/zarr/codecs/sharding.py | 33 ++++++++++++++++----------------- src/zarr/common.py | 21 ++++++++++++++++++--- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8d7dd41a68..1d6ce8eb2f 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -184,10 +184,6 @@ check_untyped_defs = false [[tool.mypy.overrides]] module = [ "zarr.abc.codec", - "zarr.codecs.bytes", - "zarr.codecs.pipeline.*", - "zarr.codecs.sharding", - "zarr.codecs.transpose", "zarr.array_v2", "zarr.array", "zarr.sync", diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index adbef995f0..8da250b9af 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -20,6 +20,8 @@ from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, + ChunkCoords, + BytesLike, ChunkCoordsLike, parse_enum, parse_named_configuration, @@ -42,15 +44,12 @@ from typing import Awaitable, Callable, Dict, Iterator, Optional, Set from typing_extensions import Self - from zarr.common import ( - JSON, - ChunkCoords, - BytesLike, - SliceSelection, - ) + from zarr.common import JSON, SliceSelection from zarr.config import RuntimeConfiguration MAX_UINT_64 = 2**64 - 1 +ShardMapping = Mapping[ChunkCoords, BytesLike] +ShardMutableMapping = MutableMapping[ChunkCoords, BytesLike] class ShardingCodecIndexLocation(Enum): @@ -64,7 +63,7 @@ def parse_index_location(data: JSON) -> ShardingCodecIndexLocation: @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): - shard_dict: Mapping[ChunkCoords, BytesLike] + shard_dict: ShardMapping chunk_coords: ChunkCoords async def get( @@ -76,7 +75,7 @@ async def get( @dataclass(frozen=True) class _ShardingByteSetter(_ShardingByteGetter, ByteSetter): - shard_dict: MutableMapping[ChunkCoords, BytesLike] + shard_dict: ShardMutableMapping async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: assert byte_range is None, "byte_range is not supported within shards" @@ -152,7 +151,7 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: return cls(offsets_and_lengths) -class _ShardReader(Mapping): +class _ShardReader(ShardMapping): buf: BytesLike index: _ShardIndex @@ -179,11 +178,11 @@ 
def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardReader: obj.index = index return obj - def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[BytesLike]: + def __getitem__(self, chunk_coords: ChunkCoords) -> BytesLike: chunk_byte_slice = self.index.get_chunk_slice(chunk_coords) if chunk_byte_slice: return self.buf[chunk_byte_slice[0] : chunk_byte_slice[1]] - return None + raise KeyError def __len__(self) -> int: return int(self.index.offsets_and_lengths.size / 2) @@ -195,7 +194,7 @@ def is_empty(self) -> bool: return self.index.is_all_empty() -class _ShardBuilder(_ShardReader, MutableMapping): +class _ShardBuilder(_ShardReader, ShardMutableMapping): buf: bytearray index: _ShardIndex @@ -204,7 +203,7 @@ def merge_with_morton_order( cls, chunks_per_shard: ChunkCoords, tombstones: Set[ChunkCoords], - *shard_dicts: Mapping[ChunkCoords, BytesLike], + *shard_dicts: ShardMapping, ) -> _ShardBuilder: obj = cls.create_empty(chunks_per_shard) for chunk_coords in morton_order_iter(chunks_per_shard): @@ -251,16 +250,16 @@ async def finalize( @dataclass(frozen=True) -class _MergingShardBuilder(MutableMapping): +class _MergingShardBuilder(ShardMutableMapping): old_dict: _ShardReader new_dict: _ShardBuilder tombstones: Set[ChunkCoords] = field(default_factory=set) - def __getitem__(self, chunk_coords: ChunkCoords) -> Optional[BytesLike]: + def __getitem__(self, chunk_coords: ChunkCoords) -> BytesLike: chunk_bytes_maybe = self.new_dict.get(chunk_coords) if chunk_bytes_maybe is not None: return chunk_bytes_maybe - return self.old_dict.get(chunk_coords) + return self.old_dict[chunk_coords] def __setitem__(self, chunk_coords: ChunkCoords, value: BytesLike) -> None: self.new_dict[chunk_coords] = value @@ -454,7 +453,7 @@ async def decode_partial( all_chunk_coords = set(chunk_coords for chunk_coords, _, _ in indexed_chunks) # reading bytes of all requested chunks - shard_dict: Mapping[ChunkCoords, BytesLike] = {} + shard_dict: ShardMapping = {} if 
self._is_total_shard(all_chunk_coords, chunks_per_shard): # read entire shard shard_dict_maybe = await self._load_full_shard_maybe(byte_getter, chunks_per_shard) diff --git a/src/zarr/common.py b/src/zarr/common.py index 6940ec3fe3..0dda632f3b 100644 --- a/src/zarr/common.py +++ b/src/zarr/common.py @@ -1,5 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union, Tuple, Iterable, Dict, List, TypeVar, overload, Any +from typing import ( + TYPE_CHECKING, + ParamSpec, + Union, + Tuple, + Iterable, + Dict, + List, + TypeVar, + overload, + Any, +) import asyncio import contextvars from dataclasses import dataclass @@ -7,7 +18,7 @@ import functools if TYPE_CHECKING: - from typing import Any, Awaitable, Callable, Iterator, Optional, Type + from typing import Awaitable, Callable, Iterator, Optional, Type import numpy as np @@ -48,7 +59,11 @@ async def run(item): return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) -async def to_thread(func, /, *args, **kwargs): +P = ParamSpec("P") +U = TypeVar("U") + + +async def to_thread(func: Callable[P, U], /, *args: P.args, **kwargs: P.kwargs) -> U: loop = asyncio.get_running_loop() ctx = contextvars.copy_context() func_call = functools.partial(ctx.run, func, *args, **kwargs) From e3cad7c4c40ab68e6d3f2cebe104ecf916aebd5d Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 8 May 2024 20:43:29 +0200 Subject: [PATCH 07/21] typing --- src/zarr/codecs/pipeline/hybrid.py | 6 ++++-- src/zarr/codecs/pipeline/interleaved.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/zarr/codecs/pipeline/hybrid.py b/src/zarr/codecs/pipeline/hybrid.py index 5d798469b3..0b8bae3a45 100644 --- a/src/zarr/codecs/pipeline/hybrid.py +++ b/src/zarr/codecs/pipeline/hybrid.py @@ -1,7 +1,7 @@ from __future__ import annotations from itertools import islice -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypeVar import numpy as np from dataclasses import 
dataclass @@ -22,8 +22,10 @@ DEFAULT_BATCH_SIZE = 1000 +T = TypeVar("T") -def batched(iterable, n): + +def batched(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]: if n < 1: raise ValueError("n must be at least one") it = iter(iterable) diff --git a/src/zarr/codecs/pipeline/interleaved.py b/src/zarr/codecs/pipeline/interleaved.py index 10e1de9e92..1f82efb44a 100644 --- a/src/zarr/codecs/pipeline/interleaved.py +++ b/src/zarr/codecs/pipeline/interleaved.py @@ -252,7 +252,7 @@ async def write_single( value: np.ndarray, runtime_configuration: RuntimeConfiguration, ) -> None: - async def _write_chunk_to_store(chunk_array: np.ndarray): + async def _write_chunk_to_store(chunk_array: np.ndarray) -> None: if np.all(chunk_array == chunk_spec.fill_value): # chunks that only contain fill_value will be removed await byte_setter.delete() From 027ebb5d1b64e91a8ce6504270687fa967c7e4c7 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 8 May 2024 21:21:03 +0200 Subject: [PATCH 08/21] consistent naming --- src/zarr/abc/codec.py | 4 ++-- src/zarr/codecs/pipeline/batched.py | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 8f2ce9bcae..77d72d55fc 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -184,7 +184,7 @@ async def decode_partial( ) -> Optional[np.ndarray]: pass - async def decode_partial_batched( + async def decode_partial_batch( self, batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, @@ -211,7 +211,7 @@ async def encode_partial( ) -> None: pass - async def encode_partial_batched( + async def encode_partial_batch( self, batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, diff --git a/src/zarr/codecs/pipeline/batched.py b/src/zarr/codecs/pipeline/batched.py index 5c72785a70..bc6f841456 100644 --- a/src/zarr/codecs/pipeline/batched.py +++ 
b/src/zarr/codecs/pipeline/batched.py @@ -105,9 +105,7 @@ async def decode_partial( ) -> Iterable[Optional[np.ndarray]]: assert self.supports_partial_decode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) - return await self.array_bytes_codec.decode_partial_batched( - batch_info, runtime_configuration - ) + return await self.array_bytes_codec.decode_partial_batch(batch_info, runtime_configuration) async def encode( self, @@ -144,7 +142,7 @@ async def encode_partial( ) -> None: assert self.supports_partial_encode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) - await self.array_bytes_codec.encode_partial_batched(batch_info, runtime_configuration) + await self.array_bytes_codec.encode_partial_batch(batch_info, runtime_configuration) async def read_batch( self, From 2bb00ae6f7bf1413c464e159d34c6f6c8cac5438 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 10 May 2024 11:37:18 +0200 Subject: [PATCH 09/21] Apply suggestions from code review Co-authored-by: Davis Bennett --- src/zarr/abc/codec.py | 48 ++++++++++++++--------------- src/zarr/codecs/pipeline/batched.py | 6 ++-- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 77d72d55fc..4dcd9302b5 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -30,11 +30,11 @@ def noop_for_none( - func: Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]], -) -> Callable[[Optional[T], ArraySpec, RuntimeConfiguration], Awaitable[Optional[U]]]: + func: Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[U | None]], +) -> Callable[[T | None, ArraySpec, RuntimeConfiguration], Awaitable[U | None]]: async def wrap( - chunk: Optional[T], chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration - ) -> Optional[U]: + chunk: T | None, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration + ) -> U | None: if chunk is None: return None return await 
func(chunk, chunk_spec, runtime_configuration) @@ -45,17 +45,17 @@ async def wrap( @runtime_checkable class ByteGetter(Protocol): async def get( - self, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: ... + self, byte_range: tuple[int, int | None] | None = None + ) -> BytesLike | None: ... @runtime_checkable class ByteSetter(Protocol): async def get( - self, byte_range: Optional[Tuple[int, Optional[int]]] = None - ) -> Optional[BytesLike]: ... + self, byte_range: tuple[int, int | None] | None = None + ) -> BytesLike | None: ... - async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: ... + async def set(self, value: BytesLike, byte_range: tuple[int, int] | None = None) -> None: ... async def delete(self) -> None: ... @@ -89,9 +89,9 @@ async def decode( async def decode_batch( self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: + ) -> Iterable[np.ndarray | None]: return await concurrent_map( [ (chunk_array, chunk_spec, runtime_configuration) @@ -112,9 +112,9 @@ async def encode( async def encode_batch( self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: + ) -> Iterable[np.ndarray | None]: return await concurrent_map( [ (chunk_array, chunk_spec, runtime_configuration) @@ -137,9 +137,9 @@ async def decode( async def decode_batch( self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: + ) -> Iterable[np.ndarray | None]: return await concurrent_map( [ 
(chunk_bytes, chunk_spec, runtime_configuration) @@ -160,9 +160,9 @@ async def encode( async def encode_batch( self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[BytesLike]]: + ) -> Iterable[BytesLike | None]: return await concurrent_map( [ (chunk_array, chunk_spec, runtime_configuration) @@ -186,9 +186,9 @@ async def decode_partial( async def decode_partial_batch( self, - batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: + ) -> Iterable[np.ndarray | None]: return await concurrent_map( [ (byte_getter, selection, chunk_spec, runtime_configuration) @@ -213,7 +213,7 @@ async def encode_partial( async def encode_partial_batch( self, - batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> None: await concurrent_map( @@ -238,9 +238,9 @@ async def decode( async def decode_batch( self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[BytesLike]]: + ) -> Iterable[BytesLike | None]: return await concurrent_map( [ (chunk_bytes, chunk_spec, runtime_configuration) @@ -261,9 +261,9 @@ async def encode( async def encode_batch( self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[BytesLike]]: + ) -> Iterable[BytesLike | None]: return await concurrent_map( [ 
(chunk_bytes, chunk_spec, runtime_configuration) diff --git a/src/zarr/codecs/pipeline/batched.py b/src/zarr/codecs/pipeline/batched.py index bc6f841456..3284691462 100644 --- a/src/zarr/codecs/pipeline/batched.py +++ b/src/zarr/codecs/pipeline/batched.py @@ -27,9 +27,9 @@ U = TypeVar("U") -def _unzip2(iterable: Iterable[Tuple[T, U]]) -> Tuple[List[T], List[U]]: - out0: List[T] = [] - out1: List[U] = [] +def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: + out0: list[T] = [] + out1: list[U] = [] for item0, item1 in iterable: out0.append(item0) out1.append(item1) From 56877ee725b0641e5148c78932a9272d2c52c678 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 10 May 2024 13:48:00 +0200 Subject: [PATCH 10/21] encode/decode are batched by default --- src/zarr/abc/codec.py | 179 +------------ src/zarr/codecs/__init__.py | 7 +- src/zarr/codecs/blosc.py | 8 +- src/zarr/codecs/bytes.py | 8 +- src/zarr/codecs/crc32c_.py | 8 +- src/zarr/codecs/gzip.py | 8 +- src/zarr/codecs/mixins.py | 217 ++++++++++++++++ src/zarr/codecs/pipeline/__init__.py | 1 - src/zarr/codecs/pipeline/batched.py | 16 +- src/zarr/codecs/pipeline/interleaved.py | 329 ------------------------ src/zarr/codecs/sharding.py | 32 +-- src/zarr/codecs/transpose.py | 11 +- src/zarr/codecs/zstd.py | 8 +- 13 files changed, 279 insertions(+), 553 deletions(-) create mode 100644 src/zarr/codecs/mixins.py delete mode 100644 src/zarr/codecs/pipeline/interleaved.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 4dcd9302b5..132c4d3fb6 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,59 +1,27 @@ from __future__ import annotations from abc import abstractmethod -from typing import ( - TYPE_CHECKING, - Awaitable, - Callable, - Iterable, - Optional, - Protocol, - Tuple, - TypeVar, - runtime_checkable, -) +from typing import TYPE_CHECKING, Iterable, Protocol, runtime_checkable import numpy as np from zarr.abc.metadata import Metadata -from zarr.common import 
ArraySpec, concurrent_map - if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import BytesLike, SliceSelection + from zarr.common import ArraySpec, BytesLike, SliceSelection from zarr.metadata import ArrayMetadata from zarr.config import RuntimeConfiguration -T = TypeVar("T") -U = TypeVar("U") - - -def noop_for_none( - func: Callable[[T, ArraySpec, RuntimeConfiguration], Awaitable[U | None]], -) -> Callable[[T | None, ArraySpec, RuntimeConfiguration], Awaitable[U | None]]: - async def wrap( - chunk: T | None, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration - ) -> U | None: - if chunk is None: - return None - return await func(chunk, chunk_spec, runtime_configuration) - - return wrap - @runtime_checkable class ByteGetter(Protocol): - async def get( - self, byte_range: tuple[int, int | None] | None = None - ) -> BytesLike | None: ... + async def get(self, byte_range: tuple[int, int | None] | None = None) -> BytesLike | None: ... @runtime_checkable class ByteSetter(Protocol): - async def get( - self, byte_range: tuple[int, int | None] | None = None - ) -> BytesLike | None: ... + async def get(self, byte_range: tuple[int, int | None] | None = None) -> BytesLike | None: ... async def set(self, value: BytesLike, byte_range: tuple[int, int] | None = None) -> None: ... 
@@ -80,195 +48,72 @@ def validate(self, array_metadata: ArrayMetadata) -> None: class ArrayArrayCodec(Codec): @abstractmethod async def decode( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: - pass - - async def decode_batch( self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[np.ndarray | None]: - return await concurrent_map( - [ - (chunk_array, chunk_spec, runtime_configuration) - for chunk_array, chunk_spec in chunk_arrays_and_specs - ], - noop_for_none(self.decode), - runtime_configuration.concurrency, - ) + pass @abstractmethod async def encode( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: - pass - - async def encode_batch( self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[np.ndarray | None]: - return await concurrent_map( - [ - (chunk_array, chunk_spec, runtime_configuration) - for chunk_array, chunk_spec in chunk_arrays_and_specs - ], - noop_for_none(self.encode), - runtime_configuration.concurrency, - ) + pass class ArrayBytesCodec(Codec): @abstractmethod async def decode( - self, - chunk_bytes: BytesLike, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> np.ndarray: - pass - - async def decode_batch( self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[np.ndarray | None]: - return await concurrent_map( - [ - (chunk_bytes, chunk_spec, runtime_configuration) - for chunk_bytes, chunk_spec in chunk_bytes_and_specs - ], - noop_for_none(self.decode), - runtime_configuration.concurrency, - ) + pass @abstractmethod async def encode( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, - runtime_configuration: 
RuntimeConfiguration, - ) -> Optional[BytesLike]: - pass - - async def encode_batch( self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[BytesLike | None]: - return await concurrent_map( - [ - (chunk_array, chunk_spec, runtime_configuration) - for chunk_array, chunk_spec in chunk_arrays_and_specs - ], - noop_for_none(self.encode), - runtime_configuration.concurrency, - ) + pass class ArrayBytesCodecPartialDecodeMixin: @abstractmethod async def decode_partial( - self, - byte_getter: ByteGetter, - selection: SliceSelection, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: - pass - - async def decode_partial_batch( self, batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[np.ndarray | None]: - return await concurrent_map( - [ - (byte_getter, selection, chunk_spec, runtime_configuration) - for byte_getter, selection, chunk_spec in batch_info - ], - self.decode_partial, - runtime_configuration.concurrency, - ) + pass class ArrayBytesCodecPartialEncodeMixin: @abstractmethod async def encode_partial( - self, - byte_setter: ByteSetter, - chunk_array: np.ndarray, - selection: SliceSelection, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> None: - pass - - async def encode_partial_batch( self, batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> None: - await concurrent_map( - [ - (byte_setter, chunk_array, selection, chunk_spec, runtime_configuration) - for byte_setter, chunk_array, selection, chunk_spec in batch_info - ], - self.encode_partial, - runtime_configuration.concurrency, - ) + pass class BytesBytesCodec(Codec): @abstractmethod async def decode( - self, - chunk_bytes: BytesLike, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - 
) -> BytesLike: - pass - - async def decode_batch( self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[BytesLike | None]: - return await concurrent_map( - [ - (chunk_bytes, chunk_spec, runtime_configuration) - for chunk_bytes, chunk_spec in chunk_bytes_and_specs - ], - noop_for_none(self.decode), - runtime_configuration.concurrency, - ) + pass @abstractmethod async def encode( - self, - chunk_array: BytesLike, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - pass - - async def encode_batch( self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], runtime_configuration: RuntimeConfiguration, ) -> Iterable[BytesLike | None]: - return await concurrent_map( - [ - (chunk_bytes, chunk_spec, runtime_configuration) - for chunk_bytes, chunk_spec in chunk_bytes_and_specs - ], - noop_for_none(self.encode), - runtime_configuration.concurrency, - ) + pass diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 60c52d5929..f3a3e84bc7 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -7,9 +7,4 @@ from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 -from zarr.codecs.pipeline import ( - CodecPipeline, # noqa: F401 - BatchedCodecPipeline, # noqa: F401 - InterleavedCodecPipeline, # noqa: F401 - HybridCodecPipeline, # noqa: F401 -) +from zarr.codecs.pipeline import CodecPipeline, BatchedCodecPipeline, HybridCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 374375e6c2..e9cbd7a857 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -9,7 +9,7 @@ import numpy as np from numcodecs.blosc import Blosc -from zarr.abc.codec import BytesBytesCodec +from zarr.codecs.mixins import 
BytesBytesCodecBatchMixin from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration, to_thread @@ -75,7 +75,7 @@ def parse_blocksize(data: JSON) -> int: @dataclass(frozen=True) -class BloscCodec(BytesBytesCodec): +class BloscCodec(BytesBytesCodecBatchMixin): is_fixed_size = False typesize: int @@ -159,7 +159,7 @@ def _blosc_codec(self) -> Blosc: } return Blosc.from_config(config_dict) - async def decode( + async def decode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, @@ -167,7 +167,7 @@ async def decode( ) -> BytesLike: return await to_thread(self._blosc_codec.decode, chunk_bytes) - async def encode( + async def encode_single( self, chunk_bytes: bytes, chunk_spec: ArraySpec, diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index aa24c3167e..6b94ae2a3e 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -7,7 +7,7 @@ import numpy as np -from zarr.abc.codec import ArrayBytesCodec +from zarr.codecs.mixins import ArrayBytesCodecBatchMixin from zarr.codecs.registry import register_codec from zarr.common import parse_enum, parse_named_configuration @@ -26,7 +26,7 @@ class Endian(Enum): @dataclass(frozen=True) -class BytesCodec(ArrayBytesCodec): +class BytesCodec(ArrayBytesCodecBatchMixin): is_fixed_size = True endian: Optional[Endian] @@ -68,7 +68,7 @@ def _get_byteorder(self, array: np.ndarray) -> Endian: else: return default_system_endian - async def decode( + async def decode_single( self, chunk_bytes: BytesLike, chunk_spec: ArraySpec, @@ -91,7 +91,7 @@ async def decode( ) return chunk_array - async def encode( + async def encode_single( self, chunk_array: np.ndarray, _chunk_spec: ArraySpec, diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index 04d5b88d70..cb274eddcf 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -7,7 +7,7 @@ from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec +from zarr.codecs.mixins 
import BytesBytesCodecBatchMixin from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration @@ -19,7 +19,7 @@ @dataclass(frozen=True) -class Crc32cCodec(BytesBytesCodec): +class Crc32cCodec(BytesBytesCodecBatchMixin): is_fixed_size = True @classmethod @@ -30,7 +30,7 @@ def from_dict(cls, data: Dict[str, JSON]) -> Self: def to_dict(self) -> Dict[str, JSON]: return {"name": "crc32c"} - async def decode( + async def decode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, @@ -48,7 +48,7 @@ async def decode( ) return inner_bytes - async def encode( + async def encode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index f75f5b743e..567ac88102 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec +from zarr.codecs.mixins import BytesBytesCodecBatchMixin from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration, to_thread @@ -26,7 +26,7 @@ def parse_gzip_level(data: JSON) -> int: @dataclass(frozen=True) -class GzipCodec(BytesBytesCodec): +class GzipCodec(BytesBytesCodecBatchMixin): is_fixed_size = False level: int = 5 @@ -44,7 +44,7 @@ def from_dict(cls, data: Dict[str, JSON]) -> Self: def to_dict(self) -> Dict[str, JSON]: return {"name": "gzip", "configuration": {"level": self.level}} - async def decode( + async def decode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, @@ -52,7 +52,7 @@ async def decode( ) -> BytesLike: return await to_thread(GZip(self.level).decode, chunk_bytes) - async def encode( + async def encode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, diff --git a/src/zarr/codecs/mixins.py b/src/zarr/codecs/mixins.py new file mode 100644 index 0000000000..501f2c9fa5 --- /dev/null +++ b/src/zarr/codecs/mixins.py @@ -0,0 +1,217 @@ +from 
__future__ import annotations + +from abc import abstractmethod +from typing import Awaitable, Callable, Iterable, TypeVar + +import numpy as np + +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, + ByteGetter, + ByteSetter, + BytesBytesCodec, +) +from zarr.common import ArraySpec, BytesLike, SliceSelection, concurrent_map +from zarr.config import RuntimeConfiguration + + +CodecInput = TypeVar("CodecInput", bound=np.ndarray | BytesLike) +CodecOutput = TypeVar("CodecOutput", bound=np.ndarray | BytesLike) + + +async def batching_helper( + func: Callable[[CodecInput, ArraySpec, RuntimeConfiguration], Awaitable[CodecOutput | None]], + batch_info: Iterable[tuple[CodecInput | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, +) -> list[CodecOutput | None]: + return await concurrent_map( + [ + (chunk_array, chunk_spec, runtime_configuration) + for chunk_array, chunk_spec in batch_info + ], + noop_for_none(func), + runtime_configuration.concurrency, + ) + + +def noop_for_none( + func: Callable[[CodecInput, ArraySpec, RuntimeConfiguration], Awaitable[CodecOutput | None]], +) -> Callable[[CodecInput | None, ArraySpec, RuntimeConfiguration], Awaitable[CodecOutput | None]]: + async def wrap( + chunk: CodecInput | None, chunk_spec: ArraySpec, runtime_configuration: RuntimeConfiguration + ) -> CodecOutput | None: + if chunk is None: + return None + return await func(chunk, chunk_spec, runtime_configuration) + + return wrap + + +class ArrayArrayCodecBatchMixin(ArrayArrayCodec): + @abstractmethod + async def decode_single( + self, + chunk_array: np.ndarray, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray: + pass + + async def decode( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[np.ndarray | None]: + return await batching_helper( + 
self.decode_single, chunk_arrays_and_specs, runtime_configuration + ) + + @abstractmethod + async def encode_single( + self, + chunk_array: np.ndarray, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray | None: + pass + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[np.ndarray | None]: + return await batching_helper( + self.encode_single, chunk_arrays_and_specs, runtime_configuration + ) + + +class ArrayBytesCodecBatchMixin(ArrayBytesCodec): + @abstractmethod + async def decode_single( + self, + chunk_bytes: BytesLike, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray: + pass + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[np.ndarray | None]: + return await batching_helper( + self.decode_single, chunk_bytes_and_specs, runtime_configuration + ) + + @abstractmethod + async def encode_single( + self, + chunk_array: np.ndarray, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> BytesLike | None: + pass + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[BytesLike | None]: + return await batching_helper( + self.encode_single, chunk_arrays_and_specs, runtime_configuration + ) + + +class ArrayBytesCodecPartialDecodeBatchMixin(ArrayBytesCodecPartialDecodeMixin): + @abstractmethod + async def decode_partial_single( + self, + byte_getter: ByteGetter, + selection: SliceSelection, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray | None: + pass + + async def decode_partial( + self, + batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + runtime_configuration: 
RuntimeConfiguration, + ) -> Iterable[np.ndarray | None]: + return await concurrent_map( + [ + (byte_getter, selection, chunk_spec, runtime_configuration) + for byte_getter, selection, chunk_spec in batch_info + ], + self.decode_partial_single, + runtime_configuration.concurrency, + ) + + +class ArrayBytesCodecPartialEncodeBatchMixin(ArrayBytesCodecPartialEncodeMixin): + @abstractmethod + async def encode_partial_single( + self, + byte_setter: ByteSetter, + chunk_array: np.ndarray, + selection: SliceSelection, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> None: + pass + + async def encode_partial( + self, + batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> None: + await concurrent_map( + [ + (byte_setter, chunk_array, selection, chunk_spec, runtime_configuration) + for byte_setter, chunk_array, selection, chunk_spec in batch_info + ], + self.encode_partial_single, + runtime_configuration.concurrency, + ) + + +class BytesBytesCodecBatchMixin(BytesBytesCodec): + @abstractmethod + async def decode_single( + self, + chunk_bytes: BytesLike, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> BytesLike: + pass + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[BytesLike | None]: + return await batching_helper( + self.decode_single, chunk_bytes_and_specs, runtime_configuration + ) + + @abstractmethod + async def encode_single( + self, + chunk_array: BytesLike, + chunk_spec: ArraySpec, + runtime_configuration: RuntimeConfiguration, + ) -> BytesLike | None: + pass + + async def encode( + self, + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + runtime_configuration: RuntimeConfiguration, + ) -> Iterable[BytesLike | None]: + return await batching_helper( + self.encode_single, chunk_bytes_and_specs, 
runtime_configuration + ) diff --git a/src/zarr/codecs/pipeline/__init__.py b/src/zarr/codecs/pipeline/__init__.py index 2a83e36805..82439ba48b 100644 --- a/src/zarr/codecs/pipeline/__init__.py +++ b/src/zarr/codecs/pipeline/__init__.py @@ -3,4 +3,3 @@ from zarr.codecs.pipeline.core import CodecPipeline # noqa: F401 from zarr.codecs.pipeline.hybrid import HybridCodecPipeline # noqa: F401 from zarr.codecs.pipeline.batched import BatchedCodecPipeline # noqa: F401 -from zarr.codecs.pipeline.interleaved import InterleavedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/pipeline/batched.py b/src/zarr/codecs/pipeline/batched.py index 3284691462..72b493a365 100644 --- a/src/zarr/codecs/pipeline/batched.py +++ b/src/zarr/codecs/pipeline/batched.py @@ -82,17 +82,17 @@ async def decode( ) = self._codecs_with_resolved_metadata_batched(chunk_specs) for bb_codec, chunk_spec_batch in bb_codecs_with_spec[::-1]: - chunk_bytes_batch = await bb_codec.decode_batch( + chunk_bytes_batch = await bb_codec.decode( zip(chunk_bytes_batch, chunk_spec_batch), runtime_configuration ) ab_codec, chunk_spec_batch = ab_codec_with_spec - chunk_array_batch = await ab_codec.decode_batch( + chunk_array_batch = await ab_codec.decode( zip(chunk_bytes_batch, chunk_spec_batch), runtime_configuration ) for aa_codec, chunk_spec_batch in aa_codecs_with_spec[::-1]: - chunk_array_batch = await aa_codec.decode_batch( + chunk_array_batch = await aa_codec.decode( zip(chunk_array_batch, chunk_spec_batch), runtime_configuration ) @@ -105,7 +105,7 @@ async def decode_partial( ) -> Iterable[Optional[np.ndarray]]: assert self.supports_partial_decode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) - return await self.array_bytes_codec.decode_partial_batch(batch_info, runtime_configuration) + return await self.array_bytes_codec.decode_partial(batch_info, runtime_configuration) async def encode( self, @@ -117,18 +117,18 @@ async def encode( chunk_array_batch, chunk_specs = 
_unzip2(chunk_arrays_and_specs) for aa_codec in self.array_array_codecs: - chunk_array_batch = await aa_codec.encode_batch( + chunk_array_batch = await aa_codec.encode( zip(chunk_array_batch, chunk_specs), runtime_configuration ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode_batch( + chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs), runtime_configuration ) chunk_specs = resolve_batched(self.array_bytes_codec, chunk_specs) for bb_codec in self.bytes_bytes_codecs: - chunk_bytes_batch = await bb_codec.encode_batch( + chunk_bytes_batch = await bb_codec.encode( zip(chunk_bytes_batch, chunk_specs), runtime_configuration ) chunk_specs = resolve_batched(bb_codec, chunk_specs) @@ -142,7 +142,7 @@ async def encode_partial( ) -> None: assert self.supports_partial_encode assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) - await self.array_bytes_codec.encode_partial_batch(batch_info, runtime_configuration) + await self.array_bytes_codec.encode_partial(batch_info, runtime_configuration) async def read_batch( self, diff --git a/src/zarr/codecs/pipeline/interleaved.py b/src/zarr/codecs/pipeline/interleaved.py deleted file mode 100644 index 1f82efb44a..0000000000 --- a/src/zarr/codecs/pipeline/interleaved.py +++ /dev/null @@ -1,329 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -import numpy as np -from dataclasses import dataclass - -from zarr.abc.codec import ( - ByteGetter, - ByteSetter, - ArrayArrayCodec, - ArrayBytesCodec, - ArrayBytesCodecPartialDecodeMixin, - ArrayBytesCodecPartialEncodeMixin, - BytesBytesCodec, -) -from zarr.codecs.pipeline.core import CodecPipeline -from zarr.common import concurrent_map -from zarr.indexing import is_total_slice - -if TYPE_CHECKING: - from typing import List, Optional, Tuple, Iterable - from zarr.config import RuntimeConfiguration - from zarr.common import ArraySpec, 
BytesLike, SliceSelection - - -@dataclass(frozen=True) -class InterleavedCodecPipeline(CodecPipeline): - def _codecs_with_resolved_metadata( - self, chunk_spec: ArraySpec - ) -> Tuple[ - List[Tuple[ArrayArrayCodec, ArraySpec]], - Tuple[ArrayBytesCodec, ArraySpec], - List[Tuple[BytesBytesCodec, ArraySpec]], - ]: - aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, ArraySpec]] = [] - for aa_codec in self.array_array_codecs: - aa_codecs_with_spec.append((aa_codec, chunk_spec)) - chunk_spec = aa_codec.resolve_metadata(chunk_spec) - - ab_codec_with_spec = (self.array_bytes_codec, chunk_spec) - chunk_spec = self.array_bytes_codec.resolve_metadata(chunk_spec) - - bb_codecs_with_spec: List[Tuple[BytesBytesCodec, ArraySpec]] = [] - for bb_codec in self.bytes_bytes_codecs: - bb_codecs_with_spec.append((bb_codec, chunk_spec)) - chunk_spec = bb_codec.resolve_metadata(chunk_spec) - - return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) - - async def decode( - self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], - runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: - return await concurrent_map( - [ - (chunk_bytes, chunk_spec, runtime_configuration) - for chunk_bytes, chunk_spec in chunk_bytes_and_specs - ], - self.decode_single, - runtime_configuration.concurrency, - ) - - async def decode_single( - self, - chunk_bytes: Optional[BytesLike], - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: - if chunk_bytes is None: - return None - - ( - aa_codecs_with_spec, - ab_codec_with_spec, - bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(chunk_spec) - - for bb_codec, chunk_spec in bb_codecs_with_spec[::-1]: - chunk_bytes = await bb_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - - ab_codec, chunk_spec = ab_codec_with_spec - chunk_array = await ab_codec.decode(chunk_bytes, chunk_spec, runtime_configuration) - - for aa_codec, chunk_spec 
in aa_codecs_with_spec[::-1]: - chunk_array = await aa_codec.decode(chunk_array, chunk_spec, runtime_configuration) - - return chunk_array - - async def decode_partial( - self, - batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], - runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[np.ndarray]]: - return await concurrent_map( - [ - (byte_getter, selection, chunk_spec, runtime_configuration) - for byte_getter, selection, chunk_spec in batch_info - ], - self.decode_partial_single, - runtime_configuration.concurrency, - ) - - async def decode_partial_single( - self, - byte_getter: ByteGetter, - selection: SliceSelection, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[np.ndarray]: - assert self.supports_partial_decode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) - return await self.array_bytes_codec.decode_partial( - byte_getter, selection, chunk_spec, runtime_configuration - ) - - async def encode( - self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], - runtime_configuration: RuntimeConfiguration, - ) -> Iterable[Optional[BytesLike]]: - return await concurrent_map( - [ - (chunk_array, chunk_spec, runtime_configuration) - for chunk_array, chunk_spec in chunk_arrays_and_specs - ], - self.encode_single, - runtime_configuration.concurrency, - ) - - async def encode_single( - self, - chunk_array: Optional[np.ndarray], - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> Optional[BytesLike]: - if chunk_array is None: - return None - - ( - aa_codecs_with_spec, - ab_codec_with_spec, - bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata(chunk_spec) - - for aa_codec, chunk_spec in aa_codecs_with_spec: - chunk_array_maybe = await aa_codec.encode( - chunk_array, chunk_spec, runtime_configuration - ) - if chunk_array_maybe is None: - return None - chunk_array = chunk_array_maybe - - ab_codec, array_spec = 
ab_codec_with_spec - chunk_bytes_maybe = await ab_codec.encode(chunk_array, array_spec, runtime_configuration) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - for bb_codec, array_spec in bb_codecs_with_spec: - chunk_bytes_maybe = await bb_codec.encode( - chunk_bytes, array_spec, runtime_configuration - ) - if chunk_bytes_maybe is None: - return None - chunk_bytes = chunk_bytes_maybe - - return chunk_bytes - - async def encode_partial( - self, - batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - runtime_configuration: RuntimeConfiguration, - ) -> None: - await concurrent_map( - [ - (byte_setter, chunk_array, selection, chunk_spec, runtime_configuration) - for byte_setter, chunk_array, selection, chunk_spec in batch_info - ], - self.encode_partial_single, - runtime_configuration.concurrency, - ) - - async def encode_partial_single( - self, - byte_setter: ByteSetter, - chunk_array: np.ndarray, - selection: SliceSelection, - chunk_spec: ArraySpec, - runtime_configuration: RuntimeConfiguration, - ) -> None: - assert self.supports_partial_encode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) - await self.array_bytes_codec.encode_partial( - byte_setter, chunk_array, selection, chunk_spec, runtime_configuration - ) - - async def read_single( - self, - byte_getter: ByteGetter, - chunk_spec: ArraySpec, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - out: np.ndarray, - runtime_configuration: RuntimeConfiguration, - ) -> None: - if self.supports_partial_decode: - chunk_array = await self.decode_partial_single( - byte_getter, chunk_selection, chunk_spec, runtime_configuration - ) - if chunk_array is not None: - out[out_selection] = chunk_array - else: - out[out_selection] = chunk_spec.fill_value - else: - chunk_bytes = await byte_getter.get() - chunk_array = await self.decode_single(chunk_bytes, chunk_spec, runtime_configuration) - if chunk_array is not None: 
- tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = chunk_spec.fill_value - - async def read_batch( - self, - batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], - out: np.ndarray, - runtime_configuration: RuntimeConfiguration, - ) -> None: - await concurrent_map( - [ - ( - byte_getter, - chunk_spec, - chunk_selection, - out_selection, - out, - runtime_configuration, - ) - for byte_getter, chunk_spec, chunk_selection, out_selection in batch_info - ], - self.read_single, - runtime_configuration.concurrency, - ) - - async def write_single( - self, - byte_setter: ByteSetter, - chunk_spec: ArraySpec, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - value: np.ndarray, - runtime_configuration: RuntimeConfiguration, - ) -> None: - async def _write_chunk_to_store(chunk_array: np.ndarray) -> None: - if np.all(chunk_array == chunk_spec.fill_value): - # chunks that only contain fill_value will be removed - await byte_setter.delete() - else: - chunk_bytes = await self.encode_single( - chunk_array, chunk_spec, runtime_configuration - ) - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) - - if is_total_slice(chunk_selection, chunk_spec.shape): - # write entire chunks - if np.isscalar(value): - chunk_array = np.empty( - chunk_spec.shape, - dtype=chunk_spec.dtype, - ) - chunk_array.fill(value) - else: - chunk_array = value[out_selection] - await _write_chunk_to_store(chunk_array) - - elif self.supports_partial_encode: - await self.encode_partial_single( - byte_setter, - value[out_selection], - chunk_selection, - chunk_spec, - runtime_configuration, - ) - else: - # writing partial chunks - # read chunk first - chunk_bytes = await byte_setter.get() - - # merge new value - chunk_array_maybe = await self.decode_single( - chunk_bytes, chunk_spec, runtime_configuration - ) - if chunk_array_maybe is None: - chunk_array = np.empty( - 
chunk_spec.shape, - dtype=chunk_spec.dtype, - ) - chunk_array.fill(chunk_spec.fill_value) - else: - chunk_array = chunk_array_maybe.copy() # make a writable copy - chunk_array[chunk_selection] = value[out_selection] - - await _write_chunk_to_store(chunk_array) - - async def write_batch( - self, - batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], - value: np.ndarray, - runtime_configuration: RuntimeConfiguration, - ) -> None: - await concurrent_map( - [ - ( - byte_setter, - chunk_spec, - chunk_selection, - out_selection, - value, - runtime_configuration, - ) - for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info - ], - self.write_single, - runtime_configuration.concurrency, - ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8da250b9af..e514ba7263 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -6,17 +6,15 @@ import numpy as np -from zarr.abc.codec import ( - ByteGetter, - ByteSetter, - Codec, - ArrayBytesCodec, - ArrayBytesCodecPartialDecodeMixin, - ArrayBytesCodecPartialEncodeMixin, -) +from zarr.abc.codec import ByteGetter, ByteSetter, Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec -from zarr.codecs.pipeline import CodecPipeline, InterleavedCodecPipeline +from zarr.codecs.mixins import ( + ArrayBytesCodecBatchMixin, + ArrayBytesCodecPartialDecodeBatchMixin, + ArrayBytesCodecPartialEncodeBatchMixin, +) +from zarr.codecs.pipeline import CodecPipeline, HybridCodecPipeline from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, @@ -298,7 +296,9 @@ async def finalize( @dataclass(frozen=True) class ShardingCodec( - ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin + ArrayBytesCodecBatchMixin, + ArrayBytesCodecPartialDecodeBatchMixin, + ArrayBytesCodecPartialEncodeBatchMixin, ): chunk_shape: ChunkCoords codecs: CodecPipeline @@ -317,12 +317,12 @@ def 
__init__( codecs_parsed = ( parse_codecs(codecs) if codecs is not None - else InterleavedCodecPipeline.from_list([BytesCodec()]) + else HybridCodecPipeline.from_list([BytesCodec()]) ) index_codecs_parsed = ( parse_codecs(index_codecs) if index_codecs is not None - else InterleavedCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) + else HybridCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) ) index_location_parsed = ( parse_index_location(index_location) @@ -378,7 +378,7 @@ def validate(self, array_metadata: ArrayMetadata) -> None: + "shard's inner `chunk_shape`." ) - async def decode( + async def decode_single( self, shard_bytes: BytesLike, shard_spec: ArraySpec, @@ -424,7 +424,7 @@ async def decode( return out - async def decode_partial( + async def decode_partial_single( self, byte_getter: ByteGetter, selection: SliceSelection, @@ -490,7 +490,7 @@ async def decode_partial( return out - async def encode( + async def encode_single( self, shard_array: np.ndarray, shard_spec: ArraySpec, @@ -527,7 +527,7 @@ async def encode( return await shard_builder.finalize(self.index_location, self._encode_shard_index) - async def encode_partial( + async def encode_partial_single( self, byte_setter: ByteSetter, shard_array: np.ndarray, diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index c63327f6fc..b4c8402d80 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -3,7 +3,9 @@ from dataclasses import dataclass, replace +from zarr.codecs.mixins import ArrayArrayCodecBatchMixin from zarr.common import JSON, ArraySpec, ChunkCoordsLike, parse_named_configuration +from zarr.codecs.registry import register_codec if TYPE_CHECKING: from zarr.config import RuntimeConfiguration @@ -12,9 +14,6 @@ import numpy as np -from zarr.abc.codec import ArrayArrayCodec -from zarr.codecs.registry import register_codec - def parse_transpose_order(data: Union[JSON, Iterable[int]]) -> Tuple[int, ...]: if not isinstance(data, Iterable): @@ -25,7 
+24,7 @@ def parse_transpose_order(data: Union[JSON, Iterable[int]]) -> Tuple[int, ...]: @dataclass(frozen=True) -class TransposeCodec(ArrayArrayCodec): +class TransposeCodec(ArrayArrayCodecBatchMixin): is_fixed_size = True order: Tuple[int, ...] @@ -73,7 +72,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: fill_value=chunk_spec.fill_value, ) - async def decode( + async def decode_single( self, chunk_array: np.ndarray, chunk_spec: ArraySpec, @@ -85,7 +84,7 @@ async def decode( chunk_array = chunk_array.transpose(inverse_order) return chunk_array - async def encode( + async def encode_single( self, chunk_array: np.ndarray, chunk_spec: ArraySpec, diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index 41db850ab6..8ace10c541 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -5,7 +5,7 @@ from zstandard import ZstdCompressor, ZstdDecompressor -from zarr.abc.codec import BytesBytesCodec +from zarr.codecs.mixins import BytesBytesCodecBatchMixin from zarr.codecs.registry import register_codec from zarr.common import parse_named_configuration, to_thread @@ -31,7 +31,7 @@ def parse_checksum(data: JSON) -> bool: @dataclass(frozen=True) -class ZstdCodec(BytesBytesCodec): +class ZstdCodec(BytesBytesCodecBatchMixin): is_fixed_size = True level: int = 0 @@ -60,7 +60,7 @@ def _decompress(self, data: bytes) -> bytes: ctx = ZstdDecompressor() return ctx.decompress(data) - async def decode( + async def decode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, @@ -68,7 +68,7 @@ async def decode( ) -> BytesLike: return await to_thread(self._decompress, chunk_bytes) - async def encode( + async def encode_single( self, chunk_bytes: bytes, _chunk_spec: ArraySpec, From 99a1f93b2a9bab071e9238fae9edd5688a130c81 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 13 May 2024 22:15:03 +0200 Subject: [PATCH 11/21] use zarr.config for batch_size --- src/zarr/codecs/pipeline/hybrid.py | 6 ++---- src/zarr/config.py | 8 +++++++- 
tests/v3/test_config.py | 6 +++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/zarr/codecs/pipeline/hybrid.py b/src/zarr/codecs/pipeline/hybrid.py index f433b14773..84911ca21c 100644 --- a/src/zarr/codecs/pipeline/hybrid.py +++ b/src/zarr/codecs/pipeline/hybrid.py @@ -20,8 +20,6 @@ from typing_extensions import Self from zarr.common import ArraySpec, BytesLike, SliceSelection -DEFAULT_BATCH_SIZE = 1000 - T = TypeVar("T") @@ -35,7 +33,7 @@ def batched(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]: @dataclass(frozen=True) class HybridCodecPipeline(CodecPipeline): - batch_size: int # TODO: There needs to be a way of specifying this from the user code + batch_size: int batched_codec_pipeline: BatchedCodecPipeline @classmethod @@ -46,7 +44,7 @@ def from_list(cls, codecs: List[Codec], *, batch_size: Optional[int] = None) -> array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, bytes_bytes_codecs=bytes_bytes_codecs, - batch_size=batch_size or DEFAULT_BATCH_SIZE, + batch_size=batch_size or config.get("codec_pipeline.batch_size"), batched_codec_pipeline=BatchedCodecPipeline( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, diff --git a/src/zarr/config.py b/src/zarr/config.py index e546cb1c23..b0afe71e87 100644 --- a/src/zarr/config.py +++ b/src/zarr/config.py @@ -6,7 +6,13 @@ config = Config( "zarr", - defaults=[{"array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}}], + defaults=[ + { + "array": {"order": "C"}, + "async": {"concurrency": None, "timeout": None}, + "codec_pipeline": {"batch_size": 1000}, + } + ], ) diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py index 43acdec5fa..8c5a7d6f59 100644 --- a/tests/v3/test_config.py +++ b/tests/v3/test_config.py @@ -4,7 +4,11 @@ def test_config_defaults_set(): # regression test for available defaults assert config.defaults == [ - {"array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}} + { + 
"array": {"order": "C"}, + "async": {"concurrency": None, "timeout": None}, + "codec_pipeline": {"batch_size": 1000}, + } ] assert config.get("array.order") == "C" From 530e88b6cd63d4d91c44302f0bc0c6a37749bee4 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Tue, 14 May 2024 21:38:19 +0200 Subject: [PATCH 12/21] don't use global lru_cache --- src/zarr/codecs/sharding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 305079c6d2..d44d6f997d 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -333,6 +333,11 @@ def __init__( object.__setattr__(self, "index_codecs", index_codecs_parsed) object.__setattr__(self, "index_location", index_location_parsed) + # Use instance-local lru_cache to avoid memory leaks + object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) + object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) + object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + @classmethod def from_dict(cls, data: Dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") @@ -609,7 +614,6 @@ def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: 16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard) ) - @lru_cache def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: return ArraySpec( shape=chunks_per_shard + (2,), @@ -618,7 +622,6 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec: order="C", # Note: this is hard-coded for simplicity -- it is not surfaced into user code ) - @lru_cache def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec: return ArraySpec( shape=self.chunk_shape, @@ -627,7 +630,6 @@ def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec: order=shard_spec.order, ) - @lru_cache def 
_get_chunks_per_shard(self, shard_spec: ArraySpec) -> ChunkCoords: return tuple( s // c From 9eda5927f3c82e3a66f5002f8d44ebcefcf3e2df Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 15 May 2024 09:37:04 +0200 Subject: [PATCH 13/21] removes HybridCodecPipeline --- src/zarr/abc/codec.py | 73 ++++ src/zarr/array.py | 7 +- src/zarr/codecs/__init__.py | 2 +- src/zarr/codecs/batched_codec_pipeline.py | 446 ++++++++++++++++++++++ src/zarr/codecs/mixins.py | 59 +-- src/zarr/codecs/pipeline/__init__.py | 5 - src/zarr/codecs/pipeline/batched.py | 258 ------------- src/zarr/codecs/pipeline/core.py | 186 --------- src/zarr/codecs/pipeline/hybrid.py | 115 ------ src/zarr/codecs/sharding.py | 16 +- src/zarr/metadata.py | 9 +- 11 files changed, 550 insertions(+), 626 deletions(-) create mode 100644 src/zarr/codecs/batched_codec_pipeline.py delete mode 100644 src/zarr/codecs/pipeline/__init__.py delete mode 100644 src/zarr/codecs/pipeline/batched.py delete mode 100644 src/zarr/codecs/pipeline/core.py delete mode 100644 src/zarr/codecs/pipeline/hybrid.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 4dba928912..e3310c351b 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -108,3 +108,76 @@ async def encode( chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], ) -> Iterable[BytesLike | None]: pass + + +class CodecPipeline(Metadata): + @abstractmethod + def evolve(self, array_spec: ArraySpec) -> Self: + pass + + @classmethod + @abstractmethod + def from_list(cls, codecs: list[Codec]) -> Self: + pass + + @property + @abstractmethod + def supports_partial_decode(self) -> bool: + pass + + @property + @abstractmethod + def supports_partial_encode(self) -> bool: + pass + + @abstractmethod + def validate(self, array_metadata: ArrayMetadata) -> None: + pass + + @abstractmethod + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + pass + + @abstractmethod + async def decode( + self, + 
chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + pass + + @abstractmethod + async def decode_partial( + self, + batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + pass + + @abstractmethod + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + ) -> Iterable[BytesLike | None]: + pass + + @abstractmethod + async def encode_partial( + self, + batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + ) -> None: + pass + + @abstractmethod + async def read( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + ) -> None: + pass + + @abstractmethod + async def write( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + ) -> None: + pass diff --git a/src/zarr/array.py b/src/zarr/array.py index 999803ffb3..bc876d4cd9 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -17,12 +17,11 @@ import numpy as np import numpy.typing as npt -from zarr.abc.codec import Codec +from zarr.abc.codec import Codec, CodecPipeline # from zarr.array_v2 import ArrayV2 from zarr.codecs import BytesCodec -from zarr.codecs.pipeline import CodecPipeline from zarr.common import ( ZARR_JSON, ChunkCoords, @@ -199,7 +198,7 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # reading chunks and decoding them - await self.codecs.read_batch( + await self.codecs.read( [ ( self.store_path @@ -244,7 +243,7 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: value = value.astype(self.metadata.dtype, order="A") # merging with existing data and encoding chunks - await self.codecs.write_batch( + await self.codecs.write( [ ( self.store_path diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index f3a3e84bc7..dbeef59928 100644 --- 
a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -7,4 +7,4 @@ from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 -from zarr.codecs.pipeline import CodecPipeline, BatchedCodecPipeline, HybridCodecPipeline # noqa: F401 +from zarr.codecs.batched_codec_pipeline import BatchedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/batched_codec_pipeline.py b/src/zarr/codecs/batched_codec_pipeline.py new file mode 100644 index 0000000000..ced0e329c1 --- /dev/null +++ b/src/zarr/codecs/batched_codec_pipeline.py @@ -0,0 +1,446 @@ +from __future__ import annotations + +from itertools import islice +from typing import TYPE_CHECKING, Iterator, TypeVar, Iterable +from warnings import warn +import numpy as np +from dataclasses import dataclass + +from zarr.config import config +from zarr.abc.codec import ( + ByteGetter, + ByteSetter, + Codec, + CodecPipeline, + ArrayArrayCodec, + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, + BytesBytesCodec, +) +from zarr.codecs.registry import get_codec_class +from zarr.common import JSON, concurrent_map, parse_named_configuration +from zarr.indexing import is_total_slice +from zarr.metadata import ArrayMetadata + +if TYPE_CHECKING: + from typing_extensions import Self + from zarr.common import ArraySpec, BytesLike, SliceSelection + +T = TypeVar("T") +U = TypeVar("U") + + +def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: + out0: list[T] = [] + out1: list[U] = [] + for item0, item1 in iterable: + out0.append(item0) + out1.append(item1) + return (out0, out1) + + +def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]: + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + + +def resolve_batched(codec: Codec, 
chunk_specs: Iterable[ArraySpec]) -> Iterable[ArraySpec]: + return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + +@dataclass(frozen=True) +class BatchedCodecPipeline(CodecPipeline): + array_array_codecs: tuple[ArrayArrayCodec, ...] + array_bytes_codec: ArrayBytesCodec + bytes_bytes_codecs: tuple[BytesBytesCodec, ...] + batch_size: int + + @classmethod + def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = None) -> Self: + out: list[Codec] = [] + if not isinstance(data, Iterable): + raise TypeError(f"Expected iterable, got {type(data)}") + + for c in data: + if isinstance(c, Codec): + out.append(c) + else: + name_parsed, _ = parse_named_configuration(c, require_configuration=False) + out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type] + return cls.from_list(out, batch_size=batch_size) + + def to_dict(self) -> JSON: + return [c.to_dict() for c in self] + + def evolve(self, array_spec: ArraySpec) -> Self: + return type(self).from_list([c.evolve(array_spec) for c in self]) + + @staticmethod + def codecs_from_list( + codecs: list[Codec], + ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + from zarr.codecs.sharding import ShardingCodec + + if not any(isinstance(codec, ArrayBytesCodec) for codec in codecs): + raise ValueError("Exactly one array-to-bytes codec is required.") + + prev_codec: Codec | None = None + for codec in codecs: + if prev_codec is not None: + if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, ArrayBytesCodec): + raise ValueError( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " + + "1 ArrayBytesCodec is allowed." + ) + if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, BytesBytesCodec): + raise ValueError( + f"ArrayBytesCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." 
+ ) + if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, ArrayBytesCodec): + raise ValueError( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"ArrayBytesCodec '{type(prev_codec)}'." + ) + if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, BytesBytesCodec): + raise ValueError( + f"ArrayArrayCodec '{type(codec)}' cannot follow after " + + f"BytesBytesCodec '{type(prev_codec)}'." + ) + prev_codec = codec + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: + warn( + "Combining a `sharding_indexed` codec disables partial reads and " + + "writes, which may lead to inefficient performance." + ) + + return ( + tuple(codec for codec in codecs if isinstance(codec, ArrayArrayCodec)), + next(codec for codec in codecs if isinstance(codec, ArrayBytesCodec)), + tuple(codec for codec in codecs if isinstance(codec, BytesBytesCodec)), + ) + + @classmethod + def from_list(cls, codecs: list[Codec], *, batch_size: int | None = None) -> Self: + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) + + return cls( + array_array_codecs=array_array_codecs, + array_bytes_codec=array_bytes_codec, + bytes_bytes_codecs=bytes_bytes_codecs, + batch_size=batch_size or config.get("codec_pipeline.batch_size"), + ) + + @property + def supports_partial_decode(self) -> bool: + return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( + self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin + ) + + @property + def supports_partial_encode(self) -> bool: + return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( + self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin + ) + + def __iter__(self) -> Iterator[Codec]: + for aa_codec in self.array_array_codecs: + yield aa_codec + + yield self.array_bytes_codec + + for bb_codec in self.bytes_bytes_codecs: + yield bb_codec + + def validate(self, array_metadata: ArrayMetadata) -> 
None: + for codec in self: + codec.validate(array_metadata) + + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + for codec in self: + byte_length = codec.compute_encoded_size(byte_length, array_spec) + array_spec = codec.resolve_metadata(array_spec) + return byte_length + + def _codecs_with_resolved_metadata_batched( + self, chunk_specs: Iterable[ArraySpec] + ) -> tuple[ + list[tuple[ArrayArrayCodec, list[ArraySpec]]], + tuple[ArrayBytesCodec, list[ArraySpec]], + list[tuple[BytesBytesCodec, list[ArraySpec]]], + ]: + aa_codecs_with_spec: list[tuple[ArrayArrayCodec, list[ArraySpec]]] = [] + chunk_specs = list(chunk_specs) + for aa_codec in self.array_array_codecs: + aa_codecs_with_spec.append((aa_codec, chunk_specs)) + chunk_specs = [aa_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + ab_codec_with_spec = (self.array_bytes_codec, chunk_specs) + chunk_specs = [ + self.array_bytes_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs + ] + + bb_codecs_with_spec: list[tuple[BytesBytesCodec, list[ArraySpec]]] = [] + for bb_codec in self.bytes_bytes_codecs: + bb_codecs_with_spec.append((bb_codec, chunk_specs)) + chunk_specs = [bb_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] + + return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) + + async def decode_batch( + self, + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + chunk_bytes_batch: Iterable[BytesLike | None] + chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) + + ( + aa_codecs_with_spec, + ab_codec_with_spec, + bb_codecs_with_spec, + ) = self._codecs_with_resolved_metadata_batched(chunk_specs) + + for bb_codec, chunk_spec_batch in bb_codecs_with_spec[::-1]: + chunk_bytes_batch = await bb_codec.decode(zip(chunk_bytes_batch, chunk_spec_batch)) + + ab_codec, chunk_spec_batch = ab_codec_with_spec + chunk_array_batch = await 
ab_codec.decode(zip(chunk_bytes_batch, chunk_spec_batch)) + + for aa_codec, chunk_spec_batch in aa_codecs_with_spec[::-1]: + chunk_array_batch = await aa_codec.decode(zip(chunk_array_batch, chunk_spec_batch)) + + return chunk_array_batch + + async def decode_partial_batch( + self, + batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + assert self.supports_partial_decode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) + return await self.array_bytes_codec.decode_partial(batch_info) + + async def encode_batch( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + ) -> Iterable[BytesLike | None]: + chunk_array_batch: Iterable[np.ndarray | None] + chunk_specs: Iterable[ArraySpec] + chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) + + for aa_codec in self.array_array_codecs: + chunk_array_batch = await aa_codec.encode(zip(chunk_array_batch, chunk_specs)) + chunk_specs = resolve_batched(aa_codec, chunk_specs) + + chunk_bytes_batch = await self.array_bytes_codec.encode(zip(chunk_array_batch, chunk_specs)) + chunk_specs = resolve_batched(self.array_bytes_codec, chunk_specs) + + for bb_codec in self.bytes_bytes_codecs: + chunk_bytes_batch = await bb_codec.encode(zip(chunk_bytes_batch, chunk_specs)) + chunk_specs = resolve_batched(bb_codec, chunk_specs) + + return chunk_bytes_batch + + async def encode_partial_batch( + self, + batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + ) -> None: + assert self.supports_partial_encode + assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) + await self.array_bytes_codec.encode_partial(batch_info) + + async def read_batch( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + ) -> None: + if self.supports_partial_decode: + chunk_array_batch = await self.decode_partial_batch( + [ + (byte_getter, 
chunk_selection, chunk_spec) + for byte_getter, chunk_spec, chunk_selection, _ in batch_info + ] + ) + for chunk_array, (_, chunk_spec, _, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + out[out_selection] = chunk_array + else: + out[out_selection] = chunk_spec.fill_value + else: + chunk_bytes_batch = await concurrent_map( + [(byte_getter,) for byte_getter, _, _, _ in batch_info], + lambda byte_getter: byte_getter.get(), + config.get("async.concurrency"), + ) + chunk_array_batch = await self.decode_batch( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) + ], + ) + for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + chunk_array_batch, batch_info + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + out[out_selection] = tmp + else: + out[out_selection] = chunk_spec.fill_value + + async def write_batch( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + ) -> None: + if self.supports_partial_encode: + await self.encode_partial_batch( + [ + (byte_setter, value[out_selection], chunk_selection, chunk_spec) + for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info + ], + ) + + else: + # Read existing bytes if not total slice + async def _read_key(byte_setter: ByteSetter | None) -> BytesLike | None: + if byte_setter is None: + return None + return await byte_setter.get() + + chunk_bytes_batch: Iterable[BytesLike | None] + chunk_bytes_batch = await concurrent_map( + [ + (None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,) + for byte_setter, chunk_spec, chunk_selection, _ in batch_info + ], + _read_key, + config.get("async.concurrency"), + ) + chunk_array_batch = await self.decode_batch( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) + ], + ) + + def 
_merge_chunk_array( + existing_chunk_array: np.ndarray | None, + new_chunk_array_slice: np.ndarray, + chunk_spec: ArraySpec, + chunk_selection: SliceSelection, + ) -> np.ndarray: + if is_total_slice(chunk_selection, chunk_spec.shape): + return new_chunk_array_slice + if existing_chunk_array is None: + chunk_array = np.empty( + chunk_spec.shape, + dtype=chunk_spec.dtype, + ) + chunk_array.fill(chunk_spec.fill_value) + else: + chunk_array = existing_chunk_array.copy() # make a writable copy + chunk_array[chunk_selection] = new_chunk_array_slice + return chunk_array + + chunk_array_batch = [ + _merge_chunk_array(chunk_array, value[out_selection], chunk_spec, chunk_selection) + for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( + chunk_array_batch, batch_info + ) + ] + + chunk_array_batch = [ + None if np.all(chunk_array == chunk_spec.fill_value) else chunk_array + for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) + ] + + chunk_bytes_batch = await self.encode_batch( + [ + (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) + ], + ) + + async def _write_key(byte_setter: ByteSetter, chunk_bytes: BytesLike | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for chunk_bytes, (byte_setter, _, _, _) in zip(chunk_bytes_batch, batch_info) + ], + _write_key, + config.get("async.concurrency"), + ) + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + output: list[np.ndarray | None] = [] + for batch_info in batched(chunk_bytes_and_specs, self.batch_size): + output.extend(await self.decode_batch(batch_info)) + return output + + async def decode_partial( + self, + batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + ) -> Iterable[np.ndarray | None]: + 
output: list[np.ndarray | None] = [] + for single_batch_info in batched(batch_info, self.batch_size): + output.extend(await self.decode_partial_batch(single_batch_info)) + return output + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + ) -> Iterable[BytesLike | None]: + output: list[BytesLike | None] = [] + for single_batch_info in batched(chunk_arrays_and_specs, self.batch_size): + output.extend(await self.encode_batch(single_batch_info)) + return output + + async def encode_partial( + self, + batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + ) -> None: + for single_batch_info in batched(batch_info, self.batch_size): + await self.encode_partial_batch(single_batch_info) + + async def read( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], + out: np.ndarray, + ) -> None: + await concurrent_map( + [ + (single_batch_info, out) + for single_batch_info in batched(batch_info, self.batch_size) + ], + self.read_batch, + config.get("async.concurrency"), + ) + + async def write( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], + value: np.ndarray, + ) -> None: + await concurrent_map( + [ + (single_batch_info, value) + for single_batch_info in batched(batch_info, self.batch_size) + ], + self.write_batch, + config.get("async.concurrency"), + ) diff --git a/src/zarr/codecs/mixins.py b/src/zarr/codecs/mixins.py index b530558068..9538d5ddf3 100644 --- a/src/zarr/codecs/mixins.py +++ b/src/zarr/codecs/mixins.py @@ -46,60 +46,44 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | class ArrayArrayCodecBatchMixin(ArrayArrayCodec): @abstractmethod - async def decode_single( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, - ) -> np.ndarray: + async def decode_single(self, chunk_array: np.ndarray, chunk_spec: ArraySpec) -> np.ndarray: pass async def decode( - self, - 
chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] ) -> Iterable[np.ndarray | None]: return await batching_helper(self.decode_single, chunk_arrays_and_specs) @abstractmethod async def encode_single( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, + self, chunk_array: np.ndarray, chunk_spec: ArraySpec ) -> np.ndarray | None: pass async def encode( - self, - chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] ) -> Iterable[np.ndarray | None]: return await batching_helper(self.encode_single, chunk_arrays_and_specs) class ArrayBytesCodecBatchMixin(ArrayBytesCodec): @abstractmethod - async def decode_single( - self, - chunk_bytes: BytesLike, - chunk_spec: ArraySpec, - ) -> np.ndarray: + async def decode_single(self, chunk_bytes: BytesLike, chunk_spec: ArraySpec) -> np.ndarray: pass async def decode( - self, - chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] ) -> Iterable[np.ndarray | None]: return await batching_helper(self.decode_single, chunk_bytes_and_specs) @abstractmethod async def encode_single( - self, - chunk_array: np.ndarray, - chunk_spec: ArraySpec, + self, chunk_array: np.ndarray, chunk_spec: ArraySpec ) -> BytesLike | None: pass async def encode( - self, - chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], + self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] ) -> Iterable[BytesLike | None]: return await batching_helper(self.encode_single, chunk_arrays_and_specs) @@ -107,16 +91,12 @@ async def encode( class ArrayBytesCodecPartialDecodeBatchMixin(ArrayBytesCodecPartialDecodeMixin): @abstractmethod async def decode_partial_single( - self, - byte_getter: ByteGetter, - selection: SliceSelection, - chunk_spec: ArraySpec, 
+ self, byte_getter: ByteGetter, selection: SliceSelection, chunk_spec: ArraySpec ) -> np.ndarray | None: pass async def decode_partial( - self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], + self, batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]] ) -> Iterable[np.ndarray | None]: return await concurrent_map( [ @@ -140,8 +120,7 @@ async def encode_partial_single( pass async def encode_partial( - self, - batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], + self, batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]] ) -> None: await concurrent_map( [ @@ -155,29 +134,21 @@ async def encode_partial( class BytesBytesCodecBatchMixin(BytesBytesCodec): @abstractmethod - async def decode_single( - self, - chunk_bytes: BytesLike, - chunk_spec: ArraySpec, - ) -> BytesLike: + async def decode_single(self, chunk_bytes: BytesLike, chunk_spec: ArraySpec) -> BytesLike: pass async def decode( - self, - chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] ) -> Iterable[BytesLike | None]: return await batching_helper(self.decode_single, chunk_bytes_and_specs) @abstractmethod async def encode_single( - self, - chunk_array: BytesLike, - chunk_spec: ArraySpec, + self, chunk_array: BytesLike, chunk_spec: ArraySpec ) -> BytesLike | None: pass async def encode( - self, - chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], + self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] ) -> Iterable[BytesLike | None]: return await batching_helper(self.encode_single, chunk_bytes_and_specs) diff --git a/src/zarr/codecs/pipeline/__init__.py b/src/zarr/codecs/pipeline/__init__.py deleted file mode 100644 index 82439ba48b..0000000000 --- a/src/zarr/codecs/pipeline/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import annotations - -from zarr.codecs.pipeline.core import 
CodecPipeline # noqa: F401 -from zarr.codecs.pipeline.hybrid import HybridCodecPipeline # noqa: F401 -from zarr.codecs.pipeline.batched import BatchedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/pipeline/batched.py b/src/zarr/codecs/pipeline/batched.py deleted file mode 100644 index b6fd36b5c4..0000000000 --- a/src/zarr/codecs/pipeline/batched.py +++ /dev/null @@ -1,258 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, TypeVar, Iterable -import numpy as np -from dataclasses import dataclass - -from zarr.config import config -from zarr.abc.codec import ( - ByteGetter, - ByteSetter, - Codec, - ArrayArrayCodec, - ArrayBytesCodec, - ArrayBytesCodecPartialDecodeMixin, - ArrayBytesCodecPartialEncodeMixin, - BytesBytesCodec, -) -from zarr.codecs.pipeline.core import CodecPipeline -from zarr.common import concurrent_map -from zarr.indexing import is_total_slice - -if TYPE_CHECKING: - from typing import List, Optional, Tuple - from zarr.common import ArraySpec, BytesLike, SliceSelection - -T = TypeVar("T") -U = TypeVar("U") - - -def _unzip2(iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: - out0: list[T] = [] - out1: list[U] = [] - for item0, item1 in iterable: - out0.append(item0) - out1.append(item1) - return (out0, out1) - - -def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ArraySpec]: - return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] - - -@dataclass(frozen=True) -class BatchedCodecPipeline(CodecPipeline): - def _codecs_with_resolved_metadata_batched( - self, chunk_specs: Iterable[ArraySpec] - ) -> Tuple[ - List[Tuple[ArrayArrayCodec, List[ArraySpec]]], - Tuple[ArrayBytesCodec, List[ArraySpec]], - List[Tuple[BytesBytesCodec, List[ArraySpec]]], - ]: - aa_codecs_with_spec: List[Tuple[ArrayArrayCodec, List[ArraySpec]]] = [] - chunk_specs = list(chunk_specs) - for aa_codec in self.array_array_codecs: - aa_codecs_with_spec.append((aa_codec, chunk_specs)) - 
chunk_specs = [aa_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] - - ab_codec_with_spec = (self.array_bytes_codec, chunk_specs) - chunk_specs = [ - self.array_bytes_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs - ] - - bb_codecs_with_spec: List[Tuple[BytesBytesCodec, List[ArraySpec]]] = [] - for bb_codec in self.bytes_bytes_codecs: - bb_codecs_with_spec.append((bb_codec, chunk_specs)) - chunk_specs = [bb_codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs] - - return (aa_codecs_with_spec, ab_codec_with_spec, bb_codecs_with_spec) - - async def decode( - self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - chunk_bytes_batch: Iterable[Optional[BytesLike]] - chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) - - ( - aa_codecs_with_spec, - ab_codec_with_spec, - bb_codecs_with_spec, - ) = self._codecs_with_resolved_metadata_batched(chunk_specs) - - for bb_codec, chunk_spec_batch in bb_codecs_with_spec[::-1]: - chunk_bytes_batch = await bb_codec.decode(zip(chunk_bytes_batch, chunk_spec_batch)) - - ab_codec, chunk_spec_batch = ab_codec_with_spec - chunk_array_batch = await ab_codec.decode(zip(chunk_bytes_batch, chunk_spec_batch)) - - for aa_codec, chunk_spec_batch in aa_codecs_with_spec[::-1]: - chunk_array_batch = await aa_codec.decode(zip(chunk_array_batch, chunk_spec_batch)) - - return chunk_array_batch - - async def decode_partial( - self, - batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - assert self.supports_partial_decode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) - return await self.array_bytes_codec.decode_partial(batch_info) - - async def encode( - self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], - ) -> Iterable[Optional[BytesLike]]: - chunk_array_batch: Iterable[Optional[np.ndarray]] - chunk_specs: 
Iterable[ArraySpec] - chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) - - for aa_codec in self.array_array_codecs: - chunk_array_batch = await aa_codec.encode(zip(chunk_array_batch, chunk_specs)) - chunk_specs = resolve_batched(aa_codec, chunk_specs) - - chunk_bytes_batch = await self.array_bytes_codec.encode(zip(chunk_array_batch, chunk_specs)) - chunk_specs = resolve_batched(self.array_bytes_codec, chunk_specs) - - for bb_codec in self.bytes_bytes_codecs: - chunk_bytes_batch = await bb_codec.encode(zip(chunk_bytes_batch, chunk_specs)) - chunk_specs = resolve_batched(bb_codec, chunk_specs) - - return chunk_bytes_batch - - async def encode_partial( - self, - batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - ) -> None: - assert self.supports_partial_encode - assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) - await self.array_bytes_codec.encode_partial(batch_info) - - async def read_batch( - self, - batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], - out: np.ndarray, - ) -> None: - if self.supports_partial_decode: - chunk_array_batch = await self.decode_partial( - [ - (byte_getter, chunk_selection, chunk_spec) - for byte_getter, chunk_spec, chunk_selection, _ in batch_info - ] - ) - for chunk_array, (_, chunk_spec, _, out_selection) in zip( - chunk_array_batch, batch_info - ): - if chunk_array is not None: - out[out_selection] = chunk_array - else: - out[out_selection] = chunk_spec.fill_value - else: - chunk_bytes_batch = await concurrent_map( - [(byte_getter,) for byte_getter, _, _, _ in batch_info], - lambda byte_getter: byte_getter.get(), - config.get("async.concurrency"), - ) - chunk_array_batch = await self.decode( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) - ], - ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info - ): - if 
chunk_array is not None: - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = chunk_spec.fill_value - - async def write_batch( - self, - batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], - value: np.ndarray, - ) -> None: - if self.supports_partial_encode: - await self.encode_partial( - [ - (byte_setter, value[out_selection], chunk_selection, chunk_spec) - for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info - ], - ) - - else: - # Read existing bytes if not total slice - async def _read_key(byte_setter: Optional[ByteSetter]) -> Optional[BytesLike]: - if byte_setter is None: - return None - return await byte_setter.get() - - chunk_bytes_batch: Iterable[Optional[BytesLike]] - chunk_bytes_batch = await concurrent_map( - [ - (None if is_total_slice(chunk_selection, chunk_spec.shape) else byte_setter,) - for byte_setter, chunk_spec, chunk_selection, _ in batch_info - ], - _read_key, - config.get("async.concurrency"), - ) - chunk_array_batch = await self.decode( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, _, _) in zip(chunk_bytes_batch, batch_info) - ], - ) - - def _merge_chunk_array( - existing_chunk_array: Optional[np.ndarray], - new_chunk_array_slice: np.ndarray, - chunk_spec: ArraySpec, - chunk_selection: SliceSelection, - ) -> np.ndarray: - if is_total_slice(chunk_selection, chunk_spec.shape): - return new_chunk_array_slice - if existing_chunk_array is None: - chunk_array = np.empty( - chunk_spec.shape, - dtype=chunk_spec.dtype, - ) - chunk_array.fill(chunk_spec.fill_value) - else: - chunk_array = existing_chunk_array.copy() # make a writable copy - chunk_array[chunk_selection] = new_chunk_array_slice - return chunk_array - - chunk_array_batch = [ - _merge_chunk_array(chunk_array, value[out_selection], chunk_spec, chunk_selection) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection) in zip( - chunk_array_batch, batch_info - ) - ] - - 
chunk_array_batch = [ - None if np.all(chunk_array == chunk_spec.fill_value) else chunk_array - for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) - ] - - chunk_bytes_batch = await self.encode( - [ - (chunk_array, chunk_spec) - for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) - ], - ) - - async def _write_key(byte_setter: ByteSetter, chunk_bytes: Optional[BytesLike]) -> None: - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) - - await concurrent_map( - [ - (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, _, _, _) in zip(chunk_bytes_batch, batch_info) - ], - _write_key, - config.get("async.concurrency"), - ) diff --git a/src/zarr/codecs/pipeline/core.py b/src/zarr/codecs/pipeline/core.py deleted file mode 100644 index d6574e9436..0000000000 --- a/src/zarr/codecs/pipeline/core.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Iterable -import numpy as np -from dataclasses import dataclass -from warnings import warn - -from zarr.abc.codec import ( - ArrayArrayCodec, - ArrayBytesCodec, - ArrayBytesCodecPartialDecodeMixin, - ArrayBytesCodecPartialEncodeMixin, - ByteGetter, - ByteSetter, - BytesBytesCodec, - Codec, -) -from zarr.abc.metadata import Metadata -from zarr.codecs.registry import get_codec_class -from zarr.common import parse_named_configuration - -if TYPE_CHECKING: - from typing import Iterator, List, Optional, Tuple, Union - from typing_extensions import Self - from zarr.metadata import ArrayMetadata - from zarr.common import JSON, ArraySpec, BytesLike, SliceSelection - - -@dataclass(frozen=True) -class CodecPipeline(Metadata, ABC): - array_array_codecs: Tuple[ArrayArrayCodec, ...] - array_bytes_codec: ArrayBytesCodec - bytes_bytes_codecs: Tuple[BytesBytesCodec, ...] 
- - @classmethod - def from_dict(cls, data: Iterable[Union[JSON, Codec]]) -> Self: - out: List[Codec] = [] - if not isinstance(data, Iterable): - raise TypeError(f"Expected iterable, got {type(data)}") - - for c in data: - if isinstance(c, Codec): - out.append(c) - else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out.append(get_codec_class(name_parsed).from_dict(c)) # type: ignore[arg-type] - return cls.from_list(out) - - def to_dict(self) -> JSON: - return [c.to_dict() for c in self] - - def evolve(self, array_spec: ArraySpec) -> Self: - return type(self).from_list([c.evolve(array_spec) for c in self]) - - @staticmethod - def codecs_from_list( - codecs: List[Codec], - ) -> Tuple[Tuple[ArrayArrayCodec, ...], ArrayBytesCodec, Tuple[BytesBytesCodec, ...]]: - from zarr.codecs.sharding import ShardingCodec - - if not any(isinstance(codec, ArrayBytesCodec) for codec in codecs): - raise ValueError("Exactly one array-to-bytes codec is required.") - - prev_codec: Optional[Codec] = None - for codec in codecs: - if prev_codec is not None: - if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, ArrayBytesCodec): - raise ValueError( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}' because exactly " - + "1 ArrayBytesCodec is allowed." - ) - if isinstance(codec, ArrayBytesCodec) and isinstance(prev_codec, BytesBytesCodec): - raise ValueError( - f"ArrayBytesCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." - ) - if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, ArrayBytesCodec): - raise ValueError( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"ArrayBytesCodec '{type(prev_codec)}'." - ) - if isinstance(codec, ArrayArrayCodec) and isinstance(prev_codec, BytesBytesCodec): - raise ValueError( - f"ArrayArrayCodec '{type(codec)}' cannot follow after " - + f"BytesBytesCodec '{type(prev_codec)}'." 
- ) - prev_codec = codec - - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: - warn( - "Combining a `sharding_indexed` codec disables partial reads and " - + "writes, which may lead to inefficient performance." - ) - - return ( - tuple(codec for codec in codecs if isinstance(codec, ArrayArrayCodec)), - next(codec for codec in codecs if isinstance(codec, ArrayBytesCodec)), - tuple(codec for codec in codecs if isinstance(codec, BytesBytesCodec)), - ) - - @classmethod - def from_list(cls, codecs: List[Codec]) -> Self: - array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) - - return cls( - array_array_codecs=array_array_codecs, - array_bytes_codec=array_bytes_codec, - bytes_bytes_codecs=bytes_bytes_codecs, - ) - - @property - def supports_partial_decode(self) -> bool: - return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( - self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin - ) - - @property - def supports_partial_encode(self) -> bool: - return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( - self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin - ) - - def __iter__(self) -> Iterator[Codec]: - for aa_codec in self.array_array_codecs: - yield aa_codec - - yield self.array_bytes_codec - - for bb_codec in self.bytes_bytes_codecs: - yield bb_codec - - def validate(self, array_metadata: ArrayMetadata) -> None: - for codec in self: - codec.validate(array_metadata) - - def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: - for codec in self: - byte_length = codec.compute_encoded_size(byte_length, array_spec) - array_spec = codec.resolve_metadata(array_spec) - return byte_length - - @abstractmethod - async def decode( - self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - pass - - @abstractmethod - async def decode_partial( - self, 
- batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - pass - - @abstractmethod - async def encode( - self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], - ) -> Iterable[Optional[BytesLike]]: - pass - - @abstractmethod - async def encode_partial( - self, - batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - ) -> None: - pass - - @abstractmethod - async def read_batch( - self, - batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], - out: np.ndarray, - ) -> None: - pass - - @abstractmethod - async def write_batch( - self, - batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], - value: np.ndarray, - ) -> None: - pass diff --git a/src/zarr/codecs/pipeline/hybrid.py b/src/zarr/codecs/pipeline/hybrid.py deleted file mode 100644 index 84911ca21c..0000000000 --- a/src/zarr/codecs/pipeline/hybrid.py +++ /dev/null @@ -1,115 +0,0 @@ -from __future__ import annotations - -from itertools import islice -from typing import TYPE_CHECKING, TypeVar -import numpy as np -from dataclasses import dataclass - -from zarr.config import config -from zarr.abc.codec import ( - ByteGetter, - ByteSetter, - Codec, -) -from zarr.codecs.pipeline.batched import BatchedCodecPipeline -from zarr.codecs.pipeline.core import CodecPipeline -from zarr.common import concurrent_map - -if TYPE_CHECKING: - from typing import List, Optional, Tuple, Iterable - from typing_extensions import Self - from zarr.common import ArraySpec, BytesLike, SliceSelection - -T = TypeVar("T") - - -def batched(iterable: Iterable[T], n: int) -> Iterable[Tuple[T, ...]]: - if n < 1: - raise ValueError("n must be at least one") - it = iter(iterable) - while batch := tuple(islice(it, n)): - yield batch - - -@dataclass(frozen=True) -class HybridCodecPipeline(CodecPipeline): - batch_size: int - batched_codec_pipeline: BatchedCodecPipeline - - @classmethod - def 
from_list(cls, codecs: List[Codec], *, batch_size: Optional[int] = None) -> Self: - array_array_codecs, array_bytes_codec, bytes_bytes_codecs = cls.codecs_from_list(codecs) - - return cls( - array_array_codecs=array_array_codecs, - array_bytes_codec=array_bytes_codec, - bytes_bytes_codecs=bytes_bytes_codecs, - batch_size=batch_size or config.get("codec_pipeline.batch_size"), - batched_codec_pipeline=BatchedCodecPipeline( - array_array_codecs=array_array_codecs, - array_bytes_codec=array_bytes_codec, - bytes_bytes_codecs=bytes_bytes_codecs, - ), - ) - - async def decode( - self, - chunk_bytes_and_specs: Iterable[Tuple[Optional[BytesLike], ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - output: list[Optional[np.ndarray]] = [] - for batch_info in batched(chunk_bytes_and_specs, self.batch_size): - output.extend(await self.batched_codec_pipeline.decode(batch_info)) - return output - - async def decode_partial( - self, - batch_info: Iterable[Tuple[ByteGetter, SliceSelection, ArraySpec]], - ) -> Iterable[Optional[np.ndarray]]: - output: list[Optional[np.ndarray]] = [] - for single_batch_info in batched(batch_info, self.batch_size): - output.extend(await self.batched_codec_pipeline.decode_partial(single_batch_info)) - return output - - async def encode( - self, - chunk_arrays_and_specs: Iterable[Tuple[Optional[np.ndarray], ArraySpec]], - ) -> Iterable[Optional[BytesLike]]: - output: list[Optional[BytesLike]] = [] - for single_batch_info in batched(chunk_arrays_and_specs, self.batch_size): - output.extend(await self.batched_codec_pipeline.encode(single_batch_info)) - return output - - async def encode_partial( - self, - batch_info: Iterable[Tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - ) -> None: - for single_batch_info in batched(batch_info, self.batch_size): - await self.batched_codec_pipeline.encode_partial(single_batch_info) - - async def read_batch( - self, - batch_info: Iterable[Tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], - out: 
np.ndarray, - ) -> None: - await concurrent_map( - [ - (single_batch_info, out) - for single_batch_info in batched(batch_info, self.batch_size) - ], - self.batched_codec_pipeline.read_batch, - config.get("async.concurrency"), - ) - - async def write_batch( - self, - batch_info: Iterable[Tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], - value: np.ndarray, - ) -> None: - await concurrent_map( - [ - (single_batch_info, value) - for single_batch_info in batched(batch_info, self.batch_size) - ], - self.batched_codec_pipeline.write_batch, - config.get("async.concurrency"), - ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index d44d6f997d..a46a77e95e 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -6,7 +6,7 @@ import numpy as np -from zarr.abc.codec import ByteGetter, ByteSetter, Codec +from zarr.abc.codec import ByteGetter, ByteSetter, Codec, CodecPipeline from zarr.codecs.bytes import BytesCodec from zarr.codecs.crc32c_ import Crc32cCodec from zarr.codecs.mixins import ( @@ -14,7 +14,7 @@ ArrayBytesCodecPartialDecodeBatchMixin, ArrayBytesCodecPartialEncodeBatchMixin, ) -from zarr.codecs.pipeline import CodecPipeline, HybridCodecPipeline +from zarr.codecs.batched_codec_pipeline import BatchedCodecPipeline from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, @@ -315,12 +315,12 @@ def __init__( codecs_parsed = ( parse_codecs(codecs) if codecs is not None - else HybridCodecPipeline.from_list([BytesCodec()]) + else BatchedCodecPipeline.from_list([BytesCodec()]) ) index_codecs_parsed = ( parse_codecs(index_codecs) if index_codecs is not None - else HybridCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) + else BatchedCodecPipeline.from_list([BytesCodec(), Crc32cCodec()]) ) index_location_parsed = ( parse_index_location(index_location) @@ -410,7 +410,7 @@ async def decode_single( return out # decoding chunks and writing them into the output buffer - await 
self.codecs.read_batch( + await self.codecs.read( [ ( _ShardingByteGetter(shard_dict, chunk_coords), @@ -474,7 +474,7 @@ async def decode_partial_single( shard_dict[chunk_coords] = chunk_bytes # decoding chunks and writing them into the output buffer - await self.codecs.read_batch( + await self.codecs.read( [ ( _ShardingByteGetter(shard_dict, chunk_coords), @@ -509,7 +509,7 @@ async def encode_single( shard_builder = _ShardBuilder.create_empty(chunks_per_shard) - await self.codecs.write_batch( + await self.codecs.write( [ ( _ShardingByteSetter(shard_builder, chunk_coords), @@ -550,7 +550,7 @@ async def encode_partial_single( ) ) - await self.codecs.write_batch( + await self.codecs.write( [ ( _ShardingByteSetter(shard_dict, chunk_coords), diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 58d6d38277..356afe7800 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -11,11 +11,10 @@ if TYPE_CHECKING: - from typing import Any, Literal, Union, List, Optional, Tuple - from zarr.codecs import CodecPipeline + from typing import Literal, Union, List, Optional, Tuple -from zarr.abc.codec import Codec +from zarr.abc.codec import Codec, CodecPipeline from zarr.abc.metadata import Metadata from zarr.common import ( @@ -368,8 +367,8 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: - from zarr.codecs.pipeline.hybrid import HybridCodecPipeline + from zarr.codecs import BatchedCodecPipeline if not isinstance(data, Iterable): raise TypeError(f"Expected iterable, got {type(data)}") - return HybridCodecPipeline.from_dict(data) + return BatchedCodecPipeline.from_dict(data) From d9aa24f0b8ed7555f12d9e2c72879f871cbf3538 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 15 May 2024 13:29:31 +0200 Subject: [PATCH 14/21] generic codec classes --- src/zarr/abc/codec.py | 62 ++++++++------------- src/zarr/codecs/batched_codec_pipeline.py | 4 +- src/zarr/codecs/mixins.py | 68 
++++++----------------- src/zarr/codecs/registry.py | 1 - src/zarr/metadata.py | 4 +- 5 files changed, 47 insertions(+), 92 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index e3310c351b..083ce87e51 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,15 +1,16 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Iterable, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Generic, Iterable, Protocol, TypeVar, runtime_checkable import numpy as np from zarr.abc.metadata import Metadata +from zarr.common import BytesLike if TYPE_CHECKING: from typing_extensions import Self - from zarr.common import ArraySpec, BytesLike, SliceSelection + from zarr.common import ArraySpec, SliceSelection from zarr.metadata import ArrayMetadata @@ -27,7 +28,11 @@ async def set(self, value: BytesLike, byte_range: tuple[int, int] | None = None) async def delete(self) -> None: ... -class Codec(Metadata): +CodecInput = TypeVar("CodecInput", bound=np.ndarray | BytesLike) +CodecOutput = TypeVar("CodecOutput", bound=np.ndarray | BytesLike) + + +class _Codec(Generic[CodecInput, CodecOutput], Metadata): is_fixed_size: bool @abstractmethod @@ -43,37 +48,34 @@ def evolve(self, array_spec: ArraySpec) -> Self: def validate(self, array_metadata: ArrayMetadata) -> None: pass - -class ArrayArrayCodec(Codec): @abstractmethod async def decode( self, - chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], - ) -> Iterable[np.ndarray | None]: + chunks_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]], + ) -> Iterable[CodecInput | None]: pass @abstractmethod async def encode( self, - chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], - ) -> Iterable[np.ndarray | None]: + chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]], + ) -> Iterable[CodecOutput | None]: pass -class ArrayBytesCodec(Codec): - @abstractmethod - async def decode( - self, - 
chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], - ) -> Iterable[np.ndarray | None]: - pass +class ArrayArrayCodec(_Codec[np.ndarray, np.ndarray]): + pass - @abstractmethod - async def encode( - self, - chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], - ) -> Iterable[BytesLike | None]: - pass + +class ArrayBytesCodec(_Codec[np.ndarray, BytesLike]): + pass + + +class BytesBytesCodec(_Codec[BytesLike, BytesLike]): + pass + + +Codec = ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec class ArrayBytesCodecPartialDecodeMixin: @@ -94,22 +96,6 @@ async def encode_partial( pass -class BytesBytesCodec(Codec): - @abstractmethod - async def decode( - self, - chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], - ) -> Iterable[BytesLike | None]: - pass - - @abstractmethod - async def encode( - self, - chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], - ) -> Iterable[BytesLike | None]: - pass - - class CodecPipeline(Metadata): @abstractmethod def evolve(self, array_spec: ArraySpec) -> Self: diff --git a/src/zarr/codecs/batched_codec_pipeline.py b/src/zarr/codecs/batched_codec_pipeline.py index ced0e329c1..77ec468f30 100644 --- a/src/zarr/codecs/batched_codec_pipeline.py +++ b/src/zarr/codecs/batched_codec_pipeline.py @@ -66,7 +66,9 @@ def from_dict(cls, data: Iterable[JSON | Codec], *, batch_size: int | None = Non raise TypeError(f"Expected iterable, got {type(data)}") for c in data: - if isinstance(c, Codec): + if isinstance( + c, ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec + ): # Can't use Codec here because of mypy limitation out.append(c) else: name_parsed, _ = parse_named_configuration(c, require_configuration=False) diff --git a/src/zarr/codecs/mixins.py b/src/zarr/codecs/mixins.py index 9538d5ddf3..380540bbab 100644 --- a/src/zarr/codecs/mixins.py +++ b/src/zarr/codecs/mixins.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import 
Awaitable, Callable, Iterable, TypeVar +from typing import Awaitable, Callable, Generic, Iterable, TypeVar import numpy as np @@ -44,48 +44,38 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return wrap -class ArrayArrayCodecBatchMixin(ArrayArrayCodec): +class CodecBatchMixin(Generic[CodecInput, CodecOutput]): @abstractmethod - async def decode_single(self, chunk_array: np.ndarray, chunk_spec: ArraySpec) -> np.ndarray: + async def decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: pass async def decode( - self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] - ) -> Iterable[np.ndarray | None]: - return await batching_helper(self.decode_single, chunk_arrays_and_specs) + self, chunk_data_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]] + ) -> Iterable[CodecInput | None]: + return await batching_helper(self.decode_single, chunk_data_and_specs) @abstractmethod async def encode_single( - self, chunk_array: np.ndarray, chunk_spec: ArraySpec - ) -> np.ndarray | None: + self, chunk_data: CodecInput, chunk_spec: ArraySpec + ) -> CodecOutput | None: pass async def encode( - self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] - ) -> Iterable[np.ndarray | None]: - return await batching_helper(self.encode_single, chunk_arrays_and_specs) + self, chunk_data_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]] + ) -> Iterable[CodecOutput | None]: + return await batching_helper(self.encode_single, chunk_data_and_specs) -class ArrayBytesCodecBatchMixin(ArrayBytesCodec): - @abstractmethod - async def decode_single(self, chunk_bytes: BytesLike, chunk_spec: ArraySpec) -> np.ndarray: - pass +class ArrayArrayCodecBatchMixin(CodecBatchMixin[np.ndarray, np.ndarray], ArrayArrayCodec): + pass - async def decode( - self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] - ) -> Iterable[np.ndarray | None]: - return await 
batching_helper(self.decode_single, chunk_bytes_and_specs) - @abstractmethod - async def encode_single( - self, chunk_array: np.ndarray, chunk_spec: ArraySpec - ) -> BytesLike | None: - pass +class ArrayBytesCodecBatchMixin(CodecBatchMixin[np.ndarray, BytesLike], ArrayBytesCodec): + pass - async def encode( - self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]] - ) -> Iterable[BytesLike | None]: - return await batching_helper(self.encode_single, chunk_arrays_and_specs) + +class BytesBytesCodecBatchMixin(CodecBatchMixin[BytesLike, BytesLike], BytesBytesCodec): + pass class ArrayBytesCodecPartialDecodeBatchMixin(ArrayBytesCodecPartialDecodeMixin): @@ -130,25 +120,3 @@ async def encode_partial( self.encode_partial_single, config.get("async.concurrency"), ) - - -class BytesBytesCodecBatchMixin(BytesBytesCodec): - @abstractmethod - async def decode_single(self, chunk_bytes: BytesLike, chunk_spec: ArraySpec) -> BytesLike: - pass - - async def decode( - self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] - ) -> Iterable[BytesLike | None]: - return await batching_helper(self.decode_single, chunk_bytes_and_specs) - - @abstractmethod - async def encode_single( - self, chunk_array: BytesLike, chunk_spec: ArraySpec - ) -> BytesLike | None: - pass - - async def encode( - self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]] - ) -> Iterable[BytesLike | None]: - return await batching_helper(self.encode_single, chunk_bytes_and_specs) diff --git a/src/zarr/codecs/registry.py b/src/zarr/codecs/registry.py index 7d46041255..b981f1f36c 100644 --- a/src/zarr/codecs/registry.py +++ b/src/zarr/codecs/registry.py @@ -7,7 +7,6 @@ from importlib.metadata import EntryPoint, entry_points as get_entry_points - __codec_registry: Dict[str, Type[Codec]] = {} __lazy_load_codecs: Dict[str, EntryPoint] = {} diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index 356afe7800..d1e72a7600 100644 --- a/src/zarr/metadata.py +++ 
b/src/zarr/metadata.py @@ -347,12 +347,12 @@ def parse_node_type_array(data: Any) -> Literal["array"]: # todo: real validation -def parse_filters(data: Any) -> List[Codec]: +def parse_filters(data: Any) -> list[dict[str, JSON]]: return data # todo: real validation -def parse_compressor(data: Any) -> Codec: +def parse_compressor(data: Any) -> dict[str, JSON] | None: return data From a5fb71ede766fce6fb950ce5ca2c04c149ad1005 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 15 May 2024 13:38:37 +0200 Subject: [PATCH 15/21] default batch size = 1 --- src/zarr/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/config.py b/src/zarr/config.py index b0afe71e87..5b1640bd56 100644 --- a/src/zarr/config.py +++ b/src/zarr/config.py @@ -10,7 +10,7 @@ { "array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}, - "codec_pipeline": {"batch_size": 1000}, + "codec_pipeline": {"batch_size": 1}, } ], ) From efd9bce299c398f2108692f8f18109c168f4aa89 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 15 May 2024 13:57:51 +0200 Subject: [PATCH 16/21] default batch size = 1 --- tests/v3/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py index 8c5a7d6f59..aed9775d17 100644 --- a/tests/v3/test_config.py +++ b/tests/v3/test_config.py @@ -7,7 +7,7 @@ def test_config_defaults_set(): { "array": {"order": "C"}, "async": {"concurrency": None, "timeout": None}, - "codec_pipeline": {"batch_size": 1000}, + "codec_pipeline": {"batch_size": 1}, } ] assert config.get("array.order") == "C" From 38c436dda7922d0adafb28e7420a0ea2ab43ba35 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Wed, 15 May 2024 14:55:23 +0200 Subject: [PATCH 17/21] docs --- src/zarr/abc/codec.py | 259 +++++++++++++++++++--- src/zarr/codecs/batched_codec_pipeline.py | 23 +- src/zarr/codecs/mixins.py | 9 + 3 files changed, 242 insertions(+), 49 deletions(-) diff --git a/src/zarr/abc/codec.py 
b/src/zarr/abc/codec.py index 083ce87e51..3115b70bc2 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -33,124 +33,290 @@ async def delete(self) -> None: ... class _Codec(Generic[CodecInput, CodecOutput], Metadata): + """Generic base class for codecs. + Please use ArrayArrayCodec, ArrayBytesCodec or BytesBytesCodec for subclassing. + + Codecs can be registered via zarr.codecs.registry. + """ + is_fixed_size: bool @abstractmethod def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: - pass + """Given an input byte length, this method returns the output byte length. + Raises a NotImplementedError for codecs with variable-sized outputs (e.g. compressors). + + Parameters + ---------- + input_byte_length : int + chunk_spec : ArraySpec + + Returns + ------- + int + """ + ... def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + """Computed the spec of the chunk after it has been encoded by the codec. + This is important for codecs that change the shape, data type or fill value of a chunk. + The spec will then be used for subsequent codecs in the pipeline. + + Parameters + ---------- + chunk_spec : ArraySpec + + Returns + ------- + ArraySpec + """ return chunk_spec def evolve(self, array_spec: ArraySpec) -> Self: + """Fills in codec configuration parameters that can be automatically + inferred from the array metadata. + + Parameters + ---------- + chunk_spec : ArraySpec + + Returns + ------- + Self + """ return self def validate(self, array_metadata: ArrayMetadata) -> None: - pass + """Validates that the codec configuration is compatible with the array metadata. + Raises errors when the codec configuration is not compatible. + + Parameters + ---------- + array_metadata : ArrayMetadata + """ + ... @abstractmethod async def decode( self, chunks_and_specs: Iterable[tuple[CodecOutput | None, ArraySpec]], ) -> Iterable[CodecInput | None]: - pass + """Decodes a batch of chunks. 
+ Chunks can be None in which case they are ignored by the codec. + + Parameters + ---------- + chunks_and_specs : Iterable[tuple[CodecOutput | None, ArraySpec]] + Ordered set of encoded chunks with their accompanying chunk spec. + + Returns + ------- + Iterable[CodecInput | None] + """ + ... @abstractmethod async def encode( self, chunks_and_specs: Iterable[tuple[CodecInput | None, ArraySpec]], ) -> Iterable[CodecOutput | None]: - pass + """Encodes a batch of chunks. + Chunks can be None in which case they are ignored by the codec. + + Parameters + ---------- + chunks_and_specs : Iterable[tuple[CodecInput | None, ArraySpec]] + Ordered set of to-be-encoded chunks with their accompanying chunk spec. + + Returns + ------- + Iterable[CodecOutput | None] + """ + ... class ArrayArrayCodec(_Codec[np.ndarray, np.ndarray]): - pass + """Base class for array-to-array codecs.""" + + ... class ArrayBytesCodec(_Codec[np.ndarray, BytesLike]): - pass + """Base class for array-to-bytes codecs.""" + + ... class BytesBytesCodec(_Codec[BytesLike, BytesLike]): - pass + """Base class for bytes-to-bytes codecs.""" + + ... Codec = ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec class ArrayBytesCodecPartialDecodeMixin: + """Mixin for array-to-bytes codecs that implement partial decoding.""" + @abstractmethod async def decode_partial( self, batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], ) -> Iterable[np.ndarray | None]: - pass + """Partially decodes a batch of chunks. + This method determines parts of a chunk from the slice selection, + fetches these parts from the store (via ByteGetter) and decodes them. + + Parameters + ---------- + batch_info : Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]] + Ordered set of information about slices of encoded chunks. + The slice selection determines which parts of the chunk will be fetched. + The ByteGetter is used to fetch the necessary bytes. 
+ The chunk spec contains information about the construction of an array from the bytes. + + Returns + ------- + Iterable[np.ndarray | None] + """ + ... class ArrayBytesCodecPartialEncodeMixin: + """Mixin for array-to-bytes codecs that implement partial encoding.""" + @abstractmethod async def encode_partial( self, batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], ) -> None: - pass + """Partially encodes a batch of chunks. + This method determines parts of a chunk from the slice selection, encodes them and + writes these parts to the store (via ByteSetter). + If merging with existing chunk data in the store is necessary, this method will + read from the store first and perform the merge. + + Parameters + ---------- + batch_info : Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]] + Ordered set of information about slices of to-be-encoded chunks. + The slice selection determines which parts of the chunk will be encoded. + The ByteSetter is used to write the necessary bytes and fetch bytes for existing chunk data. + The chunk spec contains information about the chunk. + """ + ... class CodecPipeline(Metadata): + """Base class for implementing CodecPipeline. + A CodecPipeline implements the read and write paths for chunk data. + On the read path, it is responsible for fetching chunks from a store (via ByteGetter), + decoding them and assembling an output array. On the write path, it encodes the chunks + and writes them to a store (via ByteSetter).""" + @abstractmethod def evolve(self, array_spec: ArraySpec) -> Self: - pass + """Fills in codec configuration parameters that can be automatically + inferred from the array metadata. + + Parameters + ---------- + array_spec : ArraySpec + + Returns + ------- + Self + """ + ... @classmethod @abstractmethod def from_list(cls, codecs: list[Codec]) -> Self: - pass + """Creates a codec pipeline from a list of codecs. 
+ + Parameters + ---------- + codecs : list[Codec] + + Returns + ------- + Self + """ + ... @property @abstractmethod - def supports_partial_decode(self) -> bool: - pass + def supports_partial_decode(self) -> bool: ... @property @abstractmethod - def supports_partial_encode(self) -> bool: - pass + def supports_partial_encode(self) -> bool: ... @abstractmethod def validate(self, array_metadata: ArrayMetadata) -> None: - pass + """Validates that all codec configurations are compatible with the array metadata. + Raises errors when a codec configuration is not compatible. + + Parameters + ---------- + array_metadata : ArrayMetadata + """ + ... @abstractmethod def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: - pass + """Given an input byte length, this method returns the output byte length. + Raises a NotImplementedError for codecs with variable-sized outputs (e.g. compressors). + + Parameters + ---------- + input_byte_length : int + array_spec : ArraySpec + + Returns + ------- + int + """ + ... @abstractmethod async def decode( self, chunk_bytes_and_specs: Iterable[tuple[BytesLike | None, ArraySpec]], ) -> Iterable[np.ndarray | None]: - pass + """Decodes a batch of chunks. + Chunks can be None in which case they are ignored by the codec. - @abstractmethod - async def decode_partial( - self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], - ) -> Iterable[np.ndarray | None]: - pass + Parameters + ---------- + chunks_and_specs : Iterable[tuple[BytesLike | None, ArraySpec]] + Ordered set of encoded chunks with their accompanying chunk spec. + + Returns + ------- + Iterable[np.ndarray | None] + """ + ... @abstractmethod async def encode( self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], ) -> Iterable[BytesLike | None]: - pass + """Encodes a batch of chunks. + Chunks can be None in which case they are ignored by the codec. 
- @abstractmethod - async def encode_partial( - self, - batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - ) -> None: - pass + Parameters + ---------- + chunks_and_specs : Iterable[tuple[np.ndarray | None, ArraySpec]] + Ordered set of to-be-encoded chunks with their accompanying chunk spec. + + Returns + ------- + Iterable[BytesLike | None] + """ + ... @abstractmethod async def read( @@ -158,7 +324,20 @@ async def read( batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], out: np.ndarray, ) -> None: - pass + """Reads chunk data from the store, decodes it and writes it into an output array. + Partial decoding may be utilized if the codecs and stores support it. + + Parameters + ---------- + batch_info : Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]] + Ordered set of information about the chunks. + The first slice selection determines which parts of the chunk will be fetched. + The second slice selection determines where in the output array the chunk data will be written. + The ByteGetter is used to fetch the necessary bytes. + The chunk spec contains information about the construction of an array from the bytes. + out : np.ndarray + """ + ... @abstractmethod async def write( @@ -166,4 +345,18 @@ async def write( batch_info: Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]], value: np.ndarray, ) -> None: - pass + """Encodes chunk data and writes it to the store. + Merges with existing chunk data by reading first, if necessary. + Partial encoding may be utilized if the codecs and stores support it. + + Parameters + ---------- + batch_info : Iterable[tuple[ByteSetter, ArraySpec, SliceSelection, SliceSelection]] + Ordered set of information about the chunks. + The first slice selection determines which parts of the chunk will be encoded. + The second slice selection determines where in the value array the chunk data is located. 
+ The ByteSetter is used to fetch and write the necessary bytes. + The chunk spec contains information about the chunk. + value : np.ndarray + """ + ... diff --git a/src/zarr/codecs/batched_codec_pipeline.py b/src/zarr/codecs/batched_codec_pipeline.py index 77ec468f30..45b7289cea 100644 --- a/src/zarr/codecs/batched_codec_pipeline.py +++ b/src/zarr/codecs/batched_codec_pipeline.py @@ -54,6 +54,13 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[ @dataclass(frozen=True) class BatchedCodecPipeline(CodecPipeline): + """Default codec pipeline. + + This batched codec pipeline divides the chunk batches into batches of a configurable + batch size ("mini-batch"). Fetching, decoding, encoding and storing are performed in + lock step for each mini-batch. Multiple mini-batches are processing concurrently. + """ + array_array_codecs: tuple[ArrayArrayCodec, ...] array_bytes_codec: ArrayBytesCodec bytes_bytes_codecs: tuple[BytesBytesCodec, ...] @@ -394,15 +401,6 @@ async def decode( output.extend(await self.decode_batch(batch_info)) return output - async def decode_partial( - self, - batch_info: Iterable[tuple[ByteGetter, SliceSelection, ArraySpec]], - ) -> Iterable[np.ndarray | None]: - output: list[np.ndarray | None] = [] - for single_batch_info in batched(batch_info, self.batch_size): - output.extend(await self.decode_partial_batch(single_batch_info)) - return output - async def encode( self, chunk_arrays_and_specs: Iterable[tuple[np.ndarray | None, ArraySpec]], @@ -412,13 +410,6 @@ async def encode( output.extend(await self.encode_batch(single_batch_info)) return output - async def encode_partial( - self, - batch_info: Iterable[tuple[ByteSetter, np.ndarray, SliceSelection, ArraySpec]], - ) -> None: - for single_batch_info in batched(batch_info, self.batch_size): - await self.encode_partial_batch(single_batch_info) - async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SliceSelection, SliceSelection]], diff --git 
a/src/zarr/codecs/mixins.py b/src/zarr/codecs/mixins.py index 380540bbab..7468f520bf 100644 --- a/src/zarr/codecs/mixins.py +++ b/src/zarr/codecs/mixins.py @@ -45,6 +45,15 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | class CodecBatchMixin(Generic[CodecInput, CodecOutput]): + """The default interface from the Codec class expects batches of codecs. + However, many codec implementation operate on single codecs. + This mixin provides abstract methods for decode_single and encode_single and + implements batching through concurrent processing. + + Use ArrayArrayCodecBatchMixin, ArrayBytesCodecBatchMixin and BytesBytesCodecBatchMixin + for subclassing. + """ + @abstractmethod async def decode_single(self, chunk_data: CodecOutput, chunk_spec: ArraySpec) -> CodecInput: pass From 3a85a0a3954023234f198cdbfe4ae27d51e3ca56 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 16 May 2024 09:52:15 +0200 Subject: [PATCH 18/21] Update src/zarr/codecs/batched_codec_pipeline.py Co-authored-by: Joe Hamman --- src/zarr/codecs/batched_codec_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/zarr/codecs/batched_codec_pipeline.py b/src/zarr/codecs/batched_codec_pipeline.py index 45b7289cea..90f1089395 100644 --- a/src/zarr/codecs/batched_codec_pipeline.py +++ b/src/zarr/codecs/batched_codec_pipeline.py @@ -126,7 +126,8 @@ def codecs_from_list( if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " - + "writes, which may lead to inefficient performance." 
+ + "writes, which may lead to inefficient performance.", + stacklevel=3 ) return ( From f33e66a7f93ddf61e5b220580bce406c738ac790 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 16 May 2024 11:50:44 +0200 Subject: [PATCH 19/21] mv batched_codec_pipeline -> pipeline --- src/zarr/codecs/__init__.py | 2 +- ...{batched_codec_pipeline.py => pipeline.py} | 20 +++++++++++++++++++ src/zarr/codecs/sharding.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) rename src/zarr/codecs/{batched_codec_pipeline.py => pipeline.py} (93%) diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index dbeef59928..959a85af57 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -7,4 +7,4 @@ from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 -from zarr.codecs.batched_codec_pipeline import BatchedCodecPipeline # noqa: F401 +from zarr.codecs.pipeline import BatchedCodecPipeline # noqa: F401 diff --git a/src/zarr/codecs/batched_codec_pipeline.py b/src/zarr/codecs/pipeline.py similarity index 93% rename from src/zarr/codecs/batched_codec_pipeline.py rename to src/zarr/codecs/pipeline.py index 45b7289cea..f20d90b68c 100644 --- a/src/zarr/codecs/batched_codec_pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -148,12 +148,32 @@ def from_list(cls, codecs: list[Codec], *, batch_size: int | None = None) -> Sel @property def supports_partial_decode(self) -> bool: + """Determines whether the codec pipeline supports partial decoding. + + Currently, only codec pipelines with a single ArrayBytesCodec that supports + partial decoding can support partial decoding. 
This limitation is due to the fact + that ArrayArrayCodecs can change the slice selection leading to non-contiguous + slices and BytesBytesCodecs can change the chunk bytes in a way that slice + selections cannot be attributed to byte ranges anymore which renders partial + decoding infeasible. + + This limitation may be softened in the future.""" return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin ) @property def supports_partial_encode(self) -> bool: + """Determines whether the codec pipeline supports partial encoding. + + Currently, only codec pipelines with a single ArrayBytesCodec that supports + partial encoding can support partial encoding. This limitation is due to the fact + that ArrayArrayCodecs can change the slice selection leading to non-contiguous + slices and BytesBytesCodecs can change the chunk bytes in a way that slice + selections cannot be attributed to byte ranges anymore which renders partial + encoding infeasible. 
+ + This limitation may be softened in the future.""" return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance( self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin ) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a46a77e95e..0606e31a40 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -14,7 +14,7 @@ ArrayBytesCodecPartialDecodeBatchMixin, ArrayBytesCodecPartialEncodeBatchMixin, ) -from zarr.codecs.batched_codec_pipeline import BatchedCodecPipeline +from zarr.codecs.pipeline import BatchedCodecPipeline from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, From 95ae4b6d05ed62adf4b79eda3f6fc4553abf5fdb Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 16 May 2024 11:51:10 +0200 Subject: [PATCH 20/21] Remove ArrayV2 (#1857) * adds wrapper codecs for the v2 codec pipeline * encode_chunk_key * refactor ArrayV2 away * empty zattrs * Apply suggestions from code review Co-authored-by: Davis Bennett * unify ArrayMetadata * abstract ArrayMetadata * unified Array.create * use zarr.config for batch_size * __init__.py aktualisieren Co-authored-by: Joe Hamman * ruff --------- Co-authored-by: Davis Bennett Co-authored-by: Joe Hamman --- src/zarr/__init__.py | 11 +- src/zarr/abc/codec.py | 17 +- src/zarr/abc/store.py | 36 ++- src/zarr/array.py | 383 ++++++++++++++++++-------- src/zarr/array_v2.py | 516 ------------------------------------ src/zarr/attributes.py | 13 +- src/zarr/chunk_grids.py | 12 +- src/zarr/codecs/_v2.py | 100 +++++++ src/zarr/codecs/sharding.py | 13 +- src/zarr/group.py | 67 +++-- src/zarr/indexing.py | 18 +- src/zarr/metadata.py | 185 ++++++++++--- tests/v3/test_group.py | 15 +- tests/v3/test_v2.py | 28 ++ 14 files changed, 672 insertions(+), 742 deletions(-) delete mode 100644 src/zarr/array_v2.py create mode 100644 src/zarr/codecs/_v2.py create mode 100644 tests/v3/test_v2.py diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py 
index 65daae8f6d..00c01560f4 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -1,10 +1,7 @@ from __future__ import annotations -from typing import Union - import zarr.codecs # noqa: F401 from zarr.array import Array, AsyncArray -from zarr.array_v2 import ArrayV2 from zarr.config import config # noqa: F401 from zarr.group import AsyncGroup, Group from zarr.store import ( @@ -18,9 +15,7 @@ assert not __version__.startswith("0.0.0") -async def open_auto_async( - store: StoreLike, -) -> Union[AsyncArray, AsyncGroup]: +async def open_auto_async(store: StoreLike) -> AsyncArray | AsyncGroup: store_path = make_store_path(store) try: return await AsyncArray.open(store_path) @@ -28,9 +23,7 @@ async def open_auto_async( return await AsyncGroup.open(store_path) -def open_auto( - store: StoreLike, -) -> Union[Array, ArrayV2, Group]: +def open_auto(store: StoreLike) -> Array | Group: object = _sync( open_auto_async(store), ) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 3115b70bc2..a32790d8e1 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,10 +1,11 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, Iterable, Protocol, TypeVar, runtime_checkable +from typing import TYPE_CHECKING, Generic, Iterable, TypeVar import numpy as np from zarr.abc.metadata import Metadata +from zarr.abc.store import ByteGetter, ByteSetter from zarr.common import BytesLike @@ -14,20 +15,6 @@ from zarr.metadata import ArrayMetadata -@runtime_checkable -class ByteGetter(Protocol): - async def get(self, byte_range: tuple[int, int | None] | None = None) -> BytesLike | None: ... - - -@runtime_checkable -class ByteSetter(Protocol): - async def get(self, byte_range: tuple[int, int | None] | None = None) -> BytesLike | None: ... - - async def set(self, value: BytesLike, byte_range: tuple[int, int] | None = None) -> None: ... - - async def delete(self) -> None: ... 
- - CodecInput = TypeVar("CodecInput", bound=np.ndarray | BytesLike) CodecOutput = TypeVar("CodecOutput", bound=np.ndarray | BytesLike) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 3d9550f733..6cc01681a9 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -1,7 +1,8 @@ from abc import abstractmethod, ABC from collections.abc import AsyncGenerator +from typing import List, Protocol, Tuple, Optional, runtime_checkable -from typing import List, Tuple, Optional +from zarr.common import BytesLike class Store(ABC): @@ -61,13 +62,13 @@ def supports_writes(self) -> bool: ... @abstractmethod - async def set(self, key: str, value: bytes) -> None: + async def set(self, key: str, value: BytesLike) -> None: """Store a (key, value) pair. Parameters ---------- key : str - value : bytes + value : BytesLike """ ... @@ -88,12 +89,12 @@ def supports_partial_writes(self) -> bool: ... @abstractmethod - async def set_partial_values(self, key_start_values: List[Tuple[str, int, bytes]]) -> None: + async def set_partial_values(self, key_start_values: list[tuple[str, int, BytesLike]]) -> None: """Store values at a given key, starting at byte range_start. Parameters ---------- - key_start_values : list[tuple[str, int, bytes]] + key_start_values : list[tuple[str, int, BytesLike]] set of key, range_start, values triples, a key may occur multiple times with different range_starts, range_starts (considering the length of the respective values) must not specify overlapping ranges for the same key @@ -145,3 +146,28 @@ def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: AsyncGenerator[str, None] """ ... + + +@runtime_checkable +class ByteGetter(Protocol): + async def get( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: ... + + +@runtime_checkable +class ByteSetter(Protocol): + async def get( + self, byte_range: Optional[Tuple[int, Optional[int]]] = None + ) -> Optional[BytesLike]: ... 
+ + async def set(self, value: BytesLike, byte_range: Optional[Tuple[int, int]] = None) -> None: ... + + async def delete(self) -> None: ... + + +async def set_or_delete(byte_setter: ByteSetter, value: BytesLike | None) -> None: + if value is None: + await byte_setter.delete() + else: + await byte_setter.set(value) diff --git a/src/zarr/array.py b/src/zarr/array.py index 83dc1757da..88fbd01ef9 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -10,20 +10,26 @@ # 1. Was splitting the array into two classes really necessary? +from asyncio import gather from dataclasses import dataclass, replace import json -from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union +from typing import Any, Iterable, Literal import numpy as np import numpy.typing as npt -from zarr.abc.codec import Codec, CodecPipeline +from zarr.abc.codec import Codec +from zarr.abc.store import set_or_delete # from zarr.array_v2 import ArrayV2 +from zarr.attributes import Attributes from zarr.codecs import BytesCodec from zarr.common import ( + JSON, ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, ChunkCoords, Selection, ZarrFormat, @@ -31,10 +37,10 @@ ) from zarr.config import config -from zarr.indexing import BasicIndexer, all_chunk_coords +from zarr.indexing import BasicIndexer from zarr.chunk_grids import RegularChunkGrid -from zarr.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding -from zarr.metadata import ArrayMetadata, parse_indexing_order +from zarr.chunk_key_encodings import ChunkKeyEncoding, DefaultChunkKeyEncoding, V2ChunkKeyEncoding +from zarr.metadata import ArrayMetadata, ArrayV3Metadata, ArrayV2Metadata, parse_indexing_order from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import sync @@ -43,9 +49,11 @@ def parse_array_metadata(data: Any) -> ArrayMetadata: if isinstance(data, ArrayMetadata): return data elif isinstance(data, dict): - return ArrayMetadata.from_dict(data) - else: - raise TypeError + if data["zarr_format"] == 
3: + return ArrayV3Metadata.from_dict(data) + elif data["zarr_format"] == 2: + return ArrayV2Metadata.from_dict(data) + raise TypeError @dataclass(frozen=True) @@ -54,10 +62,6 @@ class AsyncArray: store_path: StorePath order: Literal["C", "F"] - @property - def codecs(self) -> CodecPipeline: - return self.metadata.codecs - def __init__( self, metadata: ArrayMetadata, @@ -76,21 +80,116 @@ async def create( cls, store: StoreLike, *, + # v2 and v3 shape: ChunkCoords, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, - fill_value: Optional[Any] = None, - chunk_key_encoding: Union[ - Tuple[Literal["default"], Literal[".", "/"]], - Tuple[Literal["v2"], Literal[".", "/"]], - ] = ("default", "/"), - codecs: Optional[Iterable[Union[Codec, Dict[str, Any]]]] = None, - dimension_names: Optional[Iterable[str]] = None, - attributes: Optional[Dict[str, Any]] = None, - exists_ok: bool = False, zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: Literal["C", "F"] | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime + exists_ok: bool = False, ) -> AsyncArray: store_path = make_store_path(store) + + if chunk_shape is None: + if chunks is None: + raise ValueError("Either chunk_shape or chunks needs to be provided.") + chunk_shape = chunks + elif chunks is not None: + raise ValueError("Only one of chunk_shape or chunks must be provided.") + + if zarr_format == 3: + if dimension_separator is not None: + raise ValueError( + "dimension_separator 
cannot be used for arrays with version 3. Use chunk_key_encoding instead." + ) + if order is not None: + raise ValueError( + "order cannot be used for arrays with version 3. Use a transpose codec instead." + ) + if filters is not None: + raise ValueError( + "filters cannot be used for arrays with version 3. Use array-to-array codecs instead." + ) + if compressor is not None: + raise ValueError( + "compressor cannot be used for arrays with version 3. Use bytes-to-bytes codecs instead." + ) + return await cls._create_v3( + store_path, + shape=shape, + dtype=dtype, + chunk_shape=chunk_shape, + fill_value=fill_value, + chunk_key_encoding=chunk_key_encoding, + codecs=codecs, + dimension_names=dimension_names, + attributes=attributes, + exists_ok=exists_ok, + ) + elif zarr_format == 2: + if codecs is not None: + raise ValueError( + "codecs cannot be used for arrays with version 2. Use filters and compressor instead." + ) + if chunk_key_encoding is not None: + raise ValueError( + "chunk_key_encoding cannot be used for arrays with version 2. Use dimension_separator instead." + ) + if dimension_names is not None: + raise ValueError("dimension_names cannot be used for arrays with version 2.") + return await cls._create_v2( + store_path, + shape=shape, + dtype=dtype, + chunks=chunk_shape, + dimension_separator=dimension_separator, + fill_value=fill_value, + order=order, + filters=filters, + compressor=compressor, + attributes=attributes, + exists_ok=exists_ok, + ) + else: + raise ValueError(f"Insupported zarr_format. 
Got: {zarr_format}") + + @classmethod + async def _create_v3( + cls, + store_path: StorePath, + *, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunk_shape: ChunkCoords, + fill_value: Any | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + attributes: dict[str, JSON] | None = None, + exists_ok: bool = False, + ) -> AsyncArray: if not exists_ok: assert not await (store_path / ZARR_JSON).exists() @@ -102,36 +201,86 @@ async def create( else: fill_value = 0 - metadata = ArrayMetadata( - shape=shape, - data_type=dtype, - chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), - chunk_key_encoding=( + if chunk_key_encoding is None: + chunk_key_encoding = ("default", "/") + if isinstance(chunk_key_encoding, tuple): + chunk_key_encoding = ( V2ChunkKeyEncoding(separator=chunk_key_encoding[1]) if chunk_key_encoding[0] == "v2" else DefaultChunkKeyEncoding(separator=chunk_key_encoding[1]) - ), + ) + + metadata = ArrayV3Metadata( + shape=shape, + data_type=dtype, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + chunk_key_encoding=chunk_key_encoding, fill_value=fill_value, codecs=codecs, dimension_names=tuple(dimension_names) if dimension_names else None, attributes=attributes or {}, ) - array = cls( - metadata=metadata, - store_path=store_path, - ) + array = cls(metadata=metadata, store_path=store_path) + + await array._save_metadata(metadata) + return array + + @classmethod + async def _create_v2( + cls, + store_path: StorePath, + *, + shape: ChunkCoords, + dtype: npt.DTypeLike, + chunks: ChunkCoords, + dimension_separator: Literal[".", "/"] | None = None, + fill_value: None | int | float = None, + order: Literal["C", "F"] | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + 
attributes: dict[str, JSON] | None = None, + exists_ok: bool = False, + ) -> AsyncArray: + import numcodecs + + if not exists_ok: + assert not await (store_path / ZARRAY_JSON).exists() - await array._save_metadata() + if order is None: + order = "C" + + if dimension_separator is None: + dimension_separator = "." + + metadata = ArrayV2Metadata( + shape=shape, + dtype=np.dtype(dtype), + chunks=chunks, + order=order, + dimension_separator=dimension_separator, + fill_value=0 if fill_value is None else fill_value, + compressor=( + numcodecs.get_codec(compressor).get_config() if compressor is not None else None + ), + filters=( + [numcodecs.get_codec(filter).get_config() for filter in filters] + if filters is not None + else None + ), + attributes=attributes, + ) + array = cls(metadata=metadata, store_path=store_path) + await array._save_metadata(metadata) return array @classmethod def from_dict( cls, store_path: StorePath, - data: Dict[str, Any], + data: dict[str, JSON], ) -> AsyncArray: - metadata = ArrayMetadata.from_dict(data) + metadata = parse_array_metadata(data) async_array = cls(metadata=metadata, store_path=store_path) return async_array @@ -139,30 +288,54 @@ def from_dict( async def open( cls, store: StoreLike, + zarr_format: ZarrFormat | None = 3, ) -> AsyncArray: store_path = make_store_path(store) - zarr_json_bytes = await (store_path / ZARR_JSON).get() - assert zarr_json_bytes is not None - return cls.from_dict( - store_path, - json.loads(zarr_json_bytes), - ) - @classmethod - async def open_auto( - cls, - store: StoreLike, - ) -> AsyncArray: # TODO: Union[AsyncArray, ArrayV2] - store_path = make_store_path(store) - v3_metadata_bytes = await (store_path / ZARR_JSON).get() - if v3_metadata_bytes is not None: - return cls.from_dict( - store_path, - json.loads(v3_metadata_bytes), + if zarr_format == 2: + zarray_bytes, zattrs_bytes = await gather( + (store_path / ZARRAY_JSON).get(), (store_path / ZATTRS_JSON).get() ) + if zarray_bytes is None: + raise 
KeyError(store_path) # filenotfounderror? + elif zarr_format == 3: + zarr_json_bytes = await (store_path / ZARR_JSON).get() + if zarr_json_bytes is None: + raise KeyError(store_path) # filenotfounderror? + elif zarr_format is None: + zarr_json_bytes, zarray_bytes, zattrs_bytes = await gather( + (store_path / ZARR_JSON).get(), + (store_path / ZARRAY_JSON).get(), + (store_path / ZATTRS_JSON).get(), + ) + if zarr_json_bytes is not None and zarray_bytes is not None: + # TODO: revisit this exception type + # alternatively, we could warn and favor v3 + raise ValueError("Both zarr.json and .zarray objects exist") + if zarr_json_bytes is None and zarray_bytes is None: + raise KeyError(store_path) # filenotfounderror? + # set zarr_format based on which keys were found + if zarr_json_bytes is not None: + zarr_format = 3 + else: + zarr_format = 2 + else: + raise ValueError(f"unexpected zarr_format: {zarr_format}") + + if zarr_format == 2: + # V2 arrays are comprised of a .zarray and .zattrs objects + assert zarray_bytes is not None + zarray_dict = json.loads(zarray_bytes) + zattrs_dict = json.loads(zattrs_bytes) if zattrs_bytes is not None else {} + zarray_dict["attributes"] = zattrs_dict + return cls(store_path=store_path, metadata=ArrayV2Metadata.from_dict(zarray_dict)) else: - raise ValueError("no v2 support yet") - # return await ArrayV2.open(store_path) + # V3 arrays are comprised of a zarr.json object + assert zarr_json_bytes is not None + return cls( + store_path=store_path, + metadata=ArrayV3Metadata.from_dict(json.loads(zarr_json_bytes)), + ) @property def ndim(self) -> int: @@ -181,15 +354,14 @@ def dtype(self) -> np.dtype[Any]: return self.metadata.dtype @property - def attrs(self) -> dict[str, Any]: + def attrs(self) -> dict[str, JSON]: return self.metadata.attributes async def getitem(self, selection: Selection) -> npt.NDArray[Any]: - assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) indexer = BasicIndexer( selection, shape=self.metadata.shape, - 
chunk_shape=self.metadata.chunk_grid.chunk_shape, + chunk_grid=self.metadata.chunk_grid, ) # setup output array @@ -200,11 +372,10 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: ) # reading chunks and decoding them - await self.codecs.read( + await self.metadata.codec_pipeline.read( [ ( - self.store_path - / self.metadata.chunk_key_encoding.encode_chunk_key(chunk_coords), + self.store_path / self.metadata.encode_chunk_key(chunk_coords), self.metadata.get_chunk_spec(chunk_coords, self.order), chunk_selection, out_selection, @@ -219,16 +390,16 @@ async def getitem(self, selection: Selection) -> npt.NDArray[Any]: else: return out[()] - async def _save_metadata(self) -> None: - await (self.store_path / ZARR_JSON).set(self.metadata.to_bytes()) + async def _save_metadata(self, metadata: ArrayMetadata) -> None: + to_save = metadata.to_bytes() + awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] + await gather(*awaitables) async def setitem(self, selection: Selection, value: np.ndarray) -> None: - assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) - chunk_shape = self.metadata.chunk_grid.chunk_shape indexer = BasicIndexer( selection, shape=self.metadata.shape, - chunk_shape=chunk_shape, + chunk_grid=self.metadata.chunk_grid, ) sel_shape = indexer.shape @@ -245,11 +416,10 @@ async def setitem(self, selection: Selection, value: np.ndarray) -> None: value = value.astype(self.metadata.dtype, order="A") # merging with existing data and encoding chunks - await self.codecs.write( + await self.metadata.codec_pipeline.write( [ ( - self.store_path - / self.metadata.chunk_key_encoding.encode_chunk_key(chunk_coords), + self.store_path / self.metadata.encode_chunk_key(chunk_coords), self.metadata.get_chunk_spec(chunk_coords, self.order), chunk_selection, out_selection, @@ -263,14 +433,11 @@ async def resize( self, new_shape: ChunkCoords, delete_outside_chunks: bool = True ) -> AsyncArray: assert len(new_shape) 
== len(self.metadata.shape) - new_metadata = replace(self.metadata, shape=new_shape) + new_metadata = self.metadata.update_shape(new_shape) # Remove all chunks outside of the new shape - assert isinstance(self.metadata.chunk_grid, RegularChunkGrid) - chunk_shape = self.metadata.chunk_grid.chunk_shape - chunk_key_encoding = self.metadata.chunk_key_encoding - old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape)) - new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape)) + old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) + new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) if delete_outside_chunks: @@ -279,7 +446,7 @@ async def _delete_key(key: str) -> None: await concurrent_map( [ - (chunk_key_encoding.encode_chunk_key(chunk_coords),) + (self.metadata.encode_chunk_key(chunk_coords),) for chunk_coords in old_chunk_coords.difference(new_chunk_coords) ], _delete_key, @@ -287,14 +454,14 @@ async def _delete_key(key: str) -> None: ) # Write new metadata - await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes()) + await self._save_metadata(new_metadata) return replace(self, metadata=new_metadata) - async def update_attributes(self, new_attributes: Dict[str, Any]) -> AsyncArray: - new_metadata = replace(self.metadata, attributes=new_attributes) + async def update_attributes(self, new_attributes: dict[str, JSON]) -> AsyncArray: + new_metadata = self.metadata.update_attributes(new_attributes) # Write new metadata - await (self.store_path / ZARR_JSON).set(new_metadata.to_bytes()) + await self._save_metadata(new_metadata) return replace(self, metadata=new_metadata) def __repr__(self) -> str: @@ -313,17 +480,29 @@ def create( cls, store: StoreLike, *, + # v2 and v3 shape: ChunkCoords, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, - fill_value: Optional[Any] = None, - chunk_key_encoding: Union[ - Tuple[Literal["default"], Literal[".", "/"]], - Tuple[Literal["v2"], Literal[".", 
"/"]], - ] = ("default", "/"), - codecs: Optional[Iterable[Union[Codec, Dict[str, Any]]]] = None, - dimension_names: Optional[Iterable[str]] = None, - attributes: Optional[Dict[str, Any]] = None, + zarr_format: ZarrFormat = 3, + fill_value: Any | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, + dimension_names: Iterable[str] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: Literal["C", "F"] | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime exists_ok: bool = False, ) -> Array: async_array = sync( @@ -331,12 +510,18 @@ def create( store=store, shape=shape, dtype=dtype, - chunk_shape=chunk_shape, + zarr_format=zarr_format, + attributes=attributes, fill_value=fill_value, + chunk_shape=chunk_shape, chunk_key_encoding=chunk_key_encoding, codecs=codecs, dimension_names=dimension_names, - attributes=attributes, + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, exists_ok=exists_ok, ), ) @@ -346,7 +531,7 @@ def create( def from_dict( cls, store_path: StorePath, - data: Dict[str, Any], + data: dict[str, JSON], ) -> Array: async_array = AsyncArray.from_dict(store_path=store_path, data=data) return cls(async_array) @@ -359,16 +544,6 @@ def open( async_array = sync(AsyncArray.open(store)) return cls(async_array) - @classmethod - def open_auto( - cls, - store: StoreLike, - ) -> Array: # TODO: Union[Array, ArrayV2]: - async_array = sync( - AsyncArray.open_auto(store), - ) - return cls(async_array) - @property def ndim(self) -> int: return self._async_array.ndim @@ -386,8 +561,8 @@ def 
dtype(self) -> np.dtype[Any]: return self._async_array.dtype @property - def attrs(self) -> dict[str, Any]: - return self._async_array.attrs + def attrs(self) -> Attributes: + return Attributes(self) @property def metadata(self) -> ArrayMetadata: @@ -418,7 +593,7 @@ def resize(self, new_shape: ChunkCoords) -> Array: ) ) - def update_attributes(self, new_attributes: Dict[str, Any]) -> Array: + def update_attributes(self, new_attributes: dict[str, JSON]) -> Array: return type(self)( sync( self._async_array.update_attributes(new_attributes), diff --git a/src/zarr/array_v2.py b/src/zarr/array_v2.py deleted file mode 100644 index 18251e7db7..0000000000 --- a/src/zarr/array_v2.py +++ /dev/null @@ -1,516 +0,0 @@ -from __future__ import annotations - -import asyncio -from dataclasses import dataclass, replace -import json -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union - -import numcodecs -import numpy as np - -from numcodecs.compat import ensure_bytes, ensure_ndarray - -from zarr.common import ( - ZARRAY_JSON, - ZATTRS_JSON, - BytesLike, - ChunkCoords, - Selection, - SliceSelection, - concurrent_map, - to_thread, -) -from zarr.indexing import BasicIndexer, all_chunk_coords, is_total_slice -from zarr.metadata import ArrayV2Metadata -from zarr.store import StoreLike, StorePath, make_store_path -from zarr.sync import sync - -if TYPE_CHECKING: - from zarr.array import Array - - -@dataclass(frozen=True) -class _AsyncArrayProxy: - array: ArrayV2 - - def __getitem__(self, selection: Selection) -> _AsyncArraySelectionProxy: - return _AsyncArraySelectionProxy(self.array, selection) - - -@dataclass(frozen=True) -class _AsyncArraySelectionProxy: - array: ArrayV2 - selection: Selection - - async def get(self) -> np.ndarray: - return await self.array.get_async(self.selection) - - async def set(self, value: np.ndarray): - return await self.array.set_async(self.selection, value) - - -@dataclass(frozen=True) -class ArrayV2: - metadata: ArrayV2Metadata - 
attributes: Optional[Dict[str, Any]] - store_path: StorePath - - @classmethod - async def create_async( - cls, - store: StoreLike, - *, - shape: ChunkCoords, - dtype: np.dtype, - chunks: ChunkCoords, - dimension_separator: Literal[".", "/"] = ".", - fill_value: Optional[Union[None, int, float]] = None, - order: Literal["C", "F"] = "C", - filters: Optional[List[Dict[str, Any]]] = None, - compressor: Optional[Dict[str, Any]] = None, - attributes: Optional[Dict[str, Any]] = None, - exists_ok: bool = False, - ) -> ArrayV2: - store_path = make_store_path(store) - if not exists_ok: - assert not await (store_path / ZARRAY_JSON).exists() - - metadata = ArrayV2Metadata( - shape=shape, - dtype=np.dtype(dtype), - chunks=chunks, - order=order, - dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, - compressor=( - numcodecs.get_codec(compressor).get_config() if compressor is not None else None - ), - filters=( - [numcodecs.get_codec(filter).get_config() for filter in filters] - if filters is not None - else None - ), - ) - array = cls( - metadata=metadata, - store_path=store_path, - attributes=attributes, - ) - await array._save_metadata() - return array - - @classmethod - def create( - cls, - store: StoreLike, - *, - shape: ChunkCoords, - dtype: np.dtype, - chunks: ChunkCoords, - dimension_separator: Literal[".", "/"] = ".", - fill_value: Optional[Union[None, int, float]] = None, - order: Literal["C", "F"] = "C", - filters: Optional[List[Dict[str, Any]]] = None, - compressor: Optional[Dict[str, Any]] = None, - attributes: Optional[Dict[str, Any]] = None, - exists_ok: bool = False, - ) -> ArrayV2: - return sync( - cls.create_async( - store, - shape=shape, - dtype=dtype, - chunks=chunks, - order=order, - dimension_separator=dimension_separator, - fill_value=0 if fill_value is None else fill_value, - compressor=compressor, - filters=filters, - attributes=attributes, - exists_ok=exists_ok, - ), - ) - - @classmethod - async def open_async( 
- cls, - store: StoreLike, - ) -> ArrayV2: - store_path = make_store_path(store) - zarray_bytes, zattrs_bytes = await asyncio.gather( - (store_path / ZARRAY_JSON).get(), - (store_path / ZATTRS_JSON).get(), - ) - assert zarray_bytes is not None - return cls.from_dict( - store_path, - zarray_json=json.loads(zarray_bytes), - zattrs_json=json.loads(zattrs_bytes) if zattrs_bytes is not None else None, - ) - - @classmethod - def open( - cls, - store: StoreLike, - ) -> ArrayV2: - return sync( - cls.open_async(store), - ) - - @classmethod - def from_dict( - cls, - store_path: StorePath, - zarray_json: Any, - zattrs_json: Optional[Any], - ) -> ArrayV2: - metadata = ArrayV2Metadata.from_dict(zarray_json) - out = cls( - store_path=store_path, - metadata=metadata, - attributes=zattrs_json, - ) - out._validate_metadata() - return out - - async def _save_metadata(self) -> None: - self._validate_metadata() - - await (self.store_path / ZARRAY_JSON).set(self.metadata.to_bytes()) - if self.attributes is not None and len(self.attributes) > 0: - await (self.store_path / ZATTRS_JSON).set( - json.dumps(self.attributes).encode(), - ) - else: - await (self.store_path / ZATTRS_JSON).delete() - - def _validate_metadata(self) -> None: - assert len(self.metadata.shape) == len( - self.metadata.chunks - ), "`chunks` and `shape` need to have the same number of dimensions." 
- - @property - def ndim(self) -> int: - return len(self.metadata.shape) - - @property - def shape(self) -> ChunkCoords: - return self.metadata.shape - - @property - def dtype(self) -> np.dtype: - return self.metadata.dtype - - @property - def async_(self) -> _AsyncArrayProxy: - return _AsyncArrayProxy(self) - - def __getitem__(self, selection: Selection): - return sync(self.get_async(selection)) - - async def get_async(self, selection: Selection): - indexer = BasicIndexer( - selection, - shape=self.metadata.shape, - chunk_shape=self.metadata.chunks, - ) - - # setup output array - out = np.zeros( - indexer.shape, - dtype=self.metadata.dtype, - order=self.metadata.order, - ) - - # reading chunks and decoding them - await concurrent_map( - [ - (chunk_coords, chunk_selection, out_selection, out) - for chunk_coords, chunk_selection, out_selection in indexer - ], - self._read_chunk, - ) - - if out.shape: - return out - else: - return out[()] - - async def _read_chunk( - self, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - out: np.ndarray, - ): - store_path = self.store_path / self._encode_chunk_key(chunk_coords) - - chunk_array = await self._decode_chunk(await store_path.get()) - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - out[out_selection] = tmp - else: - out[out_selection] = self.metadata.fill_value - - async def _decode_chunk(self, chunk_bytes: Optional[BytesLike]) -> Optional[np.ndarray]: - if chunk_bytes is None: - return None - - if self.metadata.compressor is not None: - compressor = numcodecs.get_codec(self.metadata.compressor) - chunk_array = ensure_ndarray(await to_thread(compressor.decode, chunk_bytes)) - else: - chunk_array = ensure_ndarray(chunk_bytes) - - # ensure correct dtype - if str(chunk_array.dtype) != self.metadata.dtype: - chunk_array = chunk_array.view(self.metadata.dtype) - - # apply filters in reverse order - if self.metadata.filters is not None: - for filter_metadata in 
self.metadata.filters[::-1]: - filter = numcodecs.get_codec(filter_metadata) - chunk_array = await to_thread(filter.decode, chunk_array) - - # ensure correct chunk shape - if chunk_array.shape != self.metadata.chunks: - chunk_array = chunk_array.reshape( - self.metadata.chunks, - order=self.metadata.order, - ) - - return chunk_array - - def __setitem__(self, selection: Selection, value: np.ndarray) -> None: - sync(self.set_async(selection, value)) - - async def set_async(self, selection: Selection, value: np.ndarray) -> None: - chunk_shape = self.metadata.chunks - indexer = BasicIndexer( - selection, - shape=self.metadata.shape, - chunk_shape=chunk_shape, - ) - - sel_shape = indexer.shape - - # check value shape - if np.isscalar(value): - # setting a scalar value - pass - else: - if not hasattr(value, "shape"): - value = np.asarray(value, self.metadata.dtype) - assert value.shape == sel_shape - if value.dtype != self.metadata.dtype: - value = value.astype(self.metadata.dtype, order="A") - - # merging with existing data and encoding chunks - await concurrent_map( - [ - ( - value, - chunk_shape, - chunk_coords, - chunk_selection, - out_selection, - ) - for chunk_coords, chunk_selection, out_selection in indexer - ], - self._write_chunk, - ) - - async def _write_chunk( - self, - value: np.ndarray, - chunk_shape: ChunkCoords, - chunk_coords: ChunkCoords, - chunk_selection: SliceSelection, - out_selection: SliceSelection, - ): - store_path = self.store_path / self._encode_chunk_key(chunk_coords) - - if is_total_slice(chunk_selection, chunk_shape): - # write entire chunks - if np.isscalar(value): - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - order=self.metadata.order, - ) - chunk_array.fill(value) - else: - chunk_array = value[out_selection] - await self._write_chunk_to_store(store_path, chunk_array) - - else: - # writing partial chunks - # read chunk first - tmp = await self._decode_chunk(await store_path.get()) - - # merge new value - if tmp 
is None: - chunk_array = np.empty( - chunk_shape, - dtype=self.metadata.dtype, - order=self.metadata.order, - ) - chunk_array.fill(self.metadata.fill_value) - else: - chunk_array = tmp.copy( - order=self.metadata.order, - ) # make a writable copy - chunk_array[chunk_selection] = value[out_selection] - - await self._write_chunk_to_store(store_path, chunk_array) - - async def _write_chunk_to_store(self, store_path: StorePath, chunk_array: np.ndarray): - chunk_bytes: Optional[BytesLike] - if np.all(chunk_array == self.metadata.fill_value): - # chunks that only contain fill_value will be removed - await store_path.delete() - else: - chunk_bytes = await self._encode_chunk(chunk_array) - if chunk_bytes is None: - await store_path.delete() - else: - await store_path.set(chunk_bytes) - - async def _encode_chunk(self, chunk_array: np.ndarray) -> Optional[BytesLike]: - chunk_array = chunk_array.ravel(order=self.metadata.order) - - if self.metadata.filters is not None: - for filter_metadata in self.metadata.filters: - filter = numcodecs.get_codec(filter_metadata) - chunk_array = await to_thread(filter.encode, chunk_array) - - if self.metadata.compressor is not None: - compressor = numcodecs.get_codec(self.metadata.compressor) - if not chunk_array.flags.c_contiguous and not chunk_array.flags.f_contiguous: - chunk_array = chunk_array.copy(order="A") - encoded_chunk_bytes = ensure_bytes(await to_thread(compressor.encode, chunk_array)) - else: - encoded_chunk_bytes = ensure_bytes(chunk_array) - - return encoded_chunk_bytes - - def _encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: - chunk_identifier = self.metadata.dimension_separator.join(map(str, chunk_coords)) - return "0" if chunk_identifier == "" else chunk_identifier - - async def resize_async(self, new_shape: ChunkCoords) -> ArrayV2: - assert len(new_shape) == len(self.metadata.shape) - new_metadata = replace(self.metadata, shape=new_shape) - - # Remove all chunks outside of the new shape - chunk_shape = 
self.metadata.chunks - old_chunk_coords = set(all_chunk_coords(self.metadata.shape, chunk_shape)) - new_chunk_coords = set(all_chunk_coords(new_shape, chunk_shape)) - - async def _delete_key(key: str) -> None: - await (self.store_path / key).delete() - - await concurrent_map( - [ - (self._encode_chunk_key(chunk_coords),) - for chunk_coords in old_chunk_coords.difference(new_chunk_coords) - ], - _delete_key, - ) - - # Write new metadata - await (self.store_path / ZARRAY_JSON).set(new_metadata.to_bytes()) - return replace(self, metadata=new_metadata) - - def resize(self, new_shape: ChunkCoords) -> ArrayV2: - return sync(self.resize_async(new_shape)) - - async def convert_to_v3_async(self) -> Array: - from sys import byteorder as sys_byteorder - - from zarr.abc.codec import Codec - from zarr.array import Array - from zarr.common import ZARR_JSON - from zarr.chunk_grids import RegularChunkGrid - from zarr.chunk_key_encodings import V2ChunkKeyEncoding - from zarr.metadata import ArrayMetadata, DataType - - from zarr.codecs import ( - BloscCodec, - BloscShuffle, - BytesCodec, - GzipCodec, - TransposeCodec, - ) - - data_type = DataType.from_dtype(self.metadata.dtype) - endian: Literal["little", "big"] - if self.metadata.dtype.byteorder == "=": - endian = sys_byteorder - elif self.metadata.dtype.byteorder == ">": - endian = "big" - else: - endian = "little" - - assert ( - self.metadata.filters is None or len(self.metadata.filters) == 0 - ), "Filters are not supported by v3." - - codecs: List[Codec] = [] - - if self.metadata.order == "F": - codecs.append(TransposeCodec(order=tuple(reversed(range(self.metadata.ndim))))) - codecs.append(BytesCodec(endian=endian)) - - if self.metadata.compressor is not None: - v2_codec = numcodecs.get_codec(self.metadata.compressor).get_config() - assert v2_codec["id"] in ( - "blosc", - "gzip", - ), "Only blosc and gzip are supported by v3." 
- if v2_codec["id"] == "blosc": - codecs.append( - BloscCodec( - typesize=data_type.byte_count, - cname=v2_codec["cname"], - clevel=v2_codec["clevel"], - shuffle=BloscShuffle.from_int(v2_codec.get("shuffle", 0)), - blocksize=v2_codec.get("blocksize", 0), - ) - ) - elif v2_codec["id"] == "gzip": - codecs.append(GzipCodec(level=v2_codec.get("level", 5))) - - new_metadata = ArrayMetadata( - shape=self.metadata.shape, - chunk_grid=RegularChunkGrid(chunk_shape=self.metadata.chunks), - data_type=data_type, - fill_value=0 if self.metadata.fill_value is None else self.metadata.fill_value, - chunk_key_encoding=V2ChunkKeyEncoding(separator=self.metadata.dimension_separator), - codecs=codecs, - attributes=self.attributes or {}, - dimension_names=None, - ) - - new_metadata_bytes = new_metadata.to_bytes() - await (self.store_path / ZARR_JSON).set(new_metadata_bytes) - - return Array.from_dict( - store_path=self.store_path, - data=json.loads(new_metadata_bytes), - ) - - async def update_attributes_async(self, new_attributes: Dict[str, Any]) -> ArrayV2: - await (self.store_path / ZATTRS_JSON).set(json.dumps(new_attributes).encode()) - return replace(self, attributes=new_attributes) - - def update_attributes(self, new_attributes: Dict[str, Any]) -> ArrayV2: - return sync( - self.update_attributes_async(new_attributes), - ) - - def convert_to_v3(self) -> Array: - return sync(self.convert_to_v3_async()) - - def __repr__(self): - return f"" diff --git a/src/zarr/attributes.py b/src/zarr/attributes.py index 18f6a63a55..e6b26309f2 100644 --- a/src/zarr/attributes.py +++ b/src/zarr/attributes.py @@ -1,21 +1,24 @@ from __future__ import annotations + from collections.abc import MutableMapping -from typing import TYPE_CHECKING, Any, Iterator, Union +from typing import TYPE_CHECKING, Iterator + +from zarr.common import JSON if TYPE_CHECKING: from zarr.group import Group from zarr.array import Array -class Attributes(MutableMapping[str, Any]): - def __init__(self, obj: Union[Array, Group]): 
+class Attributes(MutableMapping[str, JSON]): + def __init__(self, obj: Array | Group): # key=".zattrs", read_only=False, cache=True, synchronizer=None self._obj = obj - def __getitem__(self, key: str) -> Any: + def __getitem__(self, key: str) -> JSON: return self._obj.metadata.attributes[key] - def __setitem__(self, key: str, value: Any) -> None: + def __setitem__(self, key: str, value: JSON) -> None: new_attrs = dict(self._obj.metadata.attributes) new_attrs[key] = value self._obj = self._obj.update_attributes(new_attrs) diff --git a/src/zarr/chunk_grids.py b/src/zarr/chunk_grids.py index 73557f6e4b..16c0df9174 100644 --- a/src/zarr/chunk_grids.py +++ b/src/zarr/chunk_grids.py @@ -1,5 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict +import itertools +from typing import TYPE_CHECKING, Any, Dict, Iterator from dataclasses import dataclass from zarr.abc.metadata import Metadata @@ -10,6 +11,7 @@ parse_named_configuration, parse_shapelike, ) +from zarr.indexing import _ceildiv if TYPE_CHECKING: from typing_extensions import Self @@ -27,6 +29,9 @@ def from_dict(cls, data: Dict[str, JSON]) -> ChunkGrid: return RegularChunkGrid.from_dict(data) raise ValueError(f"Unknown chunk grid. 
Got {name_parsed}.") + def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: + raise NotImplementedError + @dataclass(frozen=True) class RegularChunkGrid(ChunkGrid): @@ -45,3 +50,8 @@ def from_dict(cls, data: Dict[str, Any]) -> Self: def to_dict(self) -> Dict[str, JSON]: return {"name": "regular", "configuration": {"chunk_shape": list(self.chunk_shape)}} + + def all_chunk_coords(self, array_shape: ChunkCoords) -> Iterator[ChunkCoords]: + return itertools.product( + *(range(0, _ceildiv(s, c)) for s, c in zip(array_shape, self.chunk_shape)) + ) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py new file mode 100644 index 0000000000..444ae96c5c --- /dev/null +++ b/src/zarr/codecs/_v2.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal +import numpy as np + +from zarr.codecs.mixins import ArrayArrayCodecBatchMixin, ArrayBytesCodecBatchMixin +from zarr.common import JSON, ArraySpec, BytesLike, to_thread + +import numcodecs +from numcodecs.compat import ensure_bytes, ensure_ndarray + + +@dataclass(frozen=True) +class V2Compressor(ArrayBytesCodecBatchMixin): + compressor: dict[str, JSON] | None + + is_fixed_size = False + + async def decode_single( + self, + chunk_bytes: BytesLike, + chunk_spec: ArraySpec, + ) -> np.ndarray: + if chunk_bytes is None: + return None + + if self.compressor is not None: + compressor = numcodecs.get_codec(self.compressor) + chunk_array = ensure_ndarray(await to_thread(compressor.decode, chunk_bytes)) + else: + chunk_array = ensure_ndarray(chunk_bytes) + + # ensure correct dtype + if str(chunk_array.dtype) != chunk_spec.dtype: + chunk_array = chunk_array.view(chunk_spec.dtype) + + return chunk_array + + async def encode_single( + self, + chunk_array: np.ndarray, + _chunk_spec: ArraySpec, + ) -> BytesLike | None: + if self.compressor is not None: + compressor = numcodecs.get_codec(self.compressor) + if not chunk_array.flags.c_contiguous and 
not chunk_array.flags.f_contiguous: + chunk_array = chunk_array.copy(order="A") + encoded_chunk_bytes = ensure_bytes(await to_thread(compressor.encode, chunk_array)) + else: + encoded_chunk_bytes = ensure_bytes(chunk_array) + + return encoded_chunk_bytes + + def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + +@dataclass(frozen=True) +class V2Filters(ArrayArrayCodecBatchMixin): + filters: list[dict[str, JSON]] + order: Literal["C", "F"] + + is_fixed_size = False + + async def decode_single( + self, + chunk_array: np.ndarray, + chunk_spec: ArraySpec, + ) -> np.ndarray: + # apply filters in reverse order + if self.filters is not None: + for filter_metadata in self.filters[::-1]: + filter = numcodecs.get_codec(filter_metadata) + chunk_array = await to_thread(filter.decode, chunk_array) + + # ensure correct chunk shape + if chunk_array.shape != chunk_spec.shape: + chunk_array = chunk_array.reshape( + chunk_spec.shape, + order=self.order, + ) + + return chunk_array + + async def encode_single( + self, + chunk_array: np.ndarray, + _chunk_spec: ArraySpec, + ) -> np.ndarray | None: + chunk_array = chunk_array.ravel(order=self.order) + + for filter_metadata in self.filters: + filter = numcodecs.get_codec(filter_metadata) + chunk_array = await to_thread(filter.encode, chunk_array) + + return chunk_array + + def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: + raise NotImplementedError diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a46a77e95e..8ea8804d49 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -32,10 +32,7 @@ c_order_iter, morton_order_iter, ) -from zarr.metadata import ( - ArrayMetadata, - parse_codecs, -) +from zarr.metadata import ArrayMetadata, parse_codecs if TYPE_CHECKING: from typing import Awaitable, Callable, Dict, Iterator, Optional, Set @@ -394,7 +391,7 @@ async def decode_single( indexer = 
BasicIndexer( tuple(slice(0, s) for s in shard_shape), shape=shard_shape, - chunk_shape=chunk_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), ) # setup output array @@ -439,7 +436,7 @@ async def decode_partial_single( indexer = BasicIndexer( selection, shape=shard_shape, - chunk_shape=chunk_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), ) # setup output array @@ -503,7 +500,7 @@ async def encode_single( BasicIndexer( tuple(slice(0, s) for s in shard_shape), shape=shard_shape, - chunk_shape=chunk_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), ) ) @@ -546,7 +543,7 @@ async def encode_partial_single( BasicIndexer( selection, shape=shard_shape, - chunk_shape=chunk_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), ) ) diff --git a/src/zarr/group.py b/src/zarr/group.py index f8d57e3fba..5bc124a4f6 100644 --- a/src/zarr/group.py +++ b/src/zarr/group.py @@ -7,22 +7,33 @@ import logging import numpy.typing as npt -if TYPE_CHECKING: - from typing import Any, AsyncGenerator, Literal, Iterable +from zarr.abc.store import set_or_delete from zarr.abc.codec import Codec from zarr.abc.metadata import Metadata from zarr.array import AsyncArray, Array from zarr.attributes import Attributes -from zarr.common import ZARR_JSON, ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ChunkCoords +from zarr.chunk_key_encodings import ChunkKeyEncoding +from zarr.common import ( + JSON, + ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, + ZGROUP_JSON, + ChunkCoords, + ZarrFormat, +) from zarr.store import StoreLike, StorePath, make_store_path from zarr.sync import SyncMixin, sync from typing import overload +if TYPE_CHECKING: + from typing import Any, AsyncGenerator, Literal, Iterable + logger = logging.getLogger("zarr.group") -def parse_zarr_format(data: Any) -> Literal[2, 3]: +def parse_zarr_format(data: Any) -> ZarrFormat: if data in (2, 3): return data msg = msg = f"Invalid zarr_format. Expected one 2 or 3. Got {data}." 
@@ -62,7 +73,7 @@ def _parse_async_node(node: AsyncArray | AsyncGroup) -> Array | Group: @dataclass(frozen=True) class GroupMetadata(Metadata): attributes: dict[str, Any] = field(default_factory=dict) - zarr_format: Literal[2, 3] = 3 + zarr_format: ZarrFormat = 3 node_type: Literal["group"] = field(default="group", init=False) # todo: rename this, since it doesn't return bytes @@ -75,7 +86,7 @@ def to_bytes(self) -> dict[str, bytes]: ZATTRS_JSON: json.dumps(self.attributes).encode(), } - def __init__(self, attributes: dict[str, Any] | None = None, zarr_format: Literal[2, 3] = 3): + def __init__(self, attributes: dict[str, Any] | None = None, zarr_format: ZarrFormat = 3): attributes_parsed = parse_attributes(attributes) zarr_format_parsed = parse_zarr_format(zarr_format) @@ -103,7 +114,7 @@ async def create( *, attributes: dict[str, Any] = {}, exists_ok: bool = False, - zarr_format: Literal[2, 3] = 3, + zarr_format: ZarrFormat = 3, ) -> AsyncGroup: store_path = make_store_path(store) if not exists_ok: @@ -245,6 +256,7 @@ async def delitem(self, key: str) -> None: elif self.metadata.zarr_format == 2: await asyncio.gather( (store_path / ZGROUP_JSON).delete(), # TODO: missing_ok=False + (store_path / ZARRAY_JSON).delete(), # TODO: missing_ok=False (store_path / ZATTRS_JSON).delete(), # TODO: missing_ok=True ) else: @@ -252,7 +264,7 @@ async def delitem(self, key: str) -> None: async def _save_metadata(self) -> None: to_save = self.metadata.to_bytes() - awaitables = [(self.store_path / key).set(value) for key, value in to_save.items()] + awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await asyncio.gather(*awaitables) @property @@ -278,13 +290,25 @@ async def create_array( path: str, shape: ChunkCoords, dtype: npt.DTypeLike, - chunk_shape: ChunkCoords, fill_value: Any | None = None, - chunk_key_encoding: tuple[Literal["default"], Literal[".", "/"]] - | tuple[Literal["v2"], Literal[".", "/"]] = ("default", "/"), - codecs: 
Iterable[Codec | dict[str, Any]] | None = None, + attributes: dict[str, JSON] | None = None, + # v3 only + chunk_shape: ChunkCoords | None = None, + chunk_key_encoding: ( + ChunkKeyEncoding + | tuple[Literal["default"], Literal[".", "/"]] + | tuple[Literal["v2"], Literal[".", "/"]] + | None + ) = None, + codecs: Iterable[Codec | dict[str, JSON]] | None = None, dimension_names: Iterable[str] | None = None, - attributes: dict[str, Any] | None = None, + # v2 only + chunks: ChunkCoords | None = None, + dimension_separator: Literal[".", "/"] | None = None, + order: Literal["C", "F"] | None = None, + filters: list[dict[str, JSON]] | None = None, + compressor: dict[str, JSON] | None = None, + # runtime exists_ok: bool = False, ) -> AsyncArray: return await AsyncArray.create( @@ -297,6 +321,11 @@ async def create_array( codecs=codecs, dimension_names=dimension_names, attributes=attributes, + chunks=chunks, + dimension_separator=dimension_separator, + order=order, + filters=filters, + compressor=compressor, exists_ok=exists_ok, zarr_format=self.metadata.zarr_format, ) @@ -307,15 +336,7 @@ async def update_attributes(self, new_attributes: dict[str, Any]) -> "AsyncGroup self.metadata.attributes.update(new_attributes) # Write new metadata - to_save = self.metadata.to_bytes() - if self.metadata.zarr_format == 2: - # only save the .zattrs object - await (self.store_path / ZATTRS_JSON).set(to_save[ZATTRS_JSON]) - else: - await (self.store_path / ZARR_JSON).set(to_save[ZARR_JSON]) - - self.metadata.attributes.clear() - self.metadata.attributes.update(new_attributes) + await self._save_metadata() return self @@ -480,7 +501,7 @@ async def update_attributes_async(self, new_attributes: dict[str, Any]) -> Group # Write new metadata to_save = new_metadata.to_bytes() - awaitables = [(self.store_path / key).set(value) for key, value in to_save.items()] + awaitables = [set_or_delete(self.store_path / key, value) for key, value in to_save.items()] await asyncio.gather(*awaitables) 
async_group = replace(self._async_group, metadata=new_metadata) diff --git a/src/zarr/indexing.py b/src/zarr/indexing.py index 9f324eb5ea..8e7cd95430 100644 --- a/src/zarr/indexing.py +++ b/src/zarr/indexing.py @@ -2,10 +2,13 @@ import itertools import math -from typing import Iterator, List, NamedTuple, Optional, Tuple +from typing import TYPE_CHECKING, Iterator, List, NamedTuple, Optional, Tuple from zarr.common import ChunkCoords, Selection, SliceSelection, product +if TYPE_CHECKING: + from zarr.chunk_grids import ChunkGrid + def _ensure_tuple(v: Selection) -> SliceSelection: if not isinstance(v, tuple): @@ -131,13 +134,18 @@ def __init__( self, selection: Selection, shape: Tuple[int, ...], - chunk_shape: Tuple[int, ...], + chunk_grid: ChunkGrid, ): + from zarr.chunk_grids import RegularChunkGrid + + assert isinstance( + chunk_grid, RegularChunkGrid + ), "Only regular chunk grid is supported, currently." # setup per-dimension indexers self.dim_indexers = [ _SliceDimIndexer(dim_sel, dim_len, dim_chunk_len) for dim_sel, dim_len, dim_chunk_len in zip( - _ensure_selection(selection, shape), shape, chunk_shape + _ensure_selection(selection, shape), shape, chunk_grid.chunk_shape ) ] self.shape = tuple(s.nitems for s in self.dim_indexers) @@ -202,7 +210,3 @@ def is_total_slice(item: Selection, shape: ChunkCoords) -> bool: ) else: raise TypeError("expected slice or tuple of slices, found %r" % item) - - -def all_chunk_coords(shape: ChunkCoords, chunk_shape: ChunkCoords) -> Iterator[ChunkCoords]: - return itertools.product(*(range(0, _ceildiv(s, c)) for s, c in zip(shape, chunk_shape))) diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py index d1e72a7600..4ea5f9bc5a 100644 --- a/src/zarr/metadata.py +++ b/src/zarr/metadata.py @@ -1,24 +1,29 @@ from __future__ import annotations +from abc import ABC, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, cast, Dict, Iterable, Any -from dataclasses import dataclass, field +from typing import 
TYPE_CHECKING, Any, cast, Iterable +from dataclasses import dataclass, field, replace import json import numpy as np import numpy.typing as npt +from zarr.abc.codec import Codec, CodecPipeline +from zarr.abc.metadata import Metadata from zarr.chunk_grids import ChunkGrid, RegularChunkGrid from zarr.chunk_key_encodings import ChunkKeyEncoding, parse_separator +from zarr.codecs._v2 import V2Compressor, V2Filters if TYPE_CHECKING: - from typing import Literal, Union, List, Optional, Tuple + from typing import Literal + from typing_extensions import Self -from zarr.abc.codec import Codec, CodecPipeline -from zarr.abc.metadata import Metadata - from zarr.common import ( JSON, + ZARR_JSON, + ZARRAY_JSON, + ZATTRS_JSON, ArraySpec, ChunkCoords, parse_dtype, @@ -102,16 +107,58 @@ def from_dtype(cls, dtype: np.dtype[Any]) -> DataType: return DataType[dtype_to_data_type[dtype.str]] -@dataclass(frozen=True) -class ArrayMetadata(Metadata): +@dataclass(frozen=True, kw_only=True) +class ArrayMetadata(Metadata, ABC): + shape: ChunkCoords + chunk_grid: ChunkGrid + attributes: dict[str, JSON] + + @property + @abstractmethod + def dtype(self) -> np.dtype[Any]: + pass + + @property + @abstractmethod + def ndim(self) -> int: + pass + + @property + @abstractmethod + def codec_pipeline(self) -> CodecPipeline: + pass + + @abstractmethod + def get_chunk_spec(self, _chunk_coords: ChunkCoords, order: Literal["C", "F"]) -> ArraySpec: + pass + + @abstractmethod + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + pass + + @abstractmethod + def to_bytes(self) -> dict[str, bytes]: + pass + + @abstractmethod + def update_shape(self, shape: ChunkCoords) -> Self: + pass + + @abstractmethod + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + pass + + +@dataclass(frozen=True, kw_only=True) +class ArrayV3Metadata(ArrayMetadata): shape: ChunkCoords data_type: np.dtype[Any] chunk_grid: ChunkGrid chunk_key_encoding: ChunkKeyEncoding fill_value: Any codecs: CodecPipeline - 
attributes: Dict[str, Any] = field(default_factory=dict) - dimension_names: Optional[Tuple[str, ...]] = None + attributes: dict[str, Any] = field(default_factory=dict) + dimension_names: tuple[str, ...] | None = None zarr_format: Literal[3] = field(default=3, init=False) node_type: Literal["array"] = field(default="array", init=False) @@ -180,6 +227,10 @@ def dtype(self) -> np.dtype[Any]: def ndim(self) -> int: return len(self.shape) + @property + def codec_pipeline(self) -> CodecPipeline: + return self.codecs + def get_chunk_spec(self, _chunk_coords: ChunkCoords, order: Literal["C", "F"]) -> ArraySpec: assert isinstance( self.chunk_grid, RegularChunkGrid @@ -191,7 +242,10 @@ def get_chunk_spec(self, _chunk_coords: ChunkCoords, order: Literal["C", "F"]) - order=order, ) - def to_bytes(self) -> bytes: + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + return self.chunk_key_encoding.encode_chunk_key(chunk_coords) + + def to_bytes(self) -> dict[str, bytes]: def _json_convert(o): if isinstance(o, np.dtype): return str(o) @@ -203,13 +257,10 @@ def _json_convert(o): return o.get_config() raise TypeError - return json.dumps( - self.to_dict(), - default=_json_convert, - ).encode() + return {ZARR_JSON: json.dumps(self.to_dict(), default=_json_convert).encode()} @classmethod - def from_dict(cls, data: Dict[str, Any]) -> ArrayMetadata: + def from_dict(cls, data: dict[str, JSON]) -> ArrayV3Metadata: # check that the zarr_format attribute is correct _ = parse_zarr_format_v3(data.pop("zarr_format")) # check that the node_type attribute is correct @@ -219,7 +270,7 @@ def from_dict(cls, data: Dict[str, Any]) -> ArrayMetadata: return cls(**data, dimension_names=dimension_names) - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: out_dict = super().to_dict() if not isinstance(out_dict, dict): @@ -231,18 +282,24 @@ def to_dict(self) -> Dict[str, Any]: out_dict.pop("dimension_names") return out_dict + def update_shape(self, shape: ChunkCoords) 
-> Self: + return replace(self, shape=shape) + + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + return replace(self, attributes=attributes) + -@dataclass(frozen=True) -class ArrayV2Metadata(Metadata): +@dataclass(frozen=True, kw_only=True) +class ArrayV2Metadata(ArrayMetadata): shape: ChunkCoords - chunks: ChunkCoords - dtype: np.dtype[Any] - fill_value: Union[None, int, float] = 0 + chunk_grid: RegularChunkGrid + data_type: np.dtype[Any] + fill_value: None | int | float = 0 order: Literal["C", "F"] = "C" - filters: Optional[List[Dict[str, Any]]] = None + filters: list[dict[str, JSON]] | None = None dimension_separator: Literal[".", "/"] = "." - compressor: Optional[Dict[str, Any]] = None - attributes: Optional[Dict[str, Any]] = cast(Dict[str, Any], field(default_factory=dict)) + compressor: dict[str, JSON] | None = None + attributes: dict[str, JSON] = cast(dict[str, JSON], field(default_factory=dict)) zarr_format: Literal[2] = field(init=False, default=2) def __init__( @@ -254,9 +311,9 @@ def __init__( fill_value: Any, order: Literal["C", "F"], dimension_separator: Literal[".", "/"] = ".", - compressor: Optional[Dict[str, Any]] = None, - filters: Optional[List[Dict[str, Any]]] = None, - attributes: Optional[Dict[str, JSON]] = None, + compressor: dict[str, JSON] | None = None, + filters: list[dict[str, JSON]] | None = None, + attributes: dict[str, JSON] | None = None, ): """ Metadata for a Zarr version 2 array. 
@@ -266,14 +323,14 @@ def __init__( chunks_parsed = parse_shapelike(chunks) compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) - dimension_separator_parsed = parse_separator(order) + dimension_separator_parsed = parse_separator(dimension_separator) filters_parsed = parse_filters(filters) fill_value_parsed = parse_fill_value(fill_value) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) object.__setattr__(self, "data_type", data_type_parsed) - object.__setattr__(self, "chunks", chunks_parsed) + object.__setattr__(self, "chunk_grid", RegularChunkGrid(chunk_shape=chunks_parsed)) object.__setattr__(self, "compressor", compressor_parsed) object.__setattr__(self, "order", order_parsed) object.__setattr__(self, "dimension_separator", dimension_separator_parsed) @@ -288,7 +345,23 @@ def __init__( def ndim(self) -> int: return len(self.shape) - def to_bytes(self) -> bytes: + @property + def dtype(self) -> np.dtype[Any]: + return self.data_type + + @property + def chunks(self) -> ChunkCoords: + return self.chunk_grid.chunk_shape + + @property + def codec_pipeline(self) -> CodecPipeline: + from zarr.codecs import BatchedCodecPipeline + + return BatchedCodecPipeline.from_list( + [V2Filters(self.filters or [], self.order), V2Compressor(self.compressor)] + ) + + def to_bytes(self) -> dict[str, bytes]: def _json_convert(o): if isinstance(o, np.dtype): if o.fields is None: @@ -297,16 +370,54 @@ def _json_convert(o): return o.descr raise TypeError - return json.dumps(self.to_dict(), default=_json_convert).encode() + zarray_dict = self.to_dict() + assert isinstance(zarray_dict, dict) + zattrs_dict = zarray_dict.pop("attributes", {}) + assert isinstance(zattrs_dict, dict) + return { + ZARRAY_JSON: json.dumps(zarray_dict, default=_json_convert).encode(), + ZATTRS_JSON: json.dumps(zattrs_dict).encode(), + } @classmethod - def from_dict(cls, data: Dict[str, Any]) -> ArrayV2Metadata: + def 
from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: # check that the zarr_format attribute is correct _ = parse_zarr_format_v2(data.pop("zarr_format")) return cls(**data) + def to_dict(self) -> JSON: + zarray_dict = super().to_dict() + + assert isinstance(zarray_dict, dict) + + _ = zarray_dict.pop("chunk_grid") + zarray_dict["chunks"] = self.chunk_grid.chunk_shape + + _ = zarray_dict.pop("data_type") + zarray_dict["dtype"] = self.data_type + + return zarray_dict + + def get_chunk_spec(self, _chunk_coords: ChunkCoords, order: Literal["C", "F"]) -> ArraySpec: + return ArraySpec( + shape=self.chunk_grid.chunk_shape, + dtype=self.dtype, + fill_value=self.fill_value, + order=order, + ) + + def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str: + chunk_identifier = self.dimension_separator.join(map(str, chunk_coords)) + return "0" if chunk_identifier == "" else chunk_identifier + + def update_shape(self, shape: ChunkCoords) -> Self: + return replace(self, shape=shape) + + def update_attributes(self, attributes: dict[str, JSON]) -> Self: + return replace(self, attributes=attributes) + -def parse_dimension_names(data: Any) -> Tuple[str, ...] | None: +def parse_dimension_names(data: Any) -> tuple[str, ...] | None: if data is None: return data if isinstance(data, Iterable) and all([isinstance(x, str) for x in data]): @@ -316,11 +427,11 @@ def parse_dimension_names(data: Any) -> Tuple[str, ...] 
| None: # todo: real validation -def parse_attributes(data: Any) -> Dict[str, JSON]: +def parse_attributes(data: Any) -> dict[str, JSON]: if data is None: return {} - data_json = cast(Dict[str, JSON], data) + data_json = cast(dict[str, JSON], data) return data_json @@ -366,7 +477,7 @@ def parse_v2_metadata(data: ArrayV2Metadata) -> ArrayV2Metadata: return data -def parse_codecs(data: Iterable[Union[Codec, JSON]]) -> CodecPipeline: +def parse_codecs(data: Iterable[Codec | JSON]) -> CodecPipeline: from zarr.codecs import BatchedCodecPipeline if not isinstance(data, Iterable): diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py index 710eb3e527..89363373ba 100644 --- a/tests/v3/test_group.py +++ b/tests/v3/test_group.py @@ -233,10 +233,7 @@ def test_asyncgroup_from_dict(store: MemoryStore | LocalStore, data: dict[str, A @pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"]) -@pytest.mark.parametrize( - "zarr_format", - (pytest.param(2, marks=pytest.mark.xfail(reason="V2 arrays cannot be created yet.")), 3), -) +@pytest.mark.parametrize("zarr_format", (2, 3)) async def test_asyncgroup_getitem(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: """ Create an `AsyncGroup`, then create members of that group, and ensure that we can access those @@ -263,10 +260,7 @@ async def test_asyncgroup_getitem(store: LocalStore | MemoryStore, zarr_format: @pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"]) -@pytest.mark.parametrize( - "zarr_format", - (2, 3), -) +@pytest.mark.parametrize("zarr_format", (2, 3)) async def test_asyncgroup_delitem(store: LocalStore | MemoryStore, zarr_format: ZarrFormat) -> None: agroup = await AsyncGroup.create(store=store, zarr_format=zarr_format) sub_array_path = "sub_array" @@ -315,10 +309,7 @@ async def test_asyncgroup_create_group( @pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"]) -@pytest.mark.parametrize( - "zarr_format", - (pytest.param(2, 
marks=pytest.mark.xfail(reason="V2 arrays cannot be created yet")), 3), -) +@pytest.mark.parametrize("zarr_format", (2, 3)) async def test_asyncgroup_create_array( store: LocalStore | MemoryStore, zarr_format: ZarrFormat, diff --git a/tests/v3/test_v2.py b/tests/v3/test_v2.py new file mode 100644 index 0000000000..5b831b1bb0 --- /dev/null +++ b/tests/v3/test_v2.py @@ -0,0 +1,28 @@ +from typing import Iterator +import numpy as np +import pytest + +from zarr.abc.store import Store +from zarr.array import Array +from zarr.store import StorePath, MemoryStore + + +@pytest.fixture +def store() -> Iterator[Store]: + yield StorePath(MemoryStore()) + + +def test_simple(store: Store): + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + a = Array.create( + store / "simple_v2", + zarr_format=2, + shape=data.shape, + chunks=(16, 16), + dtype=data.dtype, + fill_value=0, + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) From db974393b3daeb5348af50a1b4e77b6527e811ed Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Thu, 16 May 2024 14:37:23 +0200 Subject: [PATCH 21/21] merge --- src/zarr/array.py | 5 ++++- src/zarr/codecs/pipeline.py | 12 +++++++----- tests/v3/test_codecs.py | 8 ++++---- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/zarr/array.py b/src/zarr/array.py index 80894e01d4..61f91ab966 100644 --- a/src/zarr/array.py +++ b/src/zarr/array.py @@ -368,7 +368,10 @@ async def getitem( # setup output array out = factory( - shape=indexer.shape, dtype=self.metadata.dtype, order=self.order, fill_value=0 + shape=indexer.shape, + dtype=self.metadata.dtype, + order=self.order, + fill_value=0, # TODO use fill_value ) # reading chunks and decoding them diff --git a/src/zarr/codecs/pipeline.py b/src/zarr/codecs/pipeline.py index 57f48d2b99..8396a0c2ce 100644 --- a/src/zarr/codecs/pipeline.py +++ b/src/zarr/codecs/pipeline.py @@ -3,7 +3,6 @@ from itertools import islice from typing import TYPE_CHECKING, Iterator, TypeVar, Iterable from 
warnings import warn -import numpy as np from dataclasses import dataclass from zarr.config import config @@ -370,11 +369,12 @@ def _merge_chunk_array( if is_total_slice(chunk_selection, chunk_spec.shape): return new_chunk_array_slice if existing_chunk_array is None: - chunk_array = np.empty( - chunk_spec.shape, + chunk_array = NDBuffer.create( + shape=chunk_spec.shape, dtype=chunk_spec.dtype, + order=chunk_spec.order, + fill_value=chunk_spec.fill_value, ) - chunk_array.fill(chunk_spec.fill_value) else: chunk_array = existing_chunk_array.copy() # make a writable copy chunk_array[chunk_selection] = new_chunk_array_slice @@ -388,7 +388,9 @@ def _merge_chunk_array( ] chunk_array_batch = [ - None if np.all(chunk_array == chunk_spec.fill_value) else chunk_array + None + if chunk_array is None or chunk_array.all_equal(chunk_spec.fill_value) + else chunk_array for chunk_array, (_, chunk_spec, _, _) in zip(chunk_array_batch, batch_info) ] diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 665e3124c0..73553b5565 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -615,9 +615,9 @@ async def test_delete_empty_chunks(store: Store): assert await (store / "delete_empty_chunks/c0/0").get() is None -async def test_delete_empty_sharded_chunks(store: Store): +async def test_delete_empty_shards(store: Store): a = await AsyncArray.create( - store / "delete_empty_sharded_chunks", + store / "delete_empty_shards", shape=(16, 16), chunk_shape=(8, 16), dtype="uint16", @@ -635,8 +635,8 @@ async def test_delete_empty_sharded_chunks(store: Store): data = np.ones((16, 16), dtype="uint16") data[:8, :8] = 0 assert np.array_equal(data, await _AsyncArrayProxy(a)[:, :].get()) - assert await (store / "delete_empty_sharded_chunks/c/1/0").get() is None - chunk_bytes = await (store / "delete_empty_sharded_chunks/c/0/0").get() + assert await (store / "delete_empty_shards/c/1/0").get() is None + chunk_bytes = await (store / "delete_empty_shards/c/0/0").get() assert 
chunk_bytes is not None and len(chunk_bytes) == 16 * 2 + 8 * 8 * 2 + 4