From ffd58c37bf6071ee61aad44d9638f1aa7971dbf0 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 3 May 2024 22:58:52 +0200 Subject: [PATCH 1/7] adds wrappers for numcodecs codecs --- src/zarr/codecs/__init__.py | 6 ++ src/zarr/codecs/numcodecs_.py | 185 ++++++++++++++++++++++++++++++++++ tests/v3/test_codecs.py | 86 +++++++++++++++- 3 files changed, 276 insertions(+), 1 deletion(-) create mode 100644 src/zarr/codecs/numcodecs_.py diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 8fa0c9f7b0..ab7c373c7c 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -3,6 +3,12 @@ from zarr.codecs.blosc import BloscCodec, BloscCname, BloscShuffle # noqa: F401 from zarr.codecs.bytes import BytesCodec, Endian # noqa: F401 from zarr.codecs.crc32c_ import Crc32cCodec # noqa: F401 +from zarr.codecs.numcodecs_ import ( + NumcodecsBytesBytesCodec, + NumcodecsArrayArrayCodec, + AsTypeCodec, + PackbitsCodec, +) # noqa: F401 from zarr.codecs.gzip import GzipCodec # noqa: F401 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py new file mode 100644 index 0000000000..fe98b4c2f5 --- /dev/null +++ b/src/zarr/codecs/numcodecs_.py @@ -0,0 +1,185 @@ +from dataclasses import dataclass +import math +from typing import Type +from typing_extensions import Self +from warnings import warn + +import numpy as np +from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec + +import numcodecs +from zarr.codecs.registry import register_codec +from zarr.common import JSON, ArraySpec, BytesLike, parse_named_configuration, product, to_thread +from zarr.config import RuntimeConfiguration +from zarr.metadata import ArrayMetadata + +CODEC_PREFIX = "https://zarr.dev/numcodecs/" + + +def parse_codec_configuration(name: str, expected_name_prefix: str) -> dict[str, JSON]: + parsed_name, parsed_configuration = parse_named_configuration(name) + if not parsed_name.startswith(expected_name_prefix): + raise ValueError( + f"Expected name to start with '{expected_name_prefix}'. Got {name} instead." + ) + id = parsed_name[len(expected_name_prefix) :] + return {"id": id, **parsed_configuration} + + +@dataclass(frozen=True) +class NumcodecsCodec(Codec): + codec: numcodecs.abc.Codec + codec_id: str + + def __init__(self, *, codec_config: dict[str, JSON]) -> None: + if "id" not in codec_config: + codec_config = {"id": self.codec_id, **codec_config} + object.__setattr__(self, "codec", numcodecs.get_codec(codec_config)) + warn( + "Numcodecs codecs are not in the Zarr version 3 specification and " + "may not be supported by other zarr implementations.", + category=UserWarning, + ) + + @classmethod + def from_dict(cls, data: dict[str, JSON]) -> Self: + codec_config = parse_codec_configuration(data, CODEC_PREFIX) + cls(codec_config) + + def to_dict(self) -> JSON: + codec_config = self.codec.get_config() + del codec_config["id"] + return { + "name": f"{CODEC_PREFIX}{self.codec.codec_id}", + "configuration": codec_config, + } + + +@dataclass(frozen=True) +class NumcodecsBytesBytesCodec(BytesBytesCodec, NumcodecsCodec): + def __init__(self, *, codec_config: dict[str, JSON]) -> None: + super().__init__(codec_config=codec_config) + + async def decode( + self, + chunk_bytes: BytesLike, + _chunk_spec: ArraySpec, + _runtime_configuration: RuntimeConfiguration, + ) -> BytesLike: + return await to_thread(self.codec.decode, chunk_bytes) + + async def encode( + self, + chunk_bytes: BytesLike, + _chunk_spec: ArraySpec, + _runtime_configuration: RuntimeConfiguration, + ) -> BytesLike: + return await to_thread(self.codec.encode, chunk_bytes) + + +@dataclass(frozen=True) +class NumcodecsArrayArrayCodec(ArrayArrayCodec, NumcodecsCodec): + def __init__(self, *, codec_config: dict[str, JSON]) -> None: + super().__init__(codec_config=codec_config) + + async def decode( + self, + chunk_array: np.ndarray, + chunk_spec: ArraySpec, + _runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray: + out = await to_thread(self.codec.decode, chunk_array) + return out.reshape(chunk_spec.shape) + + async def encode( + self, + chunk_array: np.ndarray, + _chunk_spec: ArraySpec, + _runtime_configuration: RuntimeConfiguration, + ) -> np.ndarray: + return await to_thread(self.codec.encode, chunk_array) + + +def make_bytes_bytes_codec(codec_id: str) -> Type[NumcodecsBytesBytesCodec]: + # rename for class scope + _codec_id = codec_id + + @dataclass(frozen=True) + class _Codec(NumcodecsBytesBytesCodec): + codec_id = _codec_id + + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_config=codec_config) + + return _Codec + + +def make_array_array_codec(codec_id: str) -> Type[NumcodecsArrayArrayCodec]: + # rename for class scope + _codec_id = codec_id + + @dataclass(frozen=True) + class _Codec(NumcodecsArrayArrayCodec): + codec_id = _codec_id + + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + if "id" not in codec_config: + codec_config = {"id": self.codec_id, **codec_config} + super().__init__(codec_config=codec_config) + + return _Codec + + +@dataclass(frozen=True) +class AsTypeCodec(NumcodecsArrayArrayCodec): + codec_id = "astype" + + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_config=codec_config) + + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + return ArraySpec( + chunk_spec.shape, + np.dtype(self.codec.get_config()["encode_dtype"]), + chunk_spec.fill_value, + ) + + def evolve(self, array_spec: ArraySpec) -> Self: + codec_config = self.codec.get_config() + if str(array_spec.dtype) != codec_config["decode_dtype"]: + return AsTypeCodec({**codec_config, "decode_dtype": str(array_spec.dtype)}) + + +@dataclass(frozen=True) +class PackbitsCodec(NumcodecsArrayArrayCodec): + codec_id = "packbits" + + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_config=codec_config) + + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + return ArraySpec( + (1 + math.ceil(product(chunk_spec.shape) / 8),), + np.dtype("uint8"), + chunk_spec.fill_value, + ) + + def validate(self, array_metadata: ArrayMetadata) -> None: + if array_metadata.dtype != np.dtype("bool"): + raise ValueError(f"Packbits filter requires bool dtype. Got {array_metadata.dtype}.") + + +register_codec(f"{CODEC_PREFIX}blosc", make_bytes_bytes_codec("blosc")) +register_codec(f"{CODEC_PREFIX}lz4", make_bytes_bytes_codec("lz4")) +register_codec(f"{CODEC_PREFIX}zstd", make_bytes_bytes_codec("zstd")) +register_codec(f"{CODEC_PREFIX}zlib", make_bytes_bytes_codec("zlib")) +register_codec(f"{CODEC_PREFIX}gzip", make_bytes_bytes_codec("gzip")) +register_codec(f"{CODEC_PREFIX}bz2", make_bytes_bytes_codec("bz2")) +register_codec(f"{CODEC_PREFIX}lzma", make_bytes_bytes_codec("lzma")) + +register_codec(f"{CODEC_PREFIX}delta", make_array_array_codec("delta")) +register_codec(f"{CODEC_PREFIX}fixedscaleoffset", make_array_array_codec("fixedscaleoffset")) +register_codec(f"{CODEC_PREFIX}quantize", make_array_array_codec("quantize")) +register_codec(f"{CODEC_PREFIX}bitround", make_array_array_codec("bitround")) +register_codec(f"{CODEC_PREFIX}packbits", PackbitsCodec) +register_codec(f"{CODEC_PREFIX}astype", AsTypeCodec) diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index ffd225668b..09f4a10e2f 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -7,10 +7,11 @@ import numpy as np import pytest +from zarr.codecs.registry import get_codec_class import zarr.v2 from zarr.abc.codec import Codec from zarr.array import Array, AsyncArray -from zarr.common import Selection +from zarr.common import JSON, Selection from zarr.indexing import morton_order_iter from zarr.codecs import ( ShardingCodec, @@ -1042,3 +1043,86 @@ def test_update_attributes_array(store: Store): a = Array.open(store / "update_attributes") assert a.attrs["hello"] == "zarrita" + + +@pytest.mark.parametrize("codec_id", ["blosc", "lz4", "zstd", "zlib", "gzip", "bz2", "lzma"]) +def test_generic_codec(store: Store, codec_id: str): + data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + BytesCodec(), + get_codec_class(f"https://zarr.dev/numcodecs/{codec_id}")({"id": codec_id}), + ], + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) + + +@pytest.mark.parametrize( + "codec_config", + [ + {"id": "delta", "dtype": "float32"}, + {"id": "fixedscaleoffset", "dtype": "float32", "offset": -2, "scale": 5.5}, + { + "id": "fixedscaleoffset", + "dtype": "float32", + "offset": 0, + "scale": 25.6, + "astype": "uint32", + }, + {"id": "quantize", "digits": 3, "dtype": "float32"}, + {"id": "bitround", "keepbits": 10}, + {"id": "astype", "encode_dtype": "float32", "decode_dtype": "float64"}, + ], + ids=["delta", "fixedscaleoffset", "fixedscaleoffset2", "quantize", "bitround", "astype"], +) +def test_generic_filter(store: Store, codec_config: dict[str, JSON]): + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + codec_id = codec_config["id"] + del codec_config["id"] + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + get_codec_class(f"https://zarr.dev/numcodecs/{codec_id}")(codec_config), + BytesCodec(), + ], + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) + + +def test_generic_filter_packbits(store: Store): + data = np.zeros((16, 16), dtype="bool") + data[0:4, :] = True + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + get_codec_class("https://zarr.dev/numcodecs/packbits")(), + BytesCodec(), + ], + ) + + a[:, :] = data + assert np.array_equal(data, a[:, :]) From eeff4d024863e2210ab0b6f7f8b608db6990ecd0 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 May 2024 21:48:40 +0200 Subject: [PATCH 2/7] better wrappers --- src/zarr/codecs/__init__.py | 7 +- src/zarr/codecs/numcodecs_.py | 156 +++++++++++++++++++++++----------- src/zarr/codecs/registry.py | 22 ++--- tests/v3/test_codecs.py | 93 ++++++++++++++++---- 4 files changed, 192 insertions(+), 86 deletions(-) diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index ab7c373c7c..9afca97d67 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -3,13 +3,8 @@ from zarr.codecs.blosc import BloscCodec, BloscCname, BloscShuffle # noqa: F401 from zarr.codecs.bytes import BytesCodec, Endian # noqa: F401 from zarr.codecs.crc32c_ import Crc32cCodec # noqa: F401 -from zarr.codecs.numcodecs_ import ( - NumcodecsBytesBytesCodec, - NumcodecsArrayArrayCodec, - AsTypeCodec, - PackbitsCodec, -) # noqa: F401 from zarr.codecs.gzip import GzipCodec # noqa: F401 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation # noqa: F401 from zarr.codecs.transpose import TransposeCodec # noqa: F401 from zarr.codecs.zstd import ZstdCodec # noqa: F401 +import zarr.codecs.numcodecs_ # noqa: F401 diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py index fe98b4c2f5..168cdb000b 100644 --- a/src/zarr/codecs/numcodecs_.py +++ b/src/zarr/codecs/numcodecs_.py @@ -1,6 +1,6 @@ from dataclasses import dataclass +from functools import cached_property import math -from typing import Type from typing_extensions import Self from warnings import warn @@ -16,11 +16,11 @@ CODEC_PREFIX = "https://zarr.dev/numcodecs/" -def parse_codec_configuration(name: str, expected_name_prefix: str) -> dict[str, JSON]: - parsed_name, parsed_configuration = parse_named_configuration(name) +def parse_codec_configuration(data: dict[str, JSON], expected_name_prefix: str) -> dict[str, JSON]: + parsed_name, parsed_configuration = parse_named_configuration(data) if not parsed_name.startswith(expected_name_prefix): raise ValueError( - f"Expected name to start with '{expected_name_prefix}'. Got {name} instead." + f"Expected name to start with '{expected_name_prefix}'. Got {parsed_name} instead." ) id = parsed_name[len(expected_name_prefix) :] return {"id": id, **parsed_configuration} @@ -28,37 +28,48 @@ def parse_codec_configuration(name: str, expected_name_prefix: str) -> dict[str, @dataclass(frozen=True) class NumcodecsCodec(Codec): - codec: numcodecs.abc.Codec - codec_id: str + codec_config: dict[str, JSON] - def __init__(self, *, codec_config: dict[str, JSON]) -> None: + def __init__(self, *, codec_id: str | None = None, codec_config: dict[str, JSON]) -> None: if "id" not in codec_config: - codec_config = {"id": self.codec_id, **codec_config} - object.__setattr__(self, "codec", numcodecs.get_codec(codec_config)) + if not codec_id: + raise ValueError( + "The codec id needs to be supplied either through the id attribute " + "of the codec_config or through the codec_id argument." + ) + codec_config = {"id": codec_id, **codec_config} + elif codec_id and codec_config["id"] != codec_id: + raise ValueError(f"Codec id does not match {codec_id}. Got: {codec_config['id']}.") + + object.__setattr__(self, "codec_config", codec_config) warn( "Numcodecs codecs are not in the Zarr version 3 specification and " "may not be supported by other zarr implementations.", category=UserWarning, ) + @cached_property + def _codec(self) -> numcodecs.abc.Codec: + return numcodecs.get_codec(self.codec_config) + @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: codec_config = parse_codec_configuration(data, CODEC_PREFIX) - cls(codec_config) + assert isinstance(codec_config["id"], str) # for mypy + return cls(codec_config=codec_config) def to_dict(self) -> JSON: - codec_config = self.codec.get_config() - del codec_config["id"] + codec_config = self.codec_config.copy() + codec_id = codec_config.pop("id") return { - "name": f"{CODEC_PREFIX}{self.codec.codec_id}", + "name": f"{CODEC_PREFIX}{codec_id}", "configuration": codec_config, } -@dataclass(frozen=True) -class NumcodecsBytesBytesCodec(BytesBytesCodec, NumcodecsCodec): - def __init__(self, *, codec_config: dict[str, JSON]) -> None: - super().__init__(codec_config=codec_config) +class NumcodecsBytesBytesCodec(NumcodecsCodec, BytesBytesCodec): + def __init__(self, *, codec_id: str, codec_config: dict[str, JSON]) -> None: + super().__init__(codec_id=codec_id, codec_config=codec_config) async def decode( self, @@ -66,7 +77,13 @@ async def decode( _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> BytesLike: - return await to_thread(self.codec.decode, chunk_bytes) + return await to_thread(self._codec.decode, chunk_bytes) + + def _encode(self, chunk_bytes: BytesLike) -> BytesLike: + encoded = self._codec.encode(chunk_bytes) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return encoded.tobytes() + return encoded async def encode( self, @@ -74,13 +91,12 @@ async def encode( _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> BytesLike: - return await to_thread(self.codec.encode, chunk_bytes) + return await to_thread(self._encode, chunk_bytes) -@dataclass(frozen=True) -class NumcodecsArrayArrayCodec(ArrayArrayCodec, NumcodecsCodec): - def __init__(self, *, codec_config: dict[str, JSON]) -> None: - super().__init__(codec_config=codec_config) +class NumcodecsArrayArrayCodec(NumcodecsCodec, ArrayArrayCodec): + def __init__(self, *, codec_id: str, codec_config: dict[str, JSON]) -> None: + super().__init__(codec_id=codec_id, codec_config=codec_config) async def decode( self, @@ -88,7 +104,7 @@ async def decode( chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: - out = await to_thread(self.codec.decode, chunk_array) + out = await to_thread(self._codec.decode, chunk_array) return out.reshape(chunk_spec.shape) async def encode( @@ -97,65 +113,95 @@ async def encode( _chunk_spec: ArraySpec, _runtime_configuration: RuntimeConfiguration, ) -> np.ndarray: - return await to_thread(self.codec.encode, chunk_array) + return await to_thread(self._codec.encode, chunk_array) -def make_bytes_bytes_codec(codec_id: str) -> Type[NumcodecsBytesBytesCodec]: +def make_bytes_bytes_codec(codec_id: str) -> type[NumcodecsBytesBytesCodec]: # rename for class scope _codec_id = codec_id - @dataclass(frozen=True) class _Codec(NumcodecsBytesBytesCodec): - codec_id = _codec_id - def __init__(self, codec_config: dict[str, JSON] = {}) -> None: - super().__init__(codec_config=codec_config) + super().__init__(codec_id=_codec_id, codec_config=codec_config) return _Codec -def make_array_array_codec(codec_id: str) -> Type[NumcodecsArrayArrayCodec]: +def make_array_array_codec(codec_id: str) -> type[NumcodecsArrayArrayCodec]: # rename for class scope _codec_id = codec_id - @dataclass(frozen=True) class _Codec(NumcodecsArrayArrayCodec): - codec_id = _codec_id - def __init__(self, codec_config: dict[str, JSON] = {}) -> None: - if "id" not in codec_config: - codec_config = {"id": self.codec_id, **codec_config} - super().__init__(codec_config=codec_config) + super().__init__(codec_id=_codec_id, codec_config=codec_config) return _Codec -@dataclass(frozen=True) -class AsTypeCodec(NumcodecsArrayArrayCodec): - codec_id = "astype" +def make_checksum_codec(codec_id: str) -> type[NumcodecsBytesBytesCodec]: + # rename for class scope + _codec_id = codec_id + + class _ChecksumCodec(NumcodecsBytesBytesCodec): + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_id=_codec_id, codec_config=codec_config) + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + return input_byte_length + 4 + + return _ChecksumCodec + + +class FixedScaleOffsetCodec(NumcodecsArrayArrayCodec): def __init__(self, codec_config: dict[str, JSON] = {}) -> None: - super().__init__(codec_config=codec_config) + super().__init__(codec_id="fixedscaleoffset", codec_config=codec_config) + + def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: + if astype := self.codec_config.get("astype"): + return ArraySpec( + chunk_spec.shape, + np.dtype(astype), + chunk_spec.fill_value, + ) + return chunk_spec + + def evolve(self, array_spec: ArraySpec) -> Self: + if str(array_spec.dtype) != self.codec_config.get("dtype"): + return self.__class__({**self.codec_config, "dtype": str(array_spec.dtype)}) + return self + + +class QuantizeCodec(NumcodecsArrayArrayCodec): + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_id="quantize", codec_config=codec_config) + + def evolve(self, array_spec: ArraySpec) -> Self: + if str(array_spec.dtype) != self.codec_config.get("dtype"): + return self.__class__({**self.codec_config, "dtype": str(array_spec.dtype)}) + return self + + +class AsTypeCodec(NumcodecsArrayArrayCodec): + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_id="astype", codec_config=codec_config) def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: return ArraySpec( chunk_spec.shape, - np.dtype(self.codec.get_config()["encode_dtype"]), + np.dtype(self.codec_config["encode_dtype"]), chunk_spec.fill_value, ) def evolve(self, array_spec: ArraySpec) -> Self: - codec_config = self.codec.get_config() - if str(array_spec.dtype) != codec_config["decode_dtype"]: - return AsTypeCodec({**codec_config, "decode_dtype": str(array_spec.dtype)}) + decode_dtype = self.codec_config.get("decode_dtype") + if str(array_spec.dtype) != decode_dtype: + return self.__class__({**self.codec_config, "decode_dtype": str(array_spec.dtype)}) + return self -@dataclass(frozen=True) class PackbitsCodec(NumcodecsArrayArrayCodec): - codec_id = "packbits" - def __init__(self, codec_config: dict[str, JSON] = {}) -> None: - super().__init__(codec_config=codec_config) + super().__init__(codec_id="packbits", codec_config=codec_config) def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec: return ArraySpec( @@ -169,6 +215,7 @@ def validate(self, array_metadata: ArrayMetadata) -> None: raise ValueError(f"Packbits filter requires bool dtype. Got {array_metadata.dtype}.") +# bytes-to-bytes codecs register_codec(f"{CODEC_PREFIX}blosc", make_bytes_bytes_codec("blosc")) register_codec(f"{CODEC_PREFIX}lz4", make_bytes_bytes_codec("lz4")) register_codec(f"{CODEC_PREFIX}zstd", make_bytes_bytes_codec("zstd")) @@ -177,9 +224,16 @@ def validate(self, array_metadata: ArrayMetadata) -> None: register_codec(f"{CODEC_PREFIX}bz2", make_bytes_bytes_codec("bz2")) register_codec(f"{CODEC_PREFIX}lzma", make_bytes_bytes_codec("lzma")) +# array-to-array codecs ("filters") register_codec(f"{CODEC_PREFIX}delta", make_array_array_codec("delta")) -register_codec(f"{CODEC_PREFIX}fixedscaleoffset", make_array_array_codec("fixedscaleoffset")) -register_codec(f"{CODEC_PREFIX}quantize", make_array_array_codec("quantize")) +register_codec(f"{CODEC_PREFIX}fixedscaleoffset", FixedScaleOffsetCodec) +register_codec(f"{CODEC_PREFIX}quantize", QuantizeCodec) register_codec(f"{CODEC_PREFIX}bitround", make_array_array_codec("bitround")) register_codec(f"{CODEC_PREFIX}packbits", PackbitsCodec) register_codec(f"{CODEC_PREFIX}astype", AsTypeCodec) + +# bytes-to-bytes checksum codecs +register_codec(f"{CODEC_PREFIX}crc32", make_checksum_codec("crc32")) +register_codec(f"{CODEC_PREFIX}adler32", make_checksum_codec("crc32")) +register_codec(f"{CODEC_PREFIX}fletcher32", make_checksum_codec("crc32")) +register_codec(f"{CODEC_PREFIX}jenkins_lookup3", make_checksum_codec("crc32")) diff --git a/src/zarr/codecs/registry.py b/src/zarr/codecs/registry.py index 140e1372ef..707331efe4 100644 --- a/src/zarr/codecs/registry.py +++ b/src/zarr/codecs/registry.py @@ -2,33 +2,27 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from typing import Dict, Type from zarr.abc.codec import Codec from importlib.metadata import EntryPoint, entry_points as get_entry_points -__codec_registry: Dict[str, Type[Codec]] = {} -__lazy_load_codecs: Dict[str, EntryPoint] = {} +__codec_registry: dict[str, type[Codec]] = {} +__lazy_load_codecs: dict[str, EntryPoint] = {} -def _collect_entrypoints() -> None: +def _collect_entrypoints() -> dict[str, EntryPoint]: entry_points = get_entry_points() - if hasattr(entry_points, "select"): - # If entry_points() has a select method, use that. Python 3.10+ - for e in entry_points.select(group="zarr.codecs"): - __lazy_load_codecs[e.name] = e - else: - # Otherwise, fallback to using get - for e in entry_points.get("zarr.codecs", []): - __lazy_load_codecs[e.name] = e + for e in entry_points.select(group="zarr.codecs"): + __lazy_load_codecs[e.name] = e + return __lazy_load_codecs -def register_codec(key: str, codec_cls: Type[Codec]) -> None: +def register_codec(key: str, codec_cls: type[Codec]) -> None: __codec_registry[key] = codec_cls -def get_codec_class(key: str) -> Type[Codec]: +def get_codec_class(key: str) -> type[Codec]: item = __codec_registry.get(key) if item is None: if key in __lazy_load_codecs: diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 09f4a10e2f..6959633735 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -1062,7 +1062,7 @@ def test_generic_codec(store: Store, codec_id: str): ], ) - a[:, :] = data + a[:, :] = data.copy() assert np.array_equal(data, a[:, :]) @@ -1070,19 +1070,16 @@ def test_generic_codec(store: Store, codec_id: str): "codec_config", [ {"id": "delta", "dtype": "float32"}, - {"id": "fixedscaleoffset", "dtype": "float32", "offset": -2, "scale": 5.5}, - { - "id": "fixedscaleoffset", - "dtype": "float32", - "offset": 0, - "scale": 25.6, - "astype": "uint32", - }, - {"id": "quantize", "digits": 3, "dtype": "float32"}, - {"id": "bitround", "keepbits": 10}, + {"id": "fixedscaleoffset", "offset": 0, "scale": 25.5}, + {"id": "fixedscaleoffset", "offset": 0, "scale": 51, "astype": "uint16"}, {"id": "astype", "encode_dtype": "float32", "decode_dtype": "float64"}, ], - ids=["delta", "fixedscaleoffset", "fixedscaleoffset2", "quantize", "bitround", "astype"], + ids=[ + "delta", + "fixedscaleoffset", + "fixedscaleoffset2", + "astype", + ], ) def test_generic_filter(store: Store, codec_config: dict[str, JSON]): data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) @@ -1103,17 +1100,60 @@ def test_generic_filter(store: Store, codec_config: dict[str, JSON]): ], ) - a[:, :] = data + a[:, :] = data.copy() + a = Array.open(store / "generic") assert np.array_equal(data, a[:, :]) +def test_generic_filter_bitround(store: Store): + data = np.linspace(0, 1, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic_bitround", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + get_codec_class("https://zarr.dev/numcodecs/bitround")({"keepbits": 3}), + BytesCodec(), + ], + ) + + a[:, :] = data.copy() + a = Array.open(store / "generic_bitround") + assert np.allclose(data, a[:, :], atol=0.1) + + +def test_generic_filter_quantize(store: Store): + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic_quantize", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + get_codec_class("https://zarr.dev/numcodecs/quantize")({"digits": 3}), + BytesCodec(), + ], + ) + + a[:, :] = data.copy() + a = Array.open(store / "generic_quantize") + assert np.allclose(data, a[:, :], atol=0.001) + + def test_generic_filter_packbits(store: Store): data = np.zeros((16, 16), dtype="bool") data[0:4, :] = True with pytest.warns(UserWarning, match="Numcodecs.*"): a = Array.create( - store / "generic", + store / "generic_packbits", shape=data.shape, chunk_shape=(16, 16), dtype=data.dtype, @@ -1124,5 +1164,28 @@ def test_generic_filter_packbits(store: Store): ], ) - a[:, :] = data + a[:, :] = data.copy() + a = Array.open(store / "generic_packbits") + assert np.array_equal(data, a[:, :]) + + +@pytest.mark.parametrize("codec_id", ["crc32", "adler32", "fletcher32", "jenkins_lookup3"]) +def test_generic_checksum(store: Store, codec_id: str): + data = np.linspace(0, 10, 256, dtype="float32").reshape((16, 16)) + + with pytest.warns(UserWarning, match="Numcodecs.*"): + a = Array.create( + store / "generic_checksum", + shape=data.shape, + chunk_shape=(16, 16), + dtype=data.dtype, + fill_value=0, + codecs=[ + BytesCodec(), + get_codec_class(f"https://zarr.dev/numcodecs/{codec_id}")(), + ], + ) + + a[:, :] = data.copy() + a = Array.open(store / "generic_checksum") assert np.array_equal(data, a[:, :]) From 4d9585e27efffd02bd858e0157328cbd7a7a3c05 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 May 2024 21:54:38 +0200 Subject: [PATCH 3/7] fixes types --- pyproject.toml | 3 --- src/zarr/codecs/numcodecs_.py | 5 ++++- src/zarr/common.py | 19 +++++++++++++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9887c824ca..b2a9c0c622 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,9 +207,6 @@ module = [ "zarr.array", "zarr.common", "zarr.store.local", - "zarr.codecs.blosc", - "zarr.codecs.gzip", - "zarr.codecs.zstd", ] disallow_untyped_calls = false diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py index 168cdb000b..f74af28baf 100644 --- a/src/zarr/codecs/numcodecs_.py +++ b/src/zarr/codecs/numcodecs_.py @@ -66,6 +66,9 @@ def to_dict(self) -> JSON: "configuration": codec_config, } + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + return input_byte_length + class NumcodecsBytesBytesCodec(NumcodecsCodec, BytesBytesCodec): def __init__(self, *, codec_id: str, codec_config: dict[str, JSON]) -> None: @@ -226,9 +229,9 @@ def validate(self, array_metadata: ArrayMetadata) -> None: # array-to-array codecs ("filters") register_codec(f"{CODEC_PREFIX}delta", make_array_array_codec("delta")) +register_codec(f"{CODEC_PREFIX}bitround", make_array_array_codec("bitround")) register_codec(f"{CODEC_PREFIX}fixedscaleoffset", FixedScaleOffsetCodec) register_codec(f"{CODEC_PREFIX}quantize", QuantizeCodec) -register_codec(f"{CODEC_PREFIX}bitround", make_array_array_codec("bitround")) register_codec(f"{CODEC_PREFIX}packbits", PackbitsCodec) register_codec(f"{CODEC_PREFIX}astype", AsTypeCodec) diff --git a/src/zarr/common.py b/src/zarr/common.py index 7d8431f97e..8581f8f8cd 100644 --- a/src/zarr/common.py +++ b/src/zarr/common.py @@ -1,5 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union, Tuple, Iterable, Dict, List, TypeVar, overload, Any +from typing import ( + TYPE_CHECKING, + ParamSpec, + Union, + Tuple, + Iterable, + Dict, + List, + TypeVar, + overload, + Any, +) import asyncio import contextvars from dataclasses import dataclass @@ -48,7 +59,11 @@ async def run(item): return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items]) -async def to_thread(func, /, *args, **kwargs): +P = ParamSpec("P") +U = TypeVar("U") + + +async def to_thread(func: Callable[P, U], /, *args: P.args, **kwargs: P.kwargs) -> U: loop = asyncio.get_running_loop() ctx = contextvars.copy_context() func_call = functools.partial(ctx.run, func, *args, **kwargs) From 45d7a1c3ee3204f7863e4d697dd2b14c6853469e Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 May 2024 22:02:09 +0200 Subject: [PATCH 4/7] fixes types --- pyproject.toml | 5 ----- src/zarr/codecs/sharding.py | 11 +++++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b2a9c0c622..97746e2d36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,11 +181,6 @@ check_untyped_defs = false [[tool.mypy.overrides]] module = [ "zarr.v2.*", - "zarr.abc.codec", - "zarr.codecs.bytes", - "zarr.codecs.pipeline", - "zarr.codecs.sharding", - "zarr.codecs.transpose", "zarr.array_v2", "zarr.array", "zarr.sync", diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index d4f8b7dfc9..be9ed7bd5c 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -126,7 +126,10 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: return cls(offsets_and_lengths) -class _ShardProxy(Mapping): +ShardMapping = Mapping[ChunkCoords, BytesLike | None] + + +class _ShardProxy(ShardMapping): index: _ShardIndex buf: BytesLike @@ -175,7 +178,7 @@ def merge_with_morton_order( cls, chunks_per_shard: ChunkCoords, tombstones: Set[ChunkCoords], - *shard_dicts: Mapping[ChunkCoords, BytesLike], + *shard_dicts: ShardMapping, ) -> _ShardBuilder: obj = cls.create_empty(chunks_per_shard) for chunk_coords in morton_order_iter(chunks_per_shard): @@ -375,7 +378,7 @@ async def decode_partial( all_chunk_coords = set(chunk_coords for chunk_coords, _, _ in indexed_chunks) # reading bytes of all requested chunks - shard_dict: Mapping[ChunkCoords, BytesLike] = {} + shard_dict: ShardMapping = {} if self._is_total_shard(all_chunk_coords, chunks_per_shard): # read entire shard shard_dict_maybe = await self._load_full_shard_maybe(store_path, chunks_per_shard) @@ -417,7 +420,7 @@ async def decode_partial( async def _read_chunk( self, - shard_dict: Mapping[ChunkCoords, Optional[BytesLike]], + shard_dict: ShardMapping, chunk_coords: ChunkCoords, chunk_selection: SliceSelection, out_selection: SliceSelection, From 431e2442d786039e53bd4987a3bcc05510ef320f Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 May 2024 22:05:37 +0200 Subject: [PATCH 5/7] add shuffle --- src/zarr/codecs/numcodecs_.py | 11 +++++++++++ src/zarr/codecs/sharding.py | 8 +++----- tests/v3/test_codecs.py | 4 +++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py index f74af28baf..fa1d64c031 100644 --- a/src/zarr/codecs/numcodecs_.py +++ b/src/zarr/codecs/numcodecs_.py @@ -155,6 +155,16 @@ def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> return _ChecksumCodec +class ShuffleCodec(NumcodecsBytesBytesCodec): + def __init__(self, codec_config: dict[str, JSON] = {}) -> None: + super().__init__(codec_id="shuffle", codec_config=codec_config) + + def evolve(self, array_spec: ArraySpec) -> Self: + if array_spec.dtype.itemsize != self.codec_config.get("elementsize"): + return self.__class__({**self.codec_config, "elementsize": array_spec.dtype.itemsize}) + return self + + class FixedScaleOffsetCodec(NumcodecsArrayArrayCodec): def __init__(self, codec_config: dict[str, JSON] = {}) -> None: super().__init__(codec_id="fixedscaleoffset", codec_config=codec_config) @@ -226,6 +236,7 @@ def validate(self, array_metadata: ArrayMetadata) -> None: register_codec(f"{CODEC_PREFIX}gzip", make_bytes_bytes_codec("gzip")) register_codec(f"{CODEC_PREFIX}bz2", make_bytes_bytes_codec("bz2")) register_codec(f"{CODEC_PREFIX}lzma", make_bytes_bytes_codec("lzma")) +register_codec(f"{CODEC_PREFIX}shuffle", ShuffleCodec) # array-to-array codecs ("filters") register_codec(f"{CODEC_PREFIX}delta", make_array_array_codec("delta")) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index be9ed7bd5c..0436c96f1f 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -18,6 +18,8 @@ from zarr.codecs.registry import register_codec from zarr.common import ( ArraySpec, + ChunkCoords, + BytesLike, ChunkCoordsLike, concurrent_map, parse_enum, @@ -45,13 +47,12 @@ from zarr.store import StorePath from zarr.common import ( JSON, - ChunkCoords, - BytesLike, SliceSelection, ) from zarr.config import RuntimeConfiguration MAX_UINT_64 = 2**64 - 1 +ShardMapping = Mapping[ChunkCoords, BytesLike | None] class ShardingCodecIndexLocation(Enum): @@ -126,9 +127,6 @@ def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardIndex: return cls(offsets_and_lengths) -ShardMapping = Mapping[ChunkCoords, BytesLike | None] - - class _ShardProxy(ShardMapping): index: _ShardIndex buf: BytesLike diff --git a/tests/v3/test_codecs.py b/tests/v3/test_codecs.py index 6959633735..e250d46d54 100644 --- a/tests/v3/test_codecs.py +++ b/tests/v3/test_codecs.py @@ -1045,7 +1045,9 @@ def test_update_attributes_array(store: Store): assert a.attrs["hello"] == "zarrita" -@pytest.mark.parametrize("codec_id", ["blosc", "lz4", "zstd", "zlib", "gzip", "bz2", "lzma"]) +@pytest.mark.parametrize( + "codec_id", ["blosc", "lz4", "zstd", "zlib", "gzip", "bz2", "lzma", "shuffle"] +) def test_generic_codec(store: Store, codec_id: str): data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) From d8b5ca7748f0d1a0bcb049aa414aac4bdbc2d876 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Sun, 5 May 2024 22:29:08 +0200 Subject: [PATCH 6/7] pep8 --- src/zarr/codecs/numcodecs_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py index fa1d64c031..b69ae517c3 100644 --- a/src/zarr/codecs/numcodecs_.py +++ b/src/zarr/codecs/numcodecs_.py @@ -22,7 +22,7 @@ def parse_codec_configuration(data: dict[str, JSON], expected_name_prefix: str) raise ValueError( f"Expected name to start with '{expected_name_prefix}'. Got {parsed_name} instead." ) - id = parsed_name[len(expected_name_prefix) :] + id = parsed_name[len(expected_name_prefix):] return {"id": id, **parsed_configuration} From 33a48b095c816e87287f7aa6c2f8d6f4eadae711 Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Mon, 6 May 2024 21:56:30 +0200 Subject: [PATCH 7/7] fix checksum codecs --- src/zarr/codecs/numcodecs_.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/zarr/codecs/numcodecs_.py b/src/zarr/codecs/numcodecs_.py index b69ae517c3..c7803d5caf 100644 --- a/src/zarr/codecs/numcodecs_.py +++ b/src/zarr/codecs/numcodecs_.py @@ -248,6 +248,6 @@ def validate(self, array_metadata: ArrayMetadata) -> None: # bytes-to-bytes checksum codecs register_codec(f"{CODEC_PREFIX}crc32", make_checksum_codec("crc32")) -register_codec(f"{CODEC_PREFIX}adler32", make_checksum_codec("crc32")) -register_codec(f"{CODEC_PREFIX}fletcher32", make_checksum_codec("crc32")) -register_codec(f"{CODEC_PREFIX}jenkins_lookup3", make_checksum_codec("crc32")) +register_codec(f"{CODEC_PREFIX}adler32", make_checksum_codec("adler32")) +register_codec(f"{CODEC_PREFIX}fletcher32", make_checksum_codec("fletcher32")) +register_codec(f"{CODEC_PREFIX}jenkins_lookup3", make_checksum_codec("jenkins_lookup3"))