From 64bbbd8fce888cb45ed74763e21381831b2f51a9 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Mon, 15 Nov 2021 21:53:45 -0500 Subject: [PATCH 1/9] add v3 store classes Define the StoreV3 class and create v3 versions of most existing stores Add a test_storage_v3.py with test classes inheriting from their v2 counterparts. Only a subset of methods involving differences in v3 behavior were overridden. --- zarr/_storage/store.py | 251 +++++++++- zarr/meta.py | 235 +++++++++- zarr/storage.py | 731 +++++++++++++++++++++++++---- zarr/tests/test_storage.py | 18 +- zarr/tests/test_storage_v3.py | 847 ++++++++++++++++++++++++++++++++++ zarr/tests/util.py | 6 +- 6 files changed, 1996 insertions(+), 92 deletions(-) create mode 100644 zarr/tests/test_storage_v3.py diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 6f5bf78e28..0ff9e0c043 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,7 +1,10 @@ +import json +import sys from collections.abc import MutableMapping +from string import ascii_letters, digits from typing import Any, List, Optional, Union -from zarr.meta import Metadata2 +from zarr.meta import Metadata2, Metadata3, _default_entry_point_metadata_v3 from zarr.util import normalize_storage_path # v2 store keys @@ -131,6 +134,169 @@ def rmdir(self, path: str = "") -> None: _rmdir_from_keys(self, path) +class StoreV3(BaseStore): + _store_version = 3 + _metadata_class = Metadata3 + + @staticmethod + def _valid_key(key: str) -> bool: + """ + Verify that a key conforms to the specification. + + A key is any string containing only character in the range a-z, A-Z, + 0-9, or in the set /.-_ it will return True if that's the case, False + otherwise. + + In addition, in spec v3, keys can only start with the prefix meta/, + data/ or be exactly zarr.json and should not end with /. This should + not be exposed to the user, and is a store implementation detail, so + this method will raise a ValueError in that case. + """ + if sys.version_info > (3, 7): + if not key.isascii(): + return False + if set(key) - set(ascii_letters + digits + "/.-_"): + return False + + if ( + not key.startswith("data/") + and (not key.startswith("meta/")) + and (not key == "zarr.json") + ): + raise ValueError("keys starts with unexpected value: `{}`".format(key)) + + if key.endswith('/'): + raise ValueError("keys may not end in /") + + return True + + def list_prefix(self, prefix): + if prefix.startswith('/'): + raise ValueError("prefix must not begin with /") + # TODO: force prefix to end with /? + return [k for k in self.list() if k.startswith(prefix)] + + def erase(self, key): + self.__delitem__(key) + + def erase_prefix(self, prefix): + assert prefix.endswith("/") + + if prefix == "/": + all_keys = self.list() + else: + all_keys = self.list_prefix(prefix) + for key in all_keys: + self.erase(key) + + def list_dir(self, prefix): + """ + Note: carefully test this with trailing/leading slashes + """ + if prefix: # allow prefix = "" ? + assert prefix.endswith("/") + + all_keys = self.list_prefix(prefix) + len_prefix = len(prefix) + keys = [] + prefixes = [] + for k in all_keys: + trail = k[len_prefix:] + if "/" not in trail: + keys.append(prefix + trail) + else: + prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") + return keys, list(set(prefixes)) + + def list(self): + if hasattr(self, 'keys'): + return list(self.keys()) + raise NotImplementedError( + "The list method has not been implemented for this store type." + ) + + # TODO: Remove listdir? 
This method is just to match the current V2 stores + # The v3 spec mentions: list, list_dir, list_prefix + def listdir(self, path: str = ""): + if path and not path.endswith("/"): + path = path + "/" + keys, prefixes = self.list_dir(path) + prefixes = [p[len(path):].rstrip("/") for p in prefixes] + keys = [k[len(path):] for k in keys] + return keys + prefixes + + # TODO: rmdir here is identical to the rmdir on Store so could potentially + # move to BaseStore instead. + def rmdir(self, path: str = "") -> None: + if not self.is_erasable(): + raise NotImplementedError( + f'{type(self)} is not erasable, cannot call "rmdir"' + ) # pragma: no cover + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + + def __contains__(self, key): + # TODO: re-enable this check? + # if not key.startswith(("meta/", "data/")): + # raise ValueError( + # f'Key must start with either "meta/" or "data/". ' + # f'Got {key}' + # ) + return key in self.list() + + def clear(self): + """Remove all items from store.""" + self.erase_prefix("/") + + def __eq__(self, other): + from zarr.storage import KVStoreV3 # avoid circular import + if isinstance(other, KVStoreV3): + return self._mutable_mapping == other._mutable_mapping + else: + return NotImplemented + + @staticmethod + def _ensure_store(store): + """ + We want to make sure internally that zarr stores are always a class + with a specific interface derived from ``Store``, which is slightly + different than ``MutableMapping``. + + We'll do this conversion in a few places automatically + """ + from zarr.storage import KVStoreV3 # avoid circular import + if store is None: + return None + elif isinstance(store, StoreV3): + return store + elif isinstance(store, MutableMapping): + return KVStoreV3(store) + else: + for attr in [ + "keys", + "values", + "get", + "__setitem__", + "__getitem__", + "__delitem__", + "__contains__", + ]: + if not hasattr(store, attr): + break + else: + return KVStoreV3(store) + + raise ValueError( + "v3 stores must be subclasses of StoreV3, " + "if your store exposes the MutableMapping interface wrap it in " + f"Zarr.storage.KVStoreV3. Got {store}" + ) + + +# allow MutableMapping for backwards compatibility +StoreLike = Union[BaseStore, MutableMapping] + + def _path_to_prefix(path: Optional[str]) -> str: # assume path already normalized if path: @@ -140,17 +306,49 @@ def _path_to_prefix(path: Optional[str]) -> str: return prefix +# TODO: Should this return default metadata or raise an Error if zarr.json +# is absent? 
+def _get_hierarchy_metadata(store=None): + meta = _default_entry_point_metadata_v3 + if store is not None: + version = getattr(store, '_store_version', 2) + if version < 3: + raise ValueError("zarr.json hierarchy metadata not stored for " + f"zarr v{version} stores") + if 'zarr.json' in store: + meta = store._metadata_class.decode_hierarchy_metadata(store['zarr.json']) + return meta + + def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None: # assume path already normalized src_prefix = _path_to_prefix(src_path) dst_prefix = _path_to_prefix(dst_path) - for key in list(store.keys()): - if key.startswith(src_prefix): - new_key = dst_prefix + key.lstrip(src_prefix) - store[new_key] = store.pop(key) - - -def _rmdir_from_keys(store: Union[BaseStore, MutableMapping], path: Optional[str] = None) -> None: + version = getattr(store, '_store_version', 2) + if version == 2: + root_prefixes = [''] + elif version == 3: + root_prefixes = ['meta/root/', 'data/root/'] + for root_prefix in root_prefixes: + _src_prefix = root_prefix + src_prefix + _dst_prefix = root_prefix + dst_prefix + for key in list(store.keys()): + if key.startswith(_src_prefix): + new_key = _dst_prefix + key.lstrip(_src_prefix) + store[new_key] = store.pop(key) + if version == 3: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + _src_array_json = 'meta/root/' + src_prefix[:-1] + '.array' + sfx + if _src_array_json in store: + new_key = 'meta/root/' + dst_prefix[:-1] + '.array' + sfx + store[new_key] = store.pop(_src_array_json) + _src_group_json = 'meta/root/' + src_prefix[:-1] + '.group' + sfx + if _src_group_json in store: + new_key = 'meta/root/' + dst_prefix[:-1] + '.group' + sfx + store[new_key] = store.pop(_src_group_json) + + +def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: # assume path already normalized prefix = _path_to_prefix(path) for key in list(store.keys()): @@ -168,3 +366,40 @@ def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str child = suffix.split('/')[0] children.add(child) return sorted(children) + + +def _prefix_to_array_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + if prefix: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + key = "meta/root/" + prefix.rstrip("/") + ".array" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 array key") + else: + key = prefix + array_meta_key + return key + + +def _prefix_to_group_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + if prefix: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + key = "meta/root/" + prefix.rstrip('/') + ".group" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 group key") + else: + key = prefix + group_meta_key + return key + + +def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str: + if getattr(store, "_store_version", 2) == 3: + # for v3, attributes are stored in the array metadata + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + if prefix: + key = "meta/root/" + prefix.rstrip('/') + ".array" + sfx + else: + raise ValueError("prefix must be supplied to get a v3 array key") + else: + key = prefix + attrs_key + return key diff --git a/zarr/meta.py b/zarr/meta.py index c292b09a14..07fbdcb7d4 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,4 +1,6 @@ import base64 +import itertools +import os from collections.abc import Mapping import numpy as np @@ -9,6 +11,37 @@ from typing 
import cast, Union, Any, List, Mapping as MappingType ZARR_FORMAT = 2 +ZARR_FORMAT_v3 = 3 + +FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} + + +_v3_core_type = set( + "".join(d) + for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) +) +_v3_core_type = {"bool", "i1", "u1"} | _v3_core_type + +ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) +ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_DATETIME = int(os.environ.get("ZARR_V3_ALLOW_DATETIME", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_STRUCTURED = int(os.environ.get("ZARR_V3_ALLOW_STRUCTURED", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_OBJECTARRAY = int(os.environ.get("ZARR_V3_ALLOW_OBJECTARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_BYTES_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_BYTES_ARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) +ZARR_V3_ALLOW_UNICODE_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_UNICODE_ARRAY", + not ZARR_V3_CORE_DTYPES_ONLY)) + +_default_entry_point_metadata_v3 = { + 'zarr_format': "https://purl.org/zarr/spec/protocol/core/3.0", + 'metadata_encoding': "https://purl.org/zarr/spec/protocol/core/3.0", + 'metadata_key_suffix': '.json', + "extensions": [], +} class Metadata2: @@ -228,7 +261,207 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return v -# expose class methods for backwards compatibility +class Metadata3(Metadata2): + ZARR_FORMAT = ZARR_FORMAT_v3 + + @classmethod + def decode_dtype(cls, d): + d = cls._decode_dtype_descr(d) + dtype = np.dtype(d) + if dtype.kind == 'c': + if not ZARR_V3_ALLOW_COMPLEX: + raise ValueError("complex-valued arrays not supported") + elif dtype.kind in 'mM': + if not ZARR_V3_ALLOW_DATETIME: + raise ValueError( + "datetime64 and timedelta64 arrays not supported" + ) + elif dtype.kind == 'O': + if not ZARR_V3_ALLOW_OBJECTARRAY: + raise ValueError("object arrays not supported") + elif dtype.kind == 'V': + if not ZARR_V3_ALLOW_STRUCTURED: + raise ValueError("structured arrays not supported") + elif dtype.kind == 'U': + if not ZARR_V3_ALLOW_UNICODE_ARRAY: + raise ValueError("unicode arrays not supported") + elif dtype.kind == 'S': + if not ZARR_V3_ALLOW_BYTES_ARRAY: + raise ValueError("bytes arrays not supported") + else: + assert d in _v3_core_type + return dtype + + @classmethod + def encode_dtype(cls, d): + s = Metadata2.encode_dtype(d) + if s == "|b1": + return "bool" + elif s == "|u1": + return "u1" + elif s == "|i1": + return "i1" + dtype = np.dtype(d) + if dtype.kind == "c": + if not ZARR_V3_ALLOW_COMPLEX: + raise ValueError( + "complex-valued arrays not part of the base v3 spec" + ) + elif dtype.kind in "mM": + if not ZARR_V3_ALLOW_DATETIME: + raise ValueError( + "datetime64 and timedelta64 not part of the base v3 " + "spec" + ) + elif dtype.kind == "O": + if not ZARR_V3_ALLOW_OBJECTARRAY: + raise ValueError( + "object dtypes are not part of the base v3 spec" + ) + elif dtype.kind == "V": + if not ZARR_V3_ALLOW_STRUCTURED: + raise ValueError( + "structured arrays are not part of the base v3 spec" + ) + elif dtype.kind == 'U': + if not ZARR_V3_ALLOW_UNICODE_ARRAY: + raise ValueError("unicode dtypes are not part of the base v3 " + "spec") + elif dtype.kind == 'S': + if not ZARR_V3_ALLOW_BYTES_ARRAY: + raise ValueError("bytes dtypes are not part of the base v3 " + "spec") + else: + assert s in _v3_core_type + return s + + @classmethod + def decode_group_metadata(cls, s: Union[MappingType, str]) -> 
MappingType[str, Any]: + meta = cls.parse_metadata(s) + # 1 / 0 + # # check metadata format version + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != cls.ZARR_FORMAT: + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + + assert 'attributes' in meta + # meta = dict(attributes=meta['attributes']) + return meta + + # return json.loads(s) + + @classmethod + def encode_group_metadata(cls, meta=None) -> bytes: + # The ZARR_FORMAT should not be in the group metadata, but in the + # entry point metadata instead + # meta = dict(zarr_format=cls.ZARR_FORMAT) + if meta is None: + meta = {'attributes': {}} + meta = dict(attributes=meta.get("attributes", {})) + return json_dumps(meta) + + @classmethod + def encode_hierarchy_metadata(cls, meta=None) -> bytes: + if meta is None: + meta = _default_entry_point_metadata_v3 + elif set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metadata. meta={meta}") + return json_dumps(meta) + + @classmethod + def decode_hierarchy_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # check metadata format + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + if set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metdata. meta={meta}") + return meta + + @classmethod + def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # check metadata format + zarr_format = meta.get("zarr_format", None) + if zarr_format != cls.ZARR_FORMAT: + raise MetadataError("unsupported zarr format: %s" % zarr_format) + + # extract array metadata fields + try: + dtype = cls.decode_dtype(meta["data_type"]) + if dtype.hasobject: + import numcodecs + object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + else: + object_codec = None + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) + # TODO: remove dimension_separator? 
+ meta = dict( + zarr_format=meta["zarr_format"], + shape=tuple(meta["shape"]), + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=dtype, + compressor=meta["compressor"], + fill_value=fill_value, + chunk_memory_layout=meta["chunk_memory_layout"], + dimension_separator=meta.get("dimension_separator", "/"), + attributes=meta["attributes"], + ) + # dimension_separator = meta.get("dimension_separator", None) + # if dimension_separator: + # meta["dimension_separator"] = dimension_separator + except Exception as e: + raise MetadataError("error decoding metadata: %s" % e) + else: + return meta + + @classmethod + def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: + dtype = meta["data_type"] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype + dimension_separator = meta.get("dimension_separator") + if dtype.hasobject: + import numcodecs + object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + else: + object_codec = None + meta = dict( + zarr_format=cls.ZARR_FORMAT, + shape=meta["shape"] + sdshape, + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=cls.encode_dtype(dtype), + compressor=meta["compressor"], + fill_value=encode_fill_value(meta["fill_value"], dtype, object_codec), + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta.get("attributes", {}), + ) + if dimension_separator: + meta["dimension_separator"] = dimension_separator + return json_dumps(meta) + + parse_metadata = Metadata2.parse_metadata decode_array_metadata = Metadata2.decode_array_metadata encode_array_metadata = Metadata2.encode_array_metadata diff --git a/zarr/storage.py b/zarr/storage.py index 7170eeaf23..00ca4591b4 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -57,15 +57,19 @@ normalize_shape, normalize_storage_path, retry_call) from zarr._storage.absstore import ABSStore # noqa: F401 -from zarr._storage.store import (_listdir_from_keys, - _path_to_prefix, +from zarr._storage.store import (_get_hierarchy_metadata, + _listdir_from_keys, _rename_from_keys, _rmdir_from_keys, + _path_to_prefix, + _prefix_to_array_key, + _prefix_to_group_key, array_meta_key, group_meta_key, attrs_key, BaseStore, - Store) + Store, + StoreV3) __doctest_requires__ = { ('RedisStore', 'RedisStore.*'): ['redis'], @@ -92,40 +96,95 @@ def contains_array(store: StoreLike, path: Path = None) -> bool: """Return True if the store contains an array at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + array_meta_key + key = _prefix_to_array_key(store, prefix) return key in store -def contains_group(store: StoreLike, path: Path = None) -> bool: +def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> bool: """Return True if the store contains a group at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + group_meta_key - return key in store + key = _prefix_to_group_key(store, prefix) + store_version = getattr(store, '_store_version', 2) + if store_version == 2 or explicit_only: + return key in store + else: + if key in store: + return True + # for v3, need to also handle implicit groups + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + implicit_prefix = key.replace('.group' + sfx, '') 
+ if not implicit_prefix.endswith('/'): + implicit_prefix += '/' + if store.list_prefix(implicit_prefix): # type: ignore + return True + return False -def normalize_store_arg(store: Any, clobber=False, storage_options=None, mode="w") -> BaseStore: +def normalize_store_arg(store, clobber=False, storage_options=None, mode="w", + *, zarr_version=None) -> Store: + if zarr_version is None: + # default to v2 store for backward compatibility + zarr_version = getattr(store, '_store_version', 2) + if zarr_version not in [2, 3]: + raise ValueError("zarr_version must be 2 or 3") if store is None: - return BaseStore._ensure_store(dict()) - elif isinstance(store, os.PathLike): + if zarr_version == 2: + store = KVStore(dict()) + else: + store = KVStoreV3(dict()) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store + if isinstance(store, os.PathLike): store = os.fspath(store) if isinstance(store, str): mode = mode if clobber else "r" - if "://" in store or "::" in store: - return FSStore(store, mode=mode, **(storage_options or {})) - elif storage_options: - raise ValueError("storage_options passed with non-fsspec path") - if store.endswith('.zip'): - return ZipStore(store, mode=mode) - elif store.endswith('.n5'): - from zarr.n5 import N5Store - return N5Store(store) - else: - return DirectoryStore(store) - else: - if not isinstance(store, BaseStore) and isinstance(store, MutableMapping): - store = BaseStore._ensure_store(store) - return store + if zarr_version == 2: + if "://" in store or "::" in store: + return FSStore(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + if store.endswith('.zip'): + return ZipStore(store, mode=mode) + elif store.endswith('.n5'): + from zarr.n5 import N5Store + return N5Store(store) + else: + return DirectoryStore(store) + elif zarr_version == 3: + if "://" in store or "::" in store: + store = FSStoreV3(store, mode=mode, **(storage_options or {})) + elif storage_options: + store = ValueError("storage_options passed with non-fsspec path") + if store.endswith('.zip'): + store = ZipStoreV3(store, mode=mode) + elif store.endswith('.n5'): + raise NotImplementedError("N5Store not yet implemented for V3") + # return N5StoreV3(store) + else: + store = DirectoryStoreV3(store) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store + elif zarr_version == 2: + store = Store._ensure_store(store) + if getattr(store, '_store_version', 2) != 2: + raise ValueError( + "provided store does not match the specified zarr version.") + # if not isinstance(store, Store) and isinstance(store, MutableMapping): + # store = KVStore(store) + elif zarr_version == 3: + store = StoreV3._ensure_store(store) + if getattr(store, '_store_version', 2) != 3: + raise ValueError( + "provided store does not match the specified zarr version.") + # if not isinstance(store, StoreV3) and isinstance(store, MutableMapping): + # store = KVStoreV3(store) + if 'zarr.json' not in store: + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store def rmdir(store: StoreLike, path: Path = None): @@ -133,15 +192,36 @@ def rmdir(store: StoreLike, path: Path = None): this will be called, otherwise will fall back to implementation via the `Store` interface.""" path = normalize_storage_path(path) - if hasattr(store, "rmdir") and 
store.is_erasable(): # type: ignore - # pass through - store.rmdir(path) # type: ignore + if getattr(store, '_store_version', 2) == 2: + if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore + # pass through + store.rmdir(path) # type: ignore + else: + # slow version, delete one key at a time + _rmdir_from_keys(store, path) else: - # slow version, delete one key at a time - _rmdir_from_keys(store, path) + # TODO: check behavior for v3 and fix in the Store class, deferring to + # those by default + + # remove metadata folder + meta_dir = 'meta/root/' + path + _rmdir_from_keys(store, meta_dir) + + # remove data folder + data_dir = 'data/root/' + path + _rmdir_from_keys(store, data_dir) + # remove metadata files + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + array_meta_file = meta_dir + '.array' + sfx + if array_meta_file in store: + store.erase(array_meta_file) # type: ignore + group_meta_file = meta_dir + '.group' + sfx + if group_meta_file in store: + store.erase(group_meta_file) # type: ignore -def rename(store: BaseStore, src_path: Path, dst_path: Path): + +def rename(store: Store, src_path: Path, dst_path: Path): """Rename all items under the given path. If `store` provides a `rename` method, this will be called, otherwise will fall back to implementation via the `Store` interface.""" @@ -163,6 +243,27 @@ def listdir(store: BaseStore, path: Path = None): if hasattr(store, 'listdir'): # pass through return store.listdir(path) # type: ignore + elif getattr(store, "_store_version", None) == 3: + meta_prefix = 'meta/root/' + dir_path = meta_prefix + path + path_start = len(meta_prefix) + meta_keys = [] + include_meta_keys = False + if include_meta_keys: + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + group_meta_key = dir_path + '.group' + sfx + if group_meta_key in store: + meta_keys.append(group_meta_key[path_start:]) + array_meta_key = dir_path + '.array' + sfx + if array_meta_key in store: + meta_keys.append(array_meta_key[path_start:]) + if not dir_path.endswith('/'): + dir_path += '/' + keys, prefixes = store.list_dir(dir_path) # type: ignore + keys = [k[path_start:] for k in keys] + prefixes = [p[path_start:] for p in prefixes] + return meta_keys + keys + prefixes + else: # slow version, iterate through all keys warnings.warn( @@ -173,33 +274,45 @@ def listdir(store: BaseStore, path: Path = None): return _listdir_from_keys(store, path) +def _getsize(store: BaseStore, path: Path = None) -> int: + # compute from size of values + if path and path in store: + v = store[path] + size = buffer_size(v) + else: + path = '' if path is None else normalize_storage_path(path) + size = 0 + store_version = getattr(store, '_store_version', 2) + if store_version == 3: + members = store.list_prefix('data/root/' + path) # type: ignore + members += store.list_prefix('meta/root/' + path) # type: ignore + # members += ['zarr.json'] + else: + members = listdir(store, path) + prefix = _path_to_prefix(path) + members = [prefix + k for k in members] + for k in members: + try: + v = store[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + def getsize(store: BaseStore, path: Path = None) -> int: """Compute size of stored items for a given path. 
If `store` provides a `getsize` method, this will be called, otherwise will return -1.""" - path = normalize_storage_path(path) if hasattr(store, 'getsize'): # pass through + path = normalize_storage_path(path) return store.getsize(path) # type: ignore elif isinstance(store, MutableMapping): - # compute from size of values - if path in store: - v = store[path] - size = buffer_size(v) - else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - size = 0 - for k in members: - try: - v = store[prefix + k] - except KeyError: - pass - else: - try: - size += buffer_size(v) - except TypeError: - return -1 - return size + return _getsize(store, path) else: return -1 @@ -346,7 +459,14 @@ def init_array( path = normalize_storage_path(path) # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) + store_version = getattr(store, "_store_version", 2) + if store_version < 3: + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, @@ -372,16 +492,50 @@ def _init_array_metadata( dimension_separator=None, ): + store_version = getattr(store, '_store_version', 2) + + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing array in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = 'data/root/' + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if '/' in path: + # path is a subfolder of an existing array, remove that array + parent_path = '/'.join(path.split('/')[:-1]) + sfx = _get_hierarchy_metadata(store)['metadata_key_suffix'] + array_key = 'meta/root/' + parent_path + '.array' + sfx + if array_key in store: + store.erase(array_key) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path, explicit_only=False): + raise ContainsGroupError(path) + elif store_version == 3: + if '/' in path: + # cannot create an array within an existing array path + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # normalize metadata dtype, object_codec = normalize_dtype(dtype, object_codec) @@ -392,7 +546,7 @@ def _init_array_metadata( fill_value = normalize_fill_value(fill_value, dtype) # optional array metadata - if dimension_separator is None: + if dimension_separator is None and store_version == 
2: dimension_separator = getattr(store, "_dimension_separator", None) dimension_separator = normalize_dimension_separator(dimension_separator) @@ -416,6 +570,8 @@ def _init_array_metadata( # obtain filters config if filters: + # TODO: filters was removed from the metadata in v3 + # raise error here if store_version > 2? filters_config = [f.get_config() for f in filters] else: filters_config = [] @@ -441,11 +597,30 @@ def _init_array_metadata( filters_config = None # type: ignore # initialize metadata - meta = dict(shape=shape, chunks=chunks, dtype=dtype, - compressor=compressor_config, fill_value=fill_value, - order=order, filters=filters_config, + # TODO: don't store redundant dimension_separator for v3? + meta = dict(shape=shape, compressor=compressor_config, + fill_value=fill_value, dimension_separator=dimension_separator) - key = _path_to_prefix(path) + array_meta_key + if store_version < 3: + meta.update(dict(chunks=chunks, dtype=dtype, order=order, + filters=filters_config)) + else: + if dimension_separator is None: + dimension_separator = "/" + if filters_config: + attributes = {'filters': filters_config} + else: + attributes = {} + meta.update( + dict(chunk_grid=dict(type="regular", + chunk_shape=chunks, + separator=dimension_separator), + chunk_memory_layout=order, + data_type=dtype, + attributes=attributes) + ) + + key = _prefix_to_array_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_array_metadata(meta) # type: ignore else: @@ -482,14 +657,26 @@ def init_group( # normalize path path = normalize_storage_path(path) - # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, - overwrite=overwrite) + store_version = getattr(store, '_store_version', 2) + if store_version < 3: + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore # initialise metadata _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) + if store_version == 3: + # TODO: Should initializing a v3 group also create a corresponding + # empty folder under data/root/? I think probably not until there + # is actual data written there. 
+ pass + def _init_group_metadata( store: StoreLike, @@ -498,22 +685,51 @@ def _init_group_metadata( chunk_store: StoreLike = None, ): + store_version = getattr(store, '_store_version', 2) + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = 'data/root/' + _path_to_prefix(path) + meta_prefix = 'meta/root/' + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + store.erase_prefix(meta_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path): + raise ContainsGroupError(path) + elif store_version == 3 and '/' in path: + # cannot create a group overlapping with an existing array name + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = dict() # type: ignore - key = _path_to_prefix(path) + group_meta_key + if store_version == 3: + meta = {'attributes': {}} # type: ignore + else: + meta = {} # type: ignore + key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_group_metadata(meta) # type: ignore else: @@ -1139,14 +1355,17 @@ def __init__(self, url, normalize_keys=False, key_separator=None, dimension_separator = key_separator self.key_separator = dimension_separator - if self.key_separator is None: - self.key_separator = "." + self._default_key_separator() # Pass attributes to array creation self._dimension_separator = dimension_separator if self.fs.exists(self.path) and not self.fs.isdir(self.path): raise FSPathExistNotDir(url) + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "." 
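For orientation, a minimal sketch (not taken from this diff) of the metadata key mapping that the _prefix_to_array_key / _prefix_to_group_key helpers introduced above produce, assuming the default '.json' metadata_key_suffix from _default_entry_point_metadata_v3:

    from zarr._storage.store import _prefix_to_array_key, _prefix_to_group_key
    from zarr.storage import KVStore, KVStoreV3

    v2_store = KVStore(dict())
    v3_store = KVStoreV3(dict())

    # v2 keeps metadata alongside the chunks under the array/group prefix
    assert _prefix_to_array_key(v2_store, 'foo/bar/') == 'foo/bar/.zarray'
    # v3 collects all metadata under meta/root/, with the suffix read from zarr.json
    assert _prefix_to_array_key(v3_store, 'foo/bar/') == 'meta/root/foo/bar.array.json'
    assert _prefix_to_group_key(v3_store, 'foo/bar/') == 'meta/root/foo/bar.group.json'
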
+ def _normalize_key(self, key): key = normalize_storage_path(key).lstrip('/') if key: @@ -2647,6 +2866,10 @@ class ConsolidatedMetadataStore(Store): def __init__(self, store: StoreLike, metadata_key=".zmetadata"): self.store = Store._ensure_store(store) + if getattr(store, '_store_version', 2) != 2: + raise ValueError("Can only consolidate stores corresponding to " + "the Zarr v2 spec.") + # retrieve consolidated metadata meta = json_loads(store[metadata_key]) @@ -2682,3 +2905,351 @@ def getsize(self, path): def listdir(self, path): return listdir(self.meta_store, path) + + +""" versions of stores following the v3 protocol """ + + +def _get_files_and_dirs_from_path(store, path): + path = normalize_storage_path(path) + + files = [] + # add array metadata file if present + array_key = _prefix_to_array_key(store, path) + if array_key in store: + files.append(os.path.join(store.path, array_key)) + + # add group metadata file if present + group_key = _prefix_to_group_key(store, path) + if group_key in store: + files.append(os.path.join(store.path, group_key)) + + dirs = [] + # add array and group folders if present + for d in ['data/root/' + path, 'meta/root/' + path]: + dir_path = os.path.join(store.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + return files, dirs + + +class KVStoreV3(KVStore, StoreV3): + + def list(self): + return list(self._mutable_mapping.keys()) + + +KVStoreV3.__doc__ = KVStore.__doc__ + + +class FSStoreV3(FSStore, StoreV3): + + # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) + _META_KEYS = () + + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "/" + + def list(self): + return list(self.keys()) + + def _normalize_key(self, key): + key = normalize_storage_path(key).lstrip('/') + return key.lower() if self.normalize_keys else key + + def getsize(self, path=None): + size = 0 + if path is None or path == '': + # size of both the data and meta subdirs + dirs = [] + for d in ['data/root', 'meta/root']: + dir_path = os.path.join(self.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + else: + files, dirs = _get_files_and_dirs_from_path(self, path) + for file in files: + size += os.path.getsize(file) + for d in dirs: + size += self.fs.du(d, total=True, maxdepth=None) + return size + + def setitems(self, values): + if self.mode == 'r': + raise ReadOnlyError() + values = {self._normalize_key(key): val for key, val in values.items()} + + # initialize the /data/root/... folder corresponding to the array! + # Note: zarr.tests.test_core_v3.TestArrayWithFSStoreV3PartialRead fails + # without this explicit creation of directories + subdirectories = set([os.path.dirname(v) for v in values.keys()]) + for subdirectory in subdirectories: + data_dir = os.path.join(self.path, subdirectory) + if not self.fs.exists(data_dir): + self.fs.mkdir(data_dir) + + self.map.setitems(values) + + +class MemoryStoreV3(MemoryStore, StoreV3): + + def __init__(self, root=None, cls=dict, dimension_separator=None): + if root is None: + self.root = cls() + else: + self.root = root + self.cls = cls + self.write_mutex = Lock() + self._dimension_separator = dimension_separator # TODO: modify for v3? 
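A rough usage sketch of the new in-memory v3 store (illustrative only; the expected keys mirror the assertions added in test_storage_v3.py below):

    from zarr.storage import MemoryStoreV3, init_array

    store = MemoryStoreV3()
    # v3 arrays have no default location, so a path must be given explicitly
    init_array(store, path='arr1', shape=(1000,), chunks=(100,), dtype='i4')

    assert 'zarr.json' in store                     # entry point metadata
    assert 'meta/root/arr1.array.json' in store     # array metadata
    # chunk data written later ends up under 'data/root/arr1/...'
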
+ + def __eq__(self, other): + return ( + isinstance(other, MemoryStoreV3) and + self.root == other.root and + self.cls == other.cls + ) + + def list(self): + return list(self.keys()) + + def getsize(self, path: Path = None): + size = 0 + path = normalize_storage_path(path) + members = self.list_prefix('data/root/' + path) + members += self.list_prefix('meta/root/' + path) + for k in members: + try: + v = self[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + +MemoryStoreV3.__doc__ = MemoryStore.__doc__ + + +class DirectoryStoreV3(DirectoryStore, StoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, DirectoryStoreV3) and + self.path == other.path + ) + + # def getsize(self, path=None): + # size = 0 + # if path is None or path == '': + # # add array and group folders if present + # dirs = [] + # for d in ['data/root', 'meta/root']: + # dir_path = os.path.join(self.path, d) + # if os.path.exists(dir_path): + # dirs.append(dir_path) + # print(f"dirs={dirs}") + # else: + # files, dirs = _get_files_and_dirs_from_path(self, path) + # for file in files: + # size += os.path.getsize(file) + # for d in dirs: + # for child in scandir(d): + # print(f"child={child}") + # if child.is_file(): + # size += child.stat().st_size + # return size + + def getsize(self, path: Path = None): + size = 0 + path = normalize_storage_path(path) + members = self.list_prefix('data/root/' + path) + members += self.list_prefix('meta/root/' + path) + for k in members: + try: + v = self[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + def rename(self, src_path, dst_path, metadata_key_suffix='.json'): + store_src_path = normalize_storage_path(src_path) + store_dst_path = normalize_storage_path(dst_path) + + dir_path = self.path + any_existed = False + for root_prefix in ['meta', 'data']: + src_path = os.path.join(dir_path, root_prefix, 'root', store_src_path) + if os.path.exists(src_path): + any_existed = True + dst_path = os.path.join(dir_path, root_prefix, 'root', store_dst_path) + os.renames(src_path, dst_path) + + for suffix in ['.array' + metadata_key_suffix, + '.group' + metadata_key_suffix]: + src_meta = os.path.join(dir_path, 'meta', 'root', store_src_path + suffix) + if os.path.exists(src_meta): + any_existed = True + dst_meta = os.path.join(dir_path, 'meta', 'root', store_dst_path + suffix) + dst_dir = os.path.dirname(dst_meta) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + os.rename(src_meta, dst_meta) + if not any_existed: + raise FileNotFoundError("nothing found at src_path") + + +DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ + + +class ZipStoreV3(ZipStore, StoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, ZipStore) and + self.path == other.path and + self.compression == other.compression and + self.allowZip64 == other.allowZip64 + ) + + def getsize(self, path=None): + path = normalize_storage_path(path) + with self.mutex: + children = self.list_prefix('data/root/' + path) + children += self.list_prefix('meta/root/' + path) + if children: + size = 0 + for name in children: + try: + info = self.zf.getinfo(name) + except KeyError: + pass + else: + size += info.compress_size + return size + elif path: + try: + info = self.zf.getinfo(path) + return info.compress_size + except KeyError: + return 0 + else: + return 0 + + 
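The v3 getsize implementations above all lean on the generic StoreV3.list_prefix and list_dir helpers; a small sketch of their semantics on a plain KVStoreV3 (illustrative, not from the diff):

    from zarr.storage import KVStoreV3

    store = KVStoreV3({
        'zarr.json': b'{}',
        'meta/root/arr1.array.json': b'{}',
        'data/root/arr1/0': b'x',
        'data/root/arr1/1': b'y',
    })

    # list_prefix returns every key under the given prefix
    assert sorted(store.list_prefix('data/root/arr1/')) == ['data/root/arr1/0', 'data/root/arr1/1']

    # list_dir splits the listing into immediate keys and child prefixes
    keys, prefixes = store.list_dir('data/root/')
    assert keys == [] and prefixes == ['data/root/arr1/']
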
+ZipStoreV3.__doc__ = ZipStore.__doc__ + + +class NestedDirectoryStoreV3(NestedDirectoryStore, DirectoryStoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, NestedDirectoryStoreV3) and + self.path == other.path + ) + + +NestedDirectoryStoreV3.__doc__ = NestedDirectoryStore.__doc__ + + +class RedisStoreV3(RedisStore, StoreV3): + + def list(self): + return list(self.keys()) + + +RedisStoreV3.__doc__ = RedisStore.__doc__ + + +class MongoDBStoreV3(MongoDBStore, StoreV3): + + def list(self): + return list(self.keys()) + + +MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ + + +class DBMStoreV3(DBMStore, StoreV3): + + def list(self): + return list(self.keys()) + + +DBMStoreV3.__doc__ = DBMStore.__doc__ + + +class LMDBStoreV3(LMDBStore, StoreV3): + + def list(self): + return list(self.keys()) + + +LMDBStoreV3.__doc__ = LMDBStore.__doc__ + + +class SQLiteStoreV3(SQLiteStore, StoreV3): + + def list(self): + return list(self.keys()) + + def getsize(self, path=None): + if path is None or path == '': + # TODO: why does the query below not work in this case? + # For now fall back to the default _getsize implementation + return _getsize(self, path) + else: + path = normalize_storage_path(path) + size = 0 + for _path in ['data/root/' + path, 'meta/root/' + path]: + c = self.cursor.execute( + ''' + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + ''', + (_path, _path) + ) + for item_size, in c: + size += item_size + return size + + +SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ + + +class LRUStoreCacheV3(LRUStoreCache, StoreV3): + + def __init__(self, store, max_size: int): + self._store = StoreV3._ensure_store(store) + self._max_size = max_size + self._current_size = 0 + self._keys_cache = None + self._contains_cache = None + self._listdir_cache: Dict[Path, Any] = dict() + self._values_cache: Dict[Path, Any] = OrderedDict() + self._mutex = Lock() + self.hits = self.misses = 0 + + def list(self): + return list(self.keys()) + + +LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 3438e60691..0865917926 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -18,9 +18,9 @@ from numcodecs.compat import ensure_bytes from zarr.codecs import BZ2, AsType, Blosc, Zlib -from zarr.errors import MetadataError +from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataError from zarr.hierarchy import group -from zarr.meta import ZARR_FORMAT, decode_array_metadata +from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3, decode_array_metadata from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, ConsolidatedMetadataStore, DBMStore, DictStore, DirectoryStore, KVStore, LMDBStore, @@ -31,7 +31,12 @@ attrs_key, default_compressor, getsize, group_meta_key, init_array, init_group, migrate_1to2) from zarr.storage import FSStore, rename, listdir +from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, + DirectoryStoreV3, NestedDirectoryStoreV3, + RedisStoreV3, MongoDBStoreV3, DBMStoreV3, + LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) from zarr.tests.util import CountingDict, have_fsspec, skip_test_env_var, abs_container +from zarr.tests.util import CountingDictV3 @contextmanager @@ -48,6 +53,15 @@ def dimension_separator_fixture(request): return request.param +@pytest.fixture(params=[ + (None, "/"), + (".", "."), + ("/", "/"), +]) +def 
dimension_separator_fixture_v3(request): + return request.param + + def skip_if_nested_chunks(**kwargs): if kwargs.get("dimension_separator") == "/": pytest.skip("nested chunks are unsupported") diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py new file mode 100644 index 0000000000..9118bc513c --- /dev/null +++ b/zarr/tests/test_storage_v3.py @@ -0,0 +1,847 @@ +import array +import atexit +import os +import tempfile + +import numpy as np +import pytest + +from numcodecs.compat import ensure_bytes + +from zarr.codecs import Zlib +from zarr.errors import ContainsArrayError, ContainsGroupError +from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3 +from zarr.storage import (array_meta_key, atexit_rmglob, atexit_rmtree, + default_compressor, getsize, init_array, init_group) +from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, + DirectoryStoreV3, NestedDirectoryStoreV3, + RedisStoreV3, MongoDBStoreV3, DBMStoreV3, + LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) +from zarr.tests.util import CountingDictV3, have_fsspec, skip_test_env_var + +from .test_storage import (StoreTests, TestMemoryStore, TestDirectoryStore, + TestFSStore, TestNestedDirectoryStore, TestZipStore, + TestDBMStore, TestDBMStoreDumb, TestDBMStoreGnu, + TestDBMStoreNDBM, TestDBMStoreBerkeleyDB, + TestLMDBStore, TestSQLiteStore, + TestSQLiteStoreInMemory, TestLRUStoreCache, + dimension_separator_fixture, s3, + skip_if_nested_chunks) + + +@pytest.fixture(params=[ + (None, "/"), + (".", "."), + ("/", "/"), +]) +def dimension_separator_fixture_v3(request): + return request.param + + +class StoreV3Tests(StoreTests): + + def test_getsize(self): + # TODO: determine proper getsize() behavior for v3 + + # Currently returns the combined size of entries under + # meta/root/path and data/root/path. + # Any path not under meta/root/ or data/root/ (including zarr.json) + # returns size 0. + + store = self.create_store() + if isinstance(store, dict) or hasattr(store, 'getsize'): + assert 0 == getsize(store, 'zarr.json') + store['meta/root/foo/a'] = b'x' + assert 1 == getsize(store) + assert 1 == getsize(store, 'foo') + store['meta/root/foo/b'] = b'x' + assert 2 == getsize(store, 'foo') + assert 1 == getsize(store, 'foo/b') + store['meta/root/bar/a'] = b'yy' + assert 2 == getsize(store, 'bar') + store['data/root/bar/a'] = b'zzz' + assert 5 == getsize(store, 'bar') + store['data/root/baz/a'] = b'zzz' + assert 3 == getsize(store, 'baz') + assert 10 == getsize(store) + store['data/root/quux'] = array.array('B', b'zzzz') + assert 14 == getsize(store) + assert 4 == getsize(store, 'quux') + store['data/root/spong'] = np.frombuffer(b'zzzzz', dtype='u1') + assert 19 == getsize(store) + assert 5 == getsize(store, 'spong') + + store.close() + + # noinspection PyStatementEffect + def test_hierarchy(self): + pytest.skip("TODO: adapt v2 test_hierarchy tests to v3") + + def test_init_array(self, dimension_separator_fixture_v3): + + pass_dim_sep, want_dim_sep = dimension_separator_fixture_v3 + + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100, + dimension_separator=pass_dim_sep) + + # check metadata + mkey = 'meta/root/' + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + # TODO: zarr_format already stored at the heirarchy level should we + # also keep it in the .array.json? 
+ assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert default_compressor.get_config() == meta['compressor'] + assert meta['fill_value'] is None + # Missing MUST be assumed to be "/" + assert meta.get('dimension_separator', "/") is want_dim_sep + assert meta['chunk_grid']['separator'] is want_dim_sep + store.close() + + def _test_init_array_overwrite(self, order): + # setup + store = self.create_store() + + if store._store_version < 3: + path = None + mkey = array_meta_key + else: + path = 'arr1' # no default, have to specify for v3 + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata( + dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=Zlib(1).get_config(), + fill_value=0, + chunk_memory_layout=order, + filters=None) + ) + + # don't overwrite (default) + with pytest.raises(ContainsArrayError): + init_array(store, path=path, shape=1000, chunks=100) + + # do overwrite + try: + init_array(store, path=path, shape=1000, chunks=100, + dtype='i4', overwrite=True) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata( + store[mkey] + ) + assert (1000,) == meta['shape'] + if store._store_version == 2: + assert ZARR_FORMAT == meta['zarr_format'] + assert (100,) == meta['chunks'] + assert np.dtype('i4') == meta['dtype'] + elif store._store_version == 3: + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + else: + raise ValueError( + "unexpected store version: {store._store_version}" + ) + store.close() + + def test_init_array_path(self): + path = 'foo/bar' + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + mkey = 'meta/root/' + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert default_compressor.get_config() == meta['compressor'] + assert meta['fill_value'] is None + + store.close() + + def _test_init_array_overwrite_path(self, order): + # setup + path = 'foo/bar' + store = self.create_store() + meta = dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=Zlib(1).get_config(), + fill_value=0, + chunk_memory_layout=order, + filters=None) + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ContainsArrayError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype='i4', path=path, + overwrite=True) + except NotImplementedError: + pass + else: + assert mkey in store + # should have been overwritten + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + + store.close() + + def test_init_array_overwrite_group(self): + # setup + path = 
'foo/bar' + store = self.create_store() + array_key = 'meta/root/' + path + '.array.json' + group_key = 'meta/root/' + path + '.group.json' + store[group_key] = store._metadata_class.encode_group_metadata() + + with pytest.raises(ContainsGroupError): + init_array(store, shape=1000, chunks=100, path=path) + + # do overwrite + try: + init_array(store, shape=1000, chunks=100, dtype='i4', path=path, + overwrite=True) + except NotImplementedError: + pass + else: + assert group_key not in store + assert array_key in store + meta = store._metadata_class.decode_array_metadata( + store[array_key] + ) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + + store.close() + + def _test_init_array_overwrite_chunk_store(self, order): + # setup + store = self.create_store() + chunk_store = self.create_store() + path = 'arr1' + mkey = 'meta/root/' + path + '.array.json' + store[mkey] = store._metadata_class.encode_array_metadata( + dict(shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order) + ) + + chunk_store['data/root/arr1/0'] = b'aaa' + chunk_store['data/root/arr1/1'] = b'bbb' + + assert 'data/root/arr1/0' in chunk_store + assert 'data/root/arr1/1' in chunk_store + + # don't overwrite (default) + with pytest.raises(ValueError): + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + + # do overwrite + try: + init_array(store, path=path, shape=1000, chunks=100, dtype='i4', + overwrite=True, chunk_store=chunk_store) + except NotImplementedError: + pass + else: + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype('i4') == meta['data_type'] + assert 'data/root/arr1/0' not in chunk_store + assert 'data/root/arr1/1' not in chunk_store + + store.close() + chunk_store.close() + + def test_init_array_compat(self): + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100, compressor='none') + mkey = 'meta/root/' + path + '.array.json' + meta = store._metadata_class.decode_array_metadata( + store[mkey] + ) + assert meta['compressor'] is None + + store.close() + + def test_init_group(self): + store = self.create_store() + path = "meta/root/foo" + init_group(store, path=path) + + # check metadata + mkey = 'meta/root/' + path + '.group.json' + assert mkey in store + meta = store._metadata_class.decode_group_metadata(store[mkey]) + assert meta == {'attributes': {}} + + store.close() + + def _test_init_group_overwrite(self, order): + pytest.skip( + "In v3 array and group names cannot overlap" + ) + + def _test_init_group_overwrite_path(self, order): + # setup + path = 'foo/bar' + store = self.create_store() + meta = dict( + shape=(2000,), + chunk_grid=dict(type='regular', + chunk_shape=(200,), + separator=('/')), + data_type=np.dtype('u1'), + compressor=None, + fill_value=0, + filters=None, + chunk_memory_layout=order, + ) + array_key = 'meta/root/' + path + '.array.json' + group_key = 'meta/root/' + path + '.group.json' + store[array_key] = store._metadata_class.encode_array_metadata(meta) + + # don't overwrite + with pytest.raises(ContainsArrayError): + init_group(store, path=path) + + # do 
overwrite + try: + init_group(store, overwrite=True, path=path) + except NotImplementedError: + pass + else: + assert array_key not in store + assert group_key in store + # should have been overwritten + meta = store._metadata_class.decode_group_metadata(store[group_key]) + # assert ZARR_FORMAT == meta['zarr_format'] + assert meta == {'attributes': {}} + + store.close() + + def _test_init_group_overwrite_chunk_store(self, order): + pytest.skip( + "In v3 array and group names cannot overlap" + ) + + +class TestMappingStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + return KVStoreV3(dict()) + + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + + +class TestMemoryStoreV3(TestMemoryStore, StoreV3Tests): + + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return MemoryStoreV3(**kwargs) + + +class TestDirectoryStoreV3(TestDirectoryStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, **kwargs): + # For v3, don't have to skip if nested. + # skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3(TestFSStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, + dimension_separator=".", + path=None, + **kwargs): + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = FSStoreV3( + path, + normalize_keys=normalize_keys, + dimension_separator=dimension_separator, + **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + array_meta_key = 'meta/root/' + path + '.array.json' + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert meta['chunk_grid']['separator'] == "/" + + # TODO: remove this skip once v3 support is added to hierarchy.Group + @pytest.mark.skipif(True, reason="need v3 support in zarr.hierarchy.Group") + def test_deep_ndim(self): + import zarr + + store = self.create_store() + foo = zarr.open_group(store=store, path='group1') + bar = foo.create_group("bar") + baz = bar.create_dataset("baz", + shape=(4, 4, 4), + chunks=(2, 2, 2), + dtype="i8") + baz[:] = 1 + assert set(store.listdir()) == set(["data", "meta", "zarr.json"]) + assert set(store.listdir("meta/root/group1")) == set(["bar", "bar.group.json"]) + assert set(store.listdir("data/root/group1")) == set(["bar"]) + assert foo["bar"]["baz"][(0, 0, 0)] == 1 + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3WithKeySeparator(StoreV3Tests): + + def create_store(self, normalize_keys=False, key_separator=".", **kwargs): + + # Since the user is passing key_separator, that will take priority. + skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + return FSStoreV3( + path, + normalize_keys=normalize_keys, + key_separator=key_separator) + + +# TODO: remove NestedDirectoryStoreV3? 
+class TestNestedDirectoryStoreV3(TestNestedDirectoryStore, + TestDirectoryStoreV3): + + def create_store(self, normalize_keys=False, **kwargs): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + # assert store._dimension_separator == "/" + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + array_meta_key = 'meta/root/' + path + '.array.json' + assert array_meta_key in store + meta = store._metadata_class.decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT_v3 == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + # assert meta['dimension_separator'] == "/" + assert meta['chunk_grid']['separator'] == "/" + +# TODO: enable once N5StoreV3 has been implemented +# @pytest.mark.skipif(True, reason="N5StoreV3 not yet fully implemented") +# class TestN5StoreV3(TestN5Store, TestNestedDirectoryStoreV3, StoreV3Tests): + + +class TestZipStoreV3(TestZipStore, StoreV3Tests): + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStoreV3(path, mode='w', **kwargs) + return store + + def test_mode(self): + with ZipStoreV3('data/store.zip', mode='w') as store: + store['foo'] = b'bar' + store = ZipStoreV3('data/store.zip', mode='r') + with pytest.raises(PermissionError): + store['foo'] = b'bar' + with pytest.raises(PermissionError): + store.clear() + + +class TestDBMStoreV3(TestDBMStore, StoreV3Tests): + + def create_store(self, dimension_separator=None): + path = tempfile.mktemp(suffix='.anydbm') + atexit.register(atexit_rmglob, path + '*') + # create store using default dbm implementation + store = DBMStoreV3(path, flag='n', dimension_separator=dimension_separator) + return store + + +class TestDBMStoreV3Dumb(TestDBMStoreDumb, StoreV3Tests): + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.dumbdbm') + atexit.register(atexit_rmglob, path + '*') + + import dbm.dumb as dumbdbm + store = DBMStoreV3(path, flag='n', open=dumbdbm.open, **kwargs) + return store + + +class TestDBMStoreV3Gnu(TestDBMStoreGnu, StoreV3Tests): + + def create_store(self, **kwargs): + gdbm = pytest.importorskip("dbm.gnu") + path = tempfile.mktemp(suffix=".gdbm") # pragma: no cover + atexit.register(os.remove, path) # pragma: no cover + store = DBMStoreV3( + path, flag="n", open=gdbm.open, write_lock=False, **kwargs + ) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3NDBM(TestDBMStoreNDBM, StoreV3Tests): + + def create_store(self, **kwargs): + ndbm = pytest.importorskip("dbm.ndbm") + path = tempfile.mktemp(suffix=".ndbm") # pragma: no cover + atexit.register(atexit_rmglob, path + "*") # pragma: no cover + store = DBMStoreV3(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3BerkeleyDB(TestDBMStoreBerkeleyDB, StoreV3Tests): + + def create_store(self, **kwargs): + bsddb3 = pytest.importorskip("bsddb3") + path = tempfile.mktemp(suffix='.dbm') + atexit.register(os.remove, path) + store = DBMStoreV3(path, flag='n', open=bsddb3.btopen, write_lock=False, **kwargs) + return store + + +class TestLMDBStoreV3(TestLMDBStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("lmdb") + path = 
tempfile.mktemp(suffix='.lmdb') + atexit.register(atexit_rmtree, path) + buffers = True + store = LMDBStoreV3(path, buffers=buffers, **kwargs) + return store + + +class TestSQLiteStoreV3(TestSQLiteStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path, **kwargs) + return store + + +class TestSQLiteStoreV3InMemory(TestSQLiteStoreInMemory, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + store = SQLiteStoreV3(':memory:', **kwargs) + return store + + +@skip_test_env_var("ZARR_TEST_MONGO") +class TestMongoDBStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("pymongo") + store = MongoDBStoreV3(host='127.0.0.1', database='zarr_tests', + collection='zarr_tests', **kwargs) + # start with an empty store + store.clear() + return store + + +@skip_test_env_var("ZARR_TEST_REDIS") +class TestRedisStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + # TODO: this is the default host for Redis on Travis, + # we probably want to generalize this though + pytest.importorskip("redis") + store = RedisStoreV3(host='localhost', port=6379, **kwargs) + # start with an empty store + store.clear() + return store + + +class TestLRUStoreCacheV3(TestLRUStoreCache, StoreV3Tests): + + def create_store(self, **kwargs): + # wrapper therefore no dimension_separator argument + skip_if_nested_chunks(**kwargs) + return LRUStoreCacheV3(dict(), max_size=2**27) + + def test_cache_values_no_max_size(self): + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + assert 1 == store.counter['__setitem__', 'bar'] + + # setup cache + cache = LRUStoreCacheV3(store, max_size=None) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == store.counter['__setitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test __setitem__, __getitem__ + cache['foo'] = b'zzz' + assert 1 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + # should be a cache hit + assert b'zzz' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + assert 2 == cache.hits + assert 1 == cache.misses + + # manually invalidate all cached values + cache.invalidate_values() + assert b'zzz' == cache['foo'] + assert 2 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + cache.invalidate() + assert b'zzz' == cache['foo'] + assert 3 == store.counter['__getitem__', 'foo'] + assert 2 == store.counter['__setitem__', 'foo'] + + # test __delitem__ + del cache['foo'] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + cache['foo'] + with pytest.raises(KeyError): + # noinspection PyStatementEffect + store['foo'] + + # verify other keys untouched + assert 0 == store.counter['__getitem__', 'bar'] + assert 1 == store.counter['__setitem__', 'bar'] + + def 
test_cache_values_with_max_size(self): + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + # setup cache - can only hold one item + cache = LRUStoreCacheV3(store, max_size=5) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should have been evicted, cache miss + assert b'xxx' == cache['foo'] + assert 2 == store.counter['__getitem__', 'foo'] + assert 2 == cache.hits + assert 3 == cache.misses + + # test 'bar' __getitem__, should have been evicted, cache miss + assert b'yyy' == cache['bar'] + assert 2 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 4 == cache.misses + + # setup store + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__getitem__', 'foo'] + assert 0 == store.counter['__getitem__', 'bar'] + # setup cache - can hold two items + cache = LRUStoreCacheV3(store, max_size=6) + assert 0 == cache.hits + assert 0 == cache.misses + + # test first 'foo' __getitem__, cache miss + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 0 == cache.hits + assert 1 == cache.misses + + # test second 'foo' __getitem__, cache hit + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 1 == cache.hits + assert 1 == cache.misses + + # test first 'bar' __getitem__, cache miss + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 1 == cache.hits + assert 2 == cache.misses + + # test second 'bar' __getitem__, cache hit + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 2 == cache.hits + assert 2 == cache.misses + + # test 'foo' __getitem__, should still be cached + assert b'xxx' == cache['foo'] + assert 1 == store.counter['__getitem__', 'foo'] + assert 3 == cache.hits + assert 2 == cache.misses + + # test 'bar' __getitem__, should still be cached + assert b'yyy' == cache['bar'] + assert 1 == store.counter['__getitem__', 'bar'] + assert 4 == cache.hits + assert 2 == cache.misses + + def test_cache_keys(self): + + # setup + store = CountingDictV3() + store['foo'] = b'xxx' + store['bar'] = b'yyy' + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + assert 0 == store.counter['keys'] + cache = LRUStoreCacheV3(store, max_size=None) + + # keys should be cached on first call + keys = sorted(cache.keys()) + assert keys == ['bar', 'foo'] + assert 1 == store.counter['keys'] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 1 == store.counter['keys'] + assert 'foo' in cache + assert 0 == store.counter['__contains__', 'foo'] + assert keys == 
sorted(cache) + assert 0 == store.counter['__iter__'] + assert 1 == store.counter['keys'] + + # cache should be cleared if store is modified - crude but simple for now + cache['baz'] = b'zzz' + keys = sorted(cache.keys()) + assert keys == ['bar', 'baz', 'foo'] + assert 2 == store.counter['keys'] + # keys should now be cached + assert keys == sorted(cache.keys()) + assert 2 == store.counter['keys'] + + # manually invalidate keys + cache.invalidate_keys() + keys = sorted(cache.keys()) + assert keys == ['bar', 'baz', 'foo'] + assert 3 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + cache.invalidate_keys() + keys = sorted(cache) + assert keys == ['bar', 'baz', 'foo'] + assert 4 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + cache.invalidate_keys() + assert 'foo' in cache + assert 5 == store.counter['keys'] + assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__iter__'] + + # check these would get counted if called directly + assert 'foo' in store + assert 1 == store.counter['__contains__', 'foo'] + assert keys == sorted(store) + assert 1 == store.counter['__iter__'] + + +# TODO: implement ABSStoreV3 +# @skip_test_env_var("ZARR_TEST_ABS") +# class TestABSStoreV3(TestABSStore, StoreV3Tests): diff --git a/zarr/tests/util.py b/zarr/tests/util.py index e0f11d72ad..bb4df90d1b 100644 --- a/zarr/tests/util.py +++ b/zarr/tests/util.py @@ -1,7 +1,7 @@ import collections import os -from zarr.storage import Store +from zarr.storage import Store, StoreV3 import pytest @@ -41,6 +41,10 @@ def __delitem__(self, key): del self.wrapped[key] +class CountingDictV3(CountingDict, StoreV3): + pass + + def skip_test_env_var(name): """ Checks for environment variables indicating whether tests requiring services should be run """ From 85b8e2322025a367a9f47329cfef9e8a095b0e87 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 17 Nov 2021 13:58:52 -0500 Subject: [PATCH 2/9] add TODO comment to meta.py --- zarr/meta.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zarr/meta.py b/zarr/meta.py index 07fbdcb7d4..d3f4ec50d5 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -22,6 +22,12 @@ ) _v3_core_type = {"bool", "i1", "u1"} | _v3_core_type +# TODO: How do we want to handle dtypes not officially in the v3 spec? +# Those in _v3_core_type above are the only ones defined in the spec. +# However we currently support many other dtypes for v2. For now, I also +# allow all of these for v3 unless the user sets an environment variable +# ZARR_V3_CORE_DTYPES_ONLY=1, etc. 
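+# For example (assumed usage; these flags are read once, at import time):
+#
+#     ZARR_V3_CORE_DTYPES_ONLY=1 python my_script.py
+#
+# makes Metadata3 reject complex, datetime64/timedelta64, object, structured,
+# bytes and unicode dtypes with a ValueError, while individual categories can
+# still be re-enabled through the corresponding ZARR_V3_ALLOW_* variables below.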
+ ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", not ZARR_V3_CORE_DTYPES_ONLY)) From 983d190dd7be8fb9b7db95cbb3d92c43517a71d3 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 30 Nov 2021 14:37:47 -0500 Subject: [PATCH 3/9] fix flake8 errors --- zarr/_storage/store.py | 1 - zarr/tests/test_storage.py | 9 ++------- zarr/tests/test_storage_v3.py | 6 +++--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 0ff9e0c043..2c6d7b3978 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,4 +1,3 @@ -import json import sys from collections.abc import MutableMapping from string import ascii_letters, digits diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 0865917926..1cad7f459f 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -18,9 +18,9 @@ from numcodecs.compat import ensure_bytes from zarr.codecs import BZ2, AsType, Blosc, Zlib -from zarr.errors import ContainsArrayError, ContainsGroupError, MetadataError +from zarr.errors import MetadataError from zarr.hierarchy import group -from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3, decode_array_metadata +from zarr.meta import ZARR_FORMAT, decode_array_metadata from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, ConsolidatedMetadataStore, DBMStore, DictStore, DirectoryStore, KVStore, LMDBStore, @@ -31,12 +31,7 @@ attrs_key, default_compressor, getsize, group_meta_key, init_array, init_group, migrate_1to2) from zarr.storage import FSStore, rename, listdir -from zarr.storage import (KVStoreV3, MemoryStoreV3, ZipStoreV3, FSStoreV3, - DirectoryStoreV3, NestedDirectoryStoreV3, - RedisStoreV3, MongoDBStoreV3, DBMStoreV3, - LMDBStoreV3, SQLiteStoreV3, LRUStoreCacheV3) from zarr.tests.util import CountingDict, have_fsspec, skip_test_env_var, abs_container -from zarr.tests.util import CountingDictV3 @contextmanager diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index 9118bc513c..24f358c350 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from numcodecs.compat import ensure_bytes - from zarr.codecs import Zlib from zarr.errors import ContainsArrayError, ContainsGroupError from zarr.meta import ZARR_FORMAT, ZARR_FORMAT_v3 @@ -25,9 +23,11 @@ TestDBMStoreNDBM, TestDBMStoreBerkeleyDB, TestLMDBStore, TestSQLiteStore, TestSQLiteStoreInMemory, TestLRUStoreCache, - dimension_separator_fixture, s3, skip_if_nested_chunks) +# pytest will fail to run if the following fixtures aren't imported here +from .test_storage import dimension_separator_fixture, s3 # noqa + @pytest.fixture(params=[ (None, "/"), From 9ed6181c41567fa99ef6fe1c3f73d89d62280706 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 14 Dec 2021 19:10:04 -0500 Subject: [PATCH 4/9] follow zarr v3 spec when dealing with extension data types --- zarr/meta.py | 224 +++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 113 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index d3f4ec50d5..72c6cfc869 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,6 +1,7 @@ import base64 import itertools import os +from collections import namedtuple from collections.abc import Mapping import numpy as np @@ -13,42 +14,72 @@ ZARR_FORMAT = 2 ZARR_FORMAT_v3 = 3 -FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": 
np.NINF} - - -_v3_core_type = set( - "".join(d) - for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) -) -_v3_core_type = {"bool", "i1", "u1"} | _v3_core_type - -# TODO: How do we want to handle dtypes not officially in the v3 spec? -# Those in _v3_core_type above are the only ones defined in the spec. -# However we currently support many other dtypes for v2. For now, I also -# allow all of these for v3 unless the user sets an environment variable -# ZARR_V3_CORE_DTYPES_ONLY=1, etc. - -ZARR_V3_CORE_DTYPES_ONLY = int(os.environ.get("ZARR_V3_CORE_DTYPES_ONLY", False)) -ZARR_V3_ALLOW_COMPLEX = int(os.environ.get("ZARR_V3_ALLOW_COMPLEX", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_DATETIME = int(os.environ.get("ZARR_V3_ALLOW_DATETIME", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_STRUCTURED = int(os.environ.get("ZARR_V3_ALLOW_STRUCTURED", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_OBJECTARRAY = int(os.environ.get("ZARR_V3_ALLOW_OBJECTARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_BYTES_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_BYTES_ARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) -ZARR_V3_ALLOW_UNICODE_ARRAY = int(os.environ.get("ZARR_V3_ALLOW_UNICODE_ARRAY", - not ZARR_V3_CORE_DTYPES_ONLY)) +# FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} _default_entry_point_metadata_v3 = { - 'zarr_format': "https://purl.org/zarr/spec/protocol/core/3.0", - 'metadata_encoding': "https://purl.org/zarr/spec/protocol/core/3.0", - 'metadata_key_suffix': '.json', + "zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_encoding": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_key_suffix": ".json", "extensions": [], } +_v3_core_types = set( + "".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) +) +_v3_core_types = {"bool", "i1", "u1"} | _v3_core_types + +# The set of complex types allowed ({"c4", ">c8"}) +_v3_complex_types = set( + f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("4", "8")) +) + +# All dtype.str values corresponding to datetime64 and timedelta64 +# see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units +_date_units = ["Y", "M", "W", "D"] +_time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +_v3_datetime_types = set(f"{end}{kind}8[{unit}]" for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M'))) + + +def get_extended_dtype_info(dtype): + if dtype.str in _v3_complex_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/complex-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str == "|O": + return dict( + extension="TODO: object array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|S"): + return dict( + extension="TODO: bytestring array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|U"): + return dict( + extension="TODO: unicode array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|V"): + return dict( + extension="TODO: structured array protocol URL", # noqa + type=dtype.descr, + fallback=None, + ) + elif dtype.str in _v3_datetime_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/datetime-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + else: + raise ValueError(f"Unsupport dtype: {dtype}") + class 
Metadata2: ZARR_FORMAT = ZARR_FORMAT @@ -85,12 +116,13 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A dtype = cls.decode_dtype(meta["dtype"]) if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['filters'][0]) + + object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None dimension_separator = meta.get("dimension_separator", None) - fill_value = cls.decode_fill_value(meta['fill_value'], dtype, object_codec) + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) meta = dict( zarr_format=meta["zarr_format"], shape=tuple(meta["shape"]), @@ -102,7 +134,7 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A filters=meta["filters"], ) if dimension_separator: - meta['dimension_separator'] = dimension_separator + meta["dimension_separator"] = dimension_separator except Exception as e: raise MetadataError("error decoding metadata") from e else: @@ -118,7 +150,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: dimension_separator = meta.get("dimension_separator") if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['filters'][0]) + + object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None @@ -133,7 +166,7 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: filters=meta["filters"], ) if dimension_separator: - meta['dimension_separator'] = dimension_separator + meta["dimension_separator"] = dimension_separator if dimension_separator: meta["dimension_separator"] = dimension_separator @@ -180,13 +213,15 @@ def encode_group_metadata(cls, meta=None) -> bytes: return json_dumps(meta) @classmethod - def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + def decode_fill_value( + cls, v: Any, dtype: np.dtype, object_codec: Any = None + ) -> Any: # early out if v is None: return v - if dtype.kind == 'V' and dtype.hasobject: + if dtype.kind == "V" and dtype.hasobject: if object_codec is None: - raise ValueError('missing object_codec for object array') + raise ValueError("missing object_codec for object array") v = base64.standard_b64decode(v) v = object_codec.decode(v) v = np.array(v, dtype=dtype)[()] @@ -228,15 +263,17 @@ def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return np.array(v, dtype=dtype)[()] @classmethod - def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: + def encode_fill_value( + cls, v: Any, dtype: np.dtype, object_codec: Any = None + ) -> Any: # early out if v is None: return v - if dtype.kind == 'V' and dtype.hasobject: + if dtype.kind == "V" and dtype.hasobject: if object_codec is None: - raise ValueError('missing object_codec for object array') + raise ValueError("missing object_codec for object array") v = object_codec.encode(v) - v = str(base64.standard_b64encode(v), 'ascii') + v = str(base64.standard_b64encode(v), "ascii") return v if dtype.kind == "f": if np.isnan(v): @@ -253,8 +290,10 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return bool(v) elif dtype.kind in "c": c = cast(np.complex128, np.dtype(complex).type()) - v = (cls.encode_fill_value(v.real, c.real.dtype, object_codec), - cls.encode_fill_value(v.imag, c.imag.dtype, object_codec)) + v = ( + cls.encode_fill_value(v.real, c.real.dtype, object_codec), + cls.encode_fill_value(v.imag, c.imag.dtype, object_codec), + ) return v elif dtype.kind in "SV": v = 
str(base64.standard_b64encode(v), "ascii") @@ -272,74 +311,29 @@ class Metadata3(Metadata2): @classmethod def decode_dtype(cls, d): + if isinstance(d, dict): + # extract the type from the extension info + info = get_extended_dtype_info(d) + d = info['type'] d = cls._decode_dtype_descr(d) dtype = np.dtype(d) - if dtype.kind == 'c': - if not ZARR_V3_ALLOW_COMPLEX: - raise ValueError("complex-valued arrays not supported") - elif dtype.kind in 'mM': - if not ZARR_V3_ALLOW_DATETIME: - raise ValueError( - "datetime64 and timedelta64 arrays not supported" - ) - elif dtype.kind == 'O': - if not ZARR_V3_ALLOW_OBJECTARRAY: - raise ValueError("object arrays not supported") - elif dtype.kind == 'V': - if not ZARR_V3_ALLOW_STRUCTURED: - raise ValueError("structured arrays not supported") - elif dtype.kind == 'U': - if not ZARR_V3_ALLOW_UNICODE_ARRAY: - raise ValueError("unicode arrays not supported") - elif dtype.kind == 'S': - if not ZARR_V3_ALLOW_BYTES_ARRAY: - raise ValueError("bytes arrays not supported") - else: - assert d in _v3_core_type return dtype @classmethod def encode_dtype(cls, d): - s = Metadata2.encode_dtype(d) + s = d.str if s == "|b1": return "bool" elif s == "|u1": return "u1" elif s == "|i1": return "i1" - dtype = np.dtype(d) - if dtype.kind == "c": - if not ZARR_V3_ALLOW_COMPLEX: - raise ValueError( - "complex-valued arrays not part of the base v3 spec" - ) - elif dtype.kind in "mM": - if not ZARR_V3_ALLOW_DATETIME: - raise ValueError( - "datetime64 and timedelta64 not part of the base v3 " - "spec" - ) - elif dtype.kind == "O": - if not ZARR_V3_ALLOW_OBJECTARRAY: - raise ValueError( - "object dtypes are not part of the base v3 spec" - ) - elif dtype.kind == "V": - if not ZARR_V3_ALLOW_STRUCTURED: - raise ValueError( - "structured arrays are not part of the base v3 spec" - ) - elif dtype.kind == 'U': - if not ZARR_V3_ALLOW_UNICODE_ARRAY: - raise ValueError("unicode dtypes are not part of the base v3 " - "spec") - elif dtype.kind == 'S': - if not ZARR_V3_ALLOW_BYTES_ARRAY: - raise ValueError("bytes dtypes are not part of the base v3 " - "spec") + elif s in _v3_core_types: + return Metadata2.encode_dtype(d) else: - assert s in _v3_core_type - return s + # Check if this dtype corresponds to a supported extension to + # the v3 protocol. + return get_extended_dtype_info(np.dtype(d)) @classmethod def decode_group_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: @@ -350,7 +344,7 @@ def decode_group_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A # if zarr_format != cls.ZARR_FORMAT: # raise MetadataError("unsupported zarr format: %s" % zarr_format) - assert 'attributes' in meta + assert "attributes" in meta # meta = dict(attributes=meta['attributes']) return meta @@ -362,7 +356,7 @@ def encode_group_metadata(cls, meta=None) -> bytes: # entry point metadata instead # meta = dict(zarr_format=cls.ZARR_FORMAT) if meta is None: - meta = {'attributes': {}} + meta = {"attributes": {}} meta = dict(attributes=meta.get("attributes", {})) return json_dumps(meta) @@ -371,26 +365,28 @@ def encode_hierarchy_metadata(cls, meta=None) -> bytes: if meta is None: meta = _default_entry_point_metadata_v3 elif set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", }: raise ValueError(f"Unexpected keys in metadata. 
meta={meta}") return json_dumps(meta) @classmethod - def decode_hierarchy_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + def decode_hierarchy_metadata( + cls, s: Union[MappingType, str] + ) -> MappingType[str, Any]: meta = cls.parse_metadata(s) # check metadata format # zarr_format = meta.get("zarr_format", None) # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": # raise MetadataError("unsupported zarr format: %s" % zarr_format) if set(meta.keys()) != { - "zarr_format", - "metadata_encoding", - "metadata_key_suffix", - "extensions", + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", }: raise ValueError(f"Unexpected keys in metdata. meta={meta}") return meta @@ -409,7 +405,8 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A dtype = cls.decode_dtype(meta["data_type"]) if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) else: object_codec = None fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) @@ -446,7 +443,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: dimension_separator = meta.get("dimension_separator") if dtype.hasobject: import numcodecs - object_codec = numcodecs.get_codec(meta['attributes']['filters'][0]) + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) else: object_codec = None meta = dict( From 662e310fb5fe01500a10c5f02c358ebda559a5ec Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 14 Dec 2021 19:40:07 -0500 Subject: [PATCH 5/9] fixes to v3 dtype handling --- zarr/meta.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 72c6cfc869..bdb7dd9702 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -28,9 +28,9 @@ ) _v3_core_types = {"bool", "i1", "u1"} | _v3_core_types -# The set of complex types allowed ({"c4", ">c8"}) +# The set of complex types allowed ({"c8", ">c16"}) _v3_complex_types = set( - f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("4", "8")) + f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("8", "16")) ) # All dtype.str values corresponding to datetime64 and timedelta64 @@ -310,13 +310,24 @@ class Metadata3(Metadata2): ZARR_FORMAT = ZARR_FORMAT_v3 @classmethod - def decode_dtype(cls, d): + def decode_dtype(cls, d, validate=True): if isinstance(d, dict): # extract the type from the extension info - info = get_extended_dtype_info(d) - d = info['type'] + try: + d = d['type'] + except KeyError: + raise KeyError( + "Extended dtype info must provide a key named 'type'." 
+ ) d = cls._decode_dtype_descr(d) dtype = np.dtype(d) + if validate: + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + # it is a core dtype of the v3 spec + pass + else: + # will raise if this is not a recognized extended dtype + get_extended_dtype_info(dtype) return dtype @classmethod From 450c57506a7427692a55acf098a57a10152dda88 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Wed, 15 Dec 2021 09:40:50 -0500 Subject: [PATCH 6/9] flake8 cleanup --- zarr/meta.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index bdb7dd9702..3d56e16fd3 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,7 +1,5 @@ import base64 import itertools -import os -from collections import namedtuple from collections.abc import Mapping import numpy as np @@ -37,7 +35,10 @@ # see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units _date_units = ["Y", "M", "W", "D"] _time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] -_v3_datetime_types = set(f"{end}{kind}8[{unit}]" for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M'))) +_v3_datetime_types = set( + f"{end}{kind}8[{unit}]" + for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M')) +) def get_extended_dtype_info(dtype): @@ -322,7 +323,7 @@ def decode_dtype(cls, d, validate=True): d = cls._decode_dtype_descr(d) dtype = np.dtype(d) if validate: - if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): # it is a core dtype of the v3 spec pass else: From 843210020909c0d935ff7b2141f5d31b036f26de Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 30 Nov 2021 14:59:43 -0500 Subject: [PATCH 7/9] update Attributes, adding StoreV3 support avoid pytest error about missing fixture fix flake8 error related to zarr_version fixture --- zarr/attrs.py | 68 +++++++++++++++++---- zarr/tests/test_attrs.py | 125 +++++++++++++++++++++++++-------------- zarr/tests/test_sync.py | 3 +- 3 files changed, 141 insertions(+), 55 deletions(-) diff --git a/zarr/attrs.py b/zarr/attrs.py index eff1237db1..78c26461c4 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -1,6 +1,6 @@ from collections.abc import MutableMapping -from zarr._storage.store import Store +from zarr._storage.store import Store, StoreV3 from zarr.util import json_dumps @@ -26,7 +26,15 @@ class Attributes(MutableMapping): def __init__(self, store, key='.zattrs', read_only=False, cache=True, synchronizer=None): - self.store = Store._ensure_store(store) + + self._version = getattr(store, '_store_version', 2) + assert key + + if self._version == 3 and '.z' in key: + raise ValueError('invalid v3 key') + + _Store = Store if self._version == 2 else StoreV3 + self.store = _Store._ensure_store(store) self.key = key self.read_only = read_only self.cache = cache @@ -38,6 +46,8 @@ def _get_nosync(self): data = self.store[self.key] except KeyError: d = dict() + if self._version > 2: + d['attributes'] = {} else: d = self.store._metadata_class.parse_metadata(data) return d @@ -47,6 +57,8 @@ def asdict(self): if self.cache and self._cached_asdict is not None: return self._cached_asdict d = self._get_nosync() + if self._version == 3: + d = d['attributes'] if self.cache: self._cached_asdict = d return d @@ -54,7 +66,10 @@ def asdict(self): def refresh(self): """Refresh cached attributes from the store.""" if self.cache: - self._cached_asdict = self._get_nosync() + if self._version == 3: + self._cached_asdict = 
self._get_nosync()['attributes'] + else: + self._cached_asdict = self._get_nosync() def __contains__(self, x): return x in self.asdict() @@ -84,7 +99,10 @@ def _setitem_nosync(self, item, value): d = self._get_nosync() # set key value - d[item] = value + if self._version == 2: + d[item] = value + else: + d['attributes'][item] = value # _put modified data self._put_nosync(d) @@ -98,7 +116,10 @@ def _delitem_nosync(self, key): d = self._get_nosync() # delete key value - del d[key] + if self._version == 2: + del d[key] + else: + del d['attributes'][key] # _put modified data self._put_nosync(d) @@ -106,12 +127,34 @@ def _delitem_nosync(self, key): def put(self, d): """Overwrite all attributes with the key/value pairs in the provided dictionary `d` in a single operation.""" - self._write_op(self._put_nosync, d) + if self._version == 2: + self._write_op(self._put_nosync, d) + else: + self._write_op(self._put_nosync, dict(attributes=d)) def _put_nosync(self, d): - self.store[self.key] = json_dumps(d) - if self.cache: - self._cached_asdict = d + if self._version == 2: + self.store[self.key] = json_dumps(d) + if self.cache: + self._cached_asdict = d + else: + if self.key in self.store: + # Cannot write the attributes directly to JSON, but have to + # store it within the pre-existing attributes key of the v3 + # metadata. + + # Note: this changes the store.counter result in test_caching_on! + + meta = self.store._metadata_class.parse_metadata(self.store[self.key]) + if 'attributes' in meta and 'filters' in meta['attributes']: + # need to preserve any existing "filters" attribute + d['attributes']['filters'] = meta['attributes']['filters'] + meta['attributes'] = d['attributes'] + else: + meta = d + self.store[self.key] = json_dumps(meta) + if self.cache: + self._cached_asdict = d['attributes'] # noinspection PyMethodOverriding def update(self, *args, **kwargs): @@ -124,7 +167,12 @@ def _update_nosync(self, *args, **kwargs): d = self._get_nosync() # update - d.update(*args, **kwargs) + if self._version == 2: + d.update(*args, **kwargs) + else: + if 'attributes' not in d: + d['attributes'] = {} + d['attributes'].update(*args, **kwargs) # _put modified data self._put_nosync(d) diff --git a/zarr/tests/test_attrs.py b/zarr/tests/test_attrs.py index b2de736d4a..62faf662da 100644 --- a/zarr/tests/test_attrs.py +++ b/zarr/tests/test_attrs.py @@ -3,8 +3,20 @@ import pytest from zarr.attrs import Attributes -from zarr.tests.util import CountingDict -from zarr.storage import KVStore +from zarr.storage import KVStore, KVStoreV3 +from zarr.tests.util import CountingDict, CountingDictV3 + + +@pytest.fixture(params=[2, 3]) +def zarr_version(request): + return request.param + + +def _init_store(version): + """Use a plain dict() for v2, but KVStoreV3 otherwise.""" + if version == 2: + return dict() + return KVStoreV3(dict()) class TestAttributes(): @@ -12,13 +24,9 @@ class TestAttributes(): def init_attributes(self, store, read_only=False, cache=True): return Attributes(store, key='attrs', read_only=read_only, cache=cache) - @pytest.mark.parametrize('store_from_dict', [False, True]) - def test_storage(self, store_from_dict): + def test_storage(self, zarr_version): - if store_from_dict: - store = dict() - else: - store = KVStore(dict()) + store = _init_store(zarr_version) a = Attributes(store=store, key='attrs') assert isinstance(a.store, KVStore) assert 'foo' not in a @@ -30,11 +38,14 @@ def test_storage(self, store_from_dict): assert 'attrs' in store assert isinstance(store['attrs'], bytes) d = 
json.loads(str(store['attrs'], 'ascii')) + if zarr_version == 3: + d = d['attributes'] assert dict(foo='bar', baz=42) == d - def test_get_set_del_contains(self): + def test_get_set_del_contains(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 'foo' not in a a['foo'] = 'bar' a['baz'] = 42 @@ -48,9 +59,10 @@ def test_get_set_del_contains(self): # noinspection PyStatementEffect a['foo'] - def test_update_put(self): + def test_update_put(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 'foo' not in a assert 'bar' not in a assert 'baz' not in a @@ -65,9 +77,10 @@ def test_update_put(self): assert a['bar'] == 84 assert 'baz' not in a - def test_iterators(self): + def test_iterators(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store) assert 0 == len(a) assert set() == set(a) assert set() == set(a.keys()) @@ -83,10 +96,13 @@ def test_iterators(self): assert {'bar', 42} == set(a.values()) assert {('foo', 'bar'), ('baz', 42)} == set(a.items()) - def test_read_only(self): - store = dict() + def test_read_only(self, zarr_version): + store = _init_store(zarr_version) a = self.init_attributes(store, read_only=True) - store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='bar', baz=42))).encode('ascii') assert a['foo'] == 'bar' assert a['baz'] == 42 with pytest.raises(PermissionError): @@ -96,8 +112,9 @@ def test_read_only(self): with pytest.raises(PermissionError): a.update(foo='quux') - def test_key_completions(self): - a = self.init_attributes(dict()) + def test_key_completions(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store) d = a._ipython_key_completions_() assert 'foo' not in d assert '123' not in d @@ -112,14 +129,17 @@ def test_key_completions(self): assert 'asdf;' in d assert 'baz' not in d - def test_caching_on(self): + def test_caching_on(self, zarr_version): # caching is turned on by default # setup store - store = CountingDict() + store = CountingDict() if zarr_version == 2 else CountingDictV3() assert 0 == store.counter['__getitem__', 'attrs'] assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') assert 0 == store.counter['__getitem__', 'attrs'] assert 1 == store.counter['__setitem__', 'attrs'] @@ -136,54 +156,65 @@ def test_caching_on(self): # test __setitem__ updates the cache a['foo'] = 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] + get_cnt = 2 if zarr_version == 2 else 3 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] # test update() updates the cache a.update(foo='zzz', bar=84) - assert 3 == store.counter['__getitem__', 'attrs'] + get_cnt = 3 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', 'attrs'] 
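+        # (the extra v3 __getitem__ counts come from Attributes._put_nosync,
+        #  which re-reads the stored metadata document so it can update its
+        #  'attributes' key rather than replacing the whole document)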
assert 3 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] # test __contains__ uses the cache assert 'foo' in a - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert 'spam' not in a - assert 3 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] # test __delitem__ updates the cache del a['bar'] - assert 4 == store.counter['__getitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 7 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 4 == store.counter['__setitem__', 'attrs'] assert 'bar' not in a - assert 4 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 4 == store.counter['__setitem__', 'attrs'] # test refresh() - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') - assert 4 == store.counter['__getitem__', 'attrs'] + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') + assert get_cnt == store.counter['__getitem__', 'attrs'] a.refresh() - assert 5 == store.counter['__getitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert a['foo'] == 'xxx' - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] assert a['bar'] == 42 - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', 'attrs'] - def test_caching_off(self): + def test_caching_off(self, zarr_version): # setup store - store = CountingDict() + store = CountingDict() if zarr_version == 2 else CountingDictV3() assert 0 == store.counter['__getitem__', 'attrs'] assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + + if zarr_version == 2: + store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store['attrs'] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') assert 0 == store.counter['__getitem__', 'attrs'] assert 1 == store.counter['__setitem__', 'attrs'] @@ -200,25 +231,31 @@ def test_caching_off(self): # test __setitem__ a['foo'] = 'yyy' - assert 4 == store.counter['__getitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'yyy' - assert 5 == store.counter['__getitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 6 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 2 == store.counter['__setitem__', 'attrs'] # test update() a.update(foo='zzz', bar=84) - assert 6 == store.counter['__getitem__', 'attrs'] + get_cnt = 6 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 8 == store.counter['__getitem__', 'attrs'] + get_cnt = 8 if zarr_version == 2 else 10 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 
'attrs'] # test __contains__ assert 'foo' in a - assert 9 == store.counter['__getitem__', 'attrs'] + get_cnt = 9 if zarr_version == 2 else 11 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] assert 'spam' not in a - assert 10 == store.counter['__getitem__', 'attrs'] + get_cnt = 10 if zarr_version == 2 else 12 + assert get_cnt == store.counter['__getitem__', 'attrs'] assert 3 == store.counter['__setitem__', 'attrs'] diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py index 69fc0d7708..1a763dc7f7 100644 --- a/zarr/tests/test_sync.py +++ b/zarr/tests/test_sync.py @@ -15,7 +15,8 @@ from zarr.storage import (DirectoryStore, KVStore, atexit_rmtree, init_array, init_group) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer -from zarr.tests.test_attrs import TestAttributes +# zarr_version fixture must be imported although not used directly here +from zarr.tests.test_attrs import TestAttributes, zarr_version # noqa from zarr.tests.test_core import TestArray from zarr.tests.test_hierarchy import TestGroup From 116fd6a4bff5135ad87f741d461e001953789959 Mon Sep 17 00:00:00 2001 From: Gregory Lee Date: Tue, 30 Nov 2021 16:30:24 -0500 Subject: [PATCH 8/9] add StoreV3 support to core Array object --- zarr/core.py | 147 ++++-- zarr/tests/test_core.py | 3 +- zarr/tests/test_core_v3.py | 898 +++++++++++++++++++++++++++++++++++++ 3 files changed, 1012 insertions(+), 36 deletions(-) create mode 100644 zarr/tests/test_core_v3.py diff --git a/zarr/core.py b/zarr/core.py index 6f6b468e3b..07096ab1c4 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -4,13 +4,14 @@ import math import operator import re +from collections.abc import MutableMapping from functools import reduce +from typing import Any import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray -from collections.abc import MutableMapping - +from zarr._storage.store import _prefix_to_attrs_key from zarr.attrs import Attributes from zarr.codecs import AsType, get_codec from zarr.errors import ArrayNotFoundError, ReadOnlyError, ArrayIndexError @@ -31,7 +32,13 @@ is_scalar, pop_fields, ) -from zarr.storage import array_meta_key, attrs_key, getsize, listdir, BaseStore +from zarr.storage import ( + _get_hierarchy_metadata, + _prefix_to_array_key, + getsize, + listdir, + normalize_store_arg, +) from zarr.util import ( all_equal, InfoReporter, @@ -146,7 +153,7 @@ class Array: def __init__( self, - store: BaseStore, + store: Any, # BaseStore not stricly required due to normalize_store_arg path=None, read_only=False, chunk_store=None, @@ -155,12 +162,22 @@ def __init__( cache_attrs=True, partial_decompress=False, write_empty_chunks=True, + zarr_version=None, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized - store = BaseStore._ensure_store(store) - chunk_store = BaseStore._ensure_store(chunk_store) + store = normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', 2) + + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, + zarr_version=zarr_version) + if not getattr(chunk_store, '_store_version', 2) == zarr_version: + raise ValueError( + "zarr_version of store and chunk_store must match" + ) self._store = store self._chunk_store = chunk_store @@ -175,12 +192,19 @@ def __init__( self._is_view = False self._partial_decompress = partial_decompress self._write_empty_chunks = write_empty_chunks + 
self._version = zarr_version + + if self._version == 3: + self._data_key_prefix = 'data/root/' + self._key_prefix + self._data_path = 'data/root/' + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=None) + self._metadata_key_suffix = self._hierarchy_metadata['metadata_key_suffix'] # initialize metadata self._load_metadata() # initialize attributes - akey = self._key_prefix + attrs_key + akey = _prefix_to_attrs_key(self._store, self._key_prefix) self._attrs = Attributes(store, key=akey, read_only=read_only, synchronizer=synchronizer, cache=cache_attrs) @@ -196,13 +220,13 @@ def _load_metadata(self): if self._synchronizer is None: self._load_metadata_nosync() else: - mkey = self._key_prefix + array_meta_key + mkey = _prefix_to_array_key(self._store, self._key_prefix) with self._synchronizer[mkey]: self._load_metadata_nosync() def _load_metadata_nosync(self): try: - mkey = self._key_prefix + array_meta_key + mkey = _prefix_to_array_key(self._store, self._key_prefix) meta_bytes = self._store[mkey] except KeyError: raise ArrayNotFoundError(self._path) @@ -212,21 +236,31 @@ def _load_metadata_nosync(self): meta = self._store._metadata_class.decode_array_metadata(meta_bytes) self._meta = meta self._shape = meta['shape'] - self._chunks = meta['chunks'] - self._dtype = meta['dtype'] self._fill_value = meta['fill_value'] - self._order = meta['order'] - dimension_separator = meta.get('dimension_separator', None) - if dimension_separator is None: - try: - dimension_separator = self._store._dimension_separator - except (AttributeError, KeyError): - pass - - # Fallback for any stores which do not choose a default + if self._version == 2: + self._chunks = meta['chunks'] + self._dtype = meta['dtype'] + self._order = meta['order'] if dimension_separator is None: - dimension_separator = "." + try: + dimension_separator = self._store._dimension_separator + except (AttributeError, KeyError): + pass + + # Fallback for any stores which do not choose a default + if dimension_separator is None: + dimension_separator = "." + else: + self._chunks = meta['chunk_grid']['chunk_shape'] + self._dtype = meta['data_type'] + self._order = meta['chunk_memory_layout'] + if dimension_separator is None: + # TODO: omit attribute in v3? + dimension_separator = meta.get('dimension_separator', '/') + chunk_separator = meta['chunk_grid']['separator'] + assert chunk_separator == dimension_separator + self._dimension_separator = dimension_separator # setup compressor @@ -237,7 +271,12 @@ def _load_metadata_nosync(self): self._compressor = get_codec(config) # setup filters - filters = meta['filters'] + if self._version == 2: + filters = meta.get('filters', []) + else: + # TODO: storing filters under attributes for now since the v3 + # array metadata does not have a 'filters' attribute. 
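+            # For reference, a v3 array metadata document then looks roughly
+            # like the following (field values are illustrative):
+            #     {
+            #         "shape": [1000],
+            #         "data_type": "<i4",
+            #         "chunk_grid": {"type": "regular",
+            #                        "chunk_shape": [100],
+            #                        "separator": "/"},
+            #         "chunk_memory_layout": "C",
+            #         "compressor": {...},
+            #         "fill_value": 0,
+            #         "attributes": {"filters": [...]},
+            #     }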
+ filters = meta['attributes'].get('filters', []) if filters: filters = [get_codec(config) for config in filters] self._filters = filters @@ -262,10 +301,22 @@ def _flush_metadata_nosync(self): filters_config = [f.get_config() for f in self._filters] else: filters_config = None - meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, - compressor=compressor_config, fill_value=self._fill_value, - order=self._order, filters=filters_config) - mkey = self._key_prefix + array_meta_key + meta = dict(shape=self._shape, compressor=compressor_config, + fill_value=self._fill_value, filters=filters_config) + if getattr(self._store, '_store_version', 2) == 2: + meta.update( + dict(chunks=self._chunks, dtype=self._dtype, order=self._order) + ) + else: + meta.update( + dict(chunk_grid=dict(type='regular', + chunk_shape=self._chunks, + separator=self._dimension_separator), + data_type=self._dtype, + chunk_memory_layout=self._order, + attributes=self.attrs.asdict()) + ) + mkey = _prefix_to_array_key(self._store, self._key_prefix) self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) @property @@ -453,11 +504,28 @@ def nchunks(self): def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" - # key pattern for chunk keys - prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) - # count chunk keys - return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) + if self._version == 3: + # # key pattern for chunk keys + # prog = re.compile(r'\.'.join([r'c\d+'] * min(1, self.ndim))) + # # get chunk keys, excluding the prefix + # members = self.chunk_store.list_prefix(self._data_path) + # members = [k.split(self._data_key_prefix)[1] for k in members] + # # count the chunk keys + # return sum(1 for k in members if prog.match(k)) + + # key pattern for chunk keys + prog = re.compile(self._data_key_prefix + r'c\d+') # TODO: ndim == 0 case? + # get chunk keys, excluding the prefix + members = self.chunk_store.list_prefix(self._data_path) + # count the chunk keys + return sum(1 for k in members if prog.match(k)) + else: + # key pattern for chunk keys + prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) + + # count chunk keys + return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) # backwards compatibility initialized = nchunks_initialized @@ -2061,7 +2129,15 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): return chunk def _chunk_key(self, chunk_coords): - return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) + if self._version == 3: + # _chunk_key() corresponds to data_key(P, i, j, ...) example in the spec + # where P = self._key_prefix, i, j, ... = chunk_coords + # e.g. 
c0/2/3 for 3d array with chunk index (0, 2, 3) + # https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/core/v3.0.html#regular-grids + return ("data/root/" + self._key_prefix + + "c" + self._dimension_separator.join(map(str, chunk_coords))) + else: + return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): # decompress @@ -2242,7 +2318,8 @@ def digest(self, hashname="sha1"): for i in itertools.product(*[range(s) for s in self.cdata_shape]): h.update(self.chunk_store.get(self._chunk_key(i), b"")) - h.update(self.store.get(self._key_prefix + array_meta_key, b"")) + mkey = _prefix_to_array_key(self._store, self._key_prefix) + h.update(self.store.get(mkey, b"")) h.update(self.store.get(self.attrs.key, b"")) @@ -2279,7 +2356,7 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, self._cache_metadata, self._attrs.cache, - self._partial_decompress, self._write_empty_chunks) + self._partial_decompress, self._write_empty_chunks, self._version) def __setstate__(self, state): self.__init__(*state) @@ -2292,7 +2369,7 @@ def _synchronized_op(self, f, *args, **kwargs): else: # synchronize on the array - mkey = self._key_prefix + array_meta_key + mkey = _prefix_to_array_key(self._store, self._key_prefix) lock = self._synchronizer[mkey] with lock: @@ -2559,7 +2636,7 @@ def view(self, shape=None, chunks=None, dtype=None, if synchronizer is None: synchronizer = self._synchronizer a = Array(store=store, path=path, chunk_store=chunk_store, read_only=read_only, - synchronizer=synchronizer, cache_metadata=True) + synchronizer=synchronizer, cache_metadata=True, zarr_version=self._version) a._is_view = True # allow override of some properties diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 7423132887..938a58b494 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -43,6 +43,8 @@ class TestArray(unittest.TestCase): + _version = 2 + def test_array_init(self): # normal initialization @@ -1180,7 +1182,6 @@ def test_object_arrays(self): def test_object_arrays_vlen_text(self): data = np.array(greetings * 1000, dtype=object) - z = self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) z[0] = 'foo' assert z[0] == 'foo' diff --git a/zarr/tests/test_core_v3.py b/zarr/tests/test_core_v3.py new file mode 100644 index 0000000000..d0d08b29bb --- /dev/null +++ b/zarr/tests/test_core_v3.py @@ -0,0 +1,898 @@ +import atexit +import os +import shutil +from tempfile import mkdtemp, mktemp + +import numpy as np +import pytest +from numcodecs import (Blosc, Zlib) +from numcodecs.compat import ensure_bytes +from numpy.testing import assert_array_almost_equal, assert_array_equal + +from zarr.core import Array +from zarr.errors import ArrayNotFoundError, ContainsGroupError +from zarr.meta import json_loads +from zarr.storage import ( + # ABSStoreV3, + DBMStoreV3, + DirectoryStoreV3, + FSStoreV3, + KVStoreV3, + LMDBStoreV3, + LRUStoreCacheV3, + NestedDirectoryStoreV3, + SQLiteStoreV3, + StoreV3, + atexit_rmglob, + atexit_rmtree, + init_array, + init_group, +) +from zarr.tests.test_core import TestArrayWithPath +from zarr.tests.util import have_fsspec +from zarr.util import buffer_size + + +# Start with TestArrayWithPathV3 not TestArrayV3 since path must be supplied + +class TestArrayWithPathV3(TestArrayWithPath): + + _version = 3 + + @staticmethod + def 
create_array(array_path='arr1', read_only=False, **kwargs): + store = KVStoreV3(dict()) + kwargs.setdefault('compressor', Zlib(level=1)) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) + init_array(store, path=array_path, **kwargs) + return Array(store, path=array_path, read_only=read_only, + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) + + def test_array_init(self): + + # should not be able to initialize without a path in V3 + store = KVStoreV3(dict()) + with pytest.raises(ValueError): + init_array(store, shape=100, chunks=10, dtype=" Date: Tue, 30 Nov 2021 19:21:36 -0500 Subject: [PATCH 9/9] add StoreV3 support to Group, open_group, etc. --- zarr/hierarchy.py | 265 ++++++++++++---- zarr/tests/test_hierarchy.py | 569 +++++++++++++++++++++++++++++------ 2 files changed, 687 insertions(+), 147 deletions(-) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 763a5f1631..53db0e617e 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -6,8 +6,7 @@ from zarr.attrs import Attributes from zarr.core import Array from zarr.creation import (array, create, empty, empty_like, full, full_like, - normalize_store_arg, ones, ones_like, zeros, - zeros_like) + ones, ones_like, zeros, zeros_like) from zarr.errors import ( ContainsArrayError, ContainsGroupError, @@ -15,14 +14,18 @@ ReadOnlyError, ) from zarr.storage import ( + _get_hierarchy_metadata, + _prefix_to_group_key, BaseStore, MemoryStore, + MemoryStoreV3, attrs_key, contains_array, contains_group, group_meta_key, init_group, listdir, + normalize_store_arg, rename, rmdir, ) @@ -109,9 +112,17 @@ class Group(MutableMapping): """ def __init__(self, store, path=None, read_only=False, chunk_store=None, - cache_attrs=True, synchronizer=None): - store: BaseStore = BaseStore._ensure_store(store) - chunk_store: BaseStore = BaseStore._ensure_store(chunk_store) + cache_attrs=True, synchronizer=None, zarr_version=None): + store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', 2) + if zarr_version > 2 and path: + if path.startswith(("meta/", "data/")): + raise ValueError("path must note start with 'meta/' or 'data/'") + if chunk_store is not None: + chunk_store: BaseStore = _normalize_store_arg(chunk_store, zarr_version=zarr_version) + if not getattr(chunk_store, '_store_version', 2) == zarr_version: + raise ValueError("zarr_version of store and chunk_store must match") self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) @@ -121,6 +132,13 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._key_prefix = '' self._read_only = read_only self._synchronizer = synchronizer + self._version = zarr_version + + if self._version == 3: + self._data_key_prefix = 'data/root/' + self._key_prefix + self._data_path = 'data/root/' + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=None) + self._metadata_key_suffix = self._hierarchy_metadata['metadata_key_suffix'] # guard conditions if contains_array(store, path=self._path): @@ -128,15 +146,31 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, # initialize metadata try: - mkey = self._key_prefix + group_meta_key + mkey = _prefix_to_group_key(self._store, self._key_prefix) + assert not mkey.endswith("root/.group") meta_bytes = store[mkey] except 
KeyError: - raise GroupNotFoundError(path) + if self._version == 2: + raise GroupNotFoundError(path) + else: + implicit_prefix = 'meta/root/' + self._key_prefix + if not implicit_prefix.endswith('/'): + implicit_prefix += '/' + if self._store.list_prefix(implicit_prefix): + # implicit group does not have any metadata + self._meta = None + else: + raise GroupNotFoundError(path) else: self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) # setup attributes - akey = self._key_prefix + attrs_key + if self._version == 2: + akey = self._key_prefix + attrs_key + else: + # Note: mkey doesn't actually exist for implicit groups, but the + # object can still be created. + akey = mkey self._attrs = Attributes(store, key=akey, read_only=read_only, cache=cache_attrs, synchronizer=synchronizer) @@ -227,11 +261,36 @@ def __iter__(self): quux """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if (contains_array(self._store, path) or - contains_group(self._store, path)): - yield key + if getattr(self._store, '_store_version', 2) == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if (contains_array(self._store, path) or + contains_group(self._store, path)): + yield key + else: + # TODO: Should this iterate over data folders and/or metadata + # folders and/or metadata files + + dir_path = 'meta/root/' + self._key_prefix + name_start = len(dir_path) + keys, prefixes = self._store.list_dir(dir_path) + + # yield any groups or arrays + sfx = self._metadata_key_suffix + for key in keys: + len_suffix = len('.group') + len(sfx) # same for .array + if key.endswith(('.group' + sfx, '.array' + sfx)): + yield key[name_start:-len_suffix] + + # also yield any implicit groups + for prefix in prefixes: + prefix = prefix.rstrip('/') + # only implicit if there is no .group.sfx file + if not prefix + '.group' + sfx in self._store: + yield prefix[name_start:] + + # Note: omit data/root/ to avoid duplicate listings + # any group in data/root/ must has an entry in meta/root/ def __len__(self): """Number of members.""" @@ -323,9 +382,11 @@ def __contains__(self, item): False """ + if self._version > 2 and item.startswith('meta/'): + raise ValueError("meta/ must not be in item") path = self._item_path(item) return contains_array(self._store, path) or \ - contains_group(self._store, path) + contains_group(self._store, path, explicit_only=False) def __getitem__(self, item): """Obtain a group member. 
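[Reviewer aside, not part of the patch: the zarr/core.py hunks earlier in this commit change both the array metadata document and the chunk key naming when the store is a StoreV3. The sketch below uses plain Python with made-up helper names; the field values are placeholders rather than real encodings, and '/' is only the separator used in the spec example.]

    # v2 metadata keeps chunks/dtype/order at the top level; v3 nests the grid
    # under 'chunk_grid' and carries user attributes in the same document.
    v2_meta = dict(shape=(100,), chunks=(10,), dtype='<f8', order='C',
                   compressor=None, fill_value=0, filters=None)
    v3_meta = dict(shape=(100,),
                   chunk_grid=dict(type='regular', chunk_shape=(10,), separator='/'),
                   data_type='<f8',
                   chunk_memory_layout='C',
                   compressor=None, fill_value=0, filters=None,
                   attributes={})

    # Chunk keys: v2 joins the indices directly under the array prefix, while
    # v3 places them under data/root/ with a leading 'c', as in the spec example.
    def chunk_key_v2(path, coords, sep='.'):
        return (path + '/' if path else '') + sep.join(map(str, coords))

    def chunk_key_v3(path, coords, sep='/'):
        return 'data/root/' + (path + '/' if path else '') + 'c' + sep.join(map(str, coords))

    assert chunk_key_v2('foo/bar', (0, 2, 3)) == 'foo/bar/0.2.3'
    assert chunk_key_v3('foo/bar', (0, 2, 3)) == 'data/root/foo/bar/c0/2/3'
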
@@ -352,11 +413,21 @@ def __getitem__(self, item): if contains_array(self._store, path): return Array(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, - synchronizer=self._synchronizer, cache_attrs=self.attrs.cache) - elif contains_group(self._store, path): + synchronizer=self._synchronizer, cache_attrs=self.attrs.cache, + zarr_version=self._version) + elif contains_group(self._store, path, explicit_only=True): return Group(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) + elif self._version == 3: + implicit_group = 'meta/root/' + path + '/' + # non-empty folder in the metadata path implies an implicit group + if self._store.list_prefix(implicit_group): + return Group(self._store, read_only=self._read_only, path=path, + chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, zarr_version=self._version) + else: + raise KeyError(item) else: raise KeyError(item) @@ -369,7 +440,7 @@ def __delitem__(self, item): def _delitem_nosync(self, item): path = self._item_path(item) if contains_array(self._store, path) or \ - contains_group(self._store, path): + contains_group(self._store, path, explicit_only=False): rmdir(self._store, path) else: raise KeyError(item) @@ -406,10 +477,24 @@ def group_keys(self): ['bar', 'foo'] """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path): - yield key + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key + else: + dir_name = 'meta/root/' + self._path + sfx = _get_hierarchy_metadata(self._store)['metadata_key_suffix'] + group_sfx = '.group' + sfx + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(group_sfx): + key = key[:-len(group_sfx)] + path = self._key_prefix + key + if path.endswith(".array" + sfx): + # skip array keys + continue + if contains_group(self._store, path, explicit_only=False): + yield key def groups(self): """Return an iterator over (name, value) pairs for groups only. 
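[Reviewer aside, not part of the patch: the explicit/implicit group distinction used by Group.__init__ and __getitem__ above boils down to the rule sketched here. `group_kind` and `list_prefix` are made-up names, and '.json' is assumed as the metadata key suffix: a path is an explicit group when its .group document exists, and an implicit group when it has no document but keys exist below its meta/root/ prefix.]

    store = {
        "zarr.json": b"{}",
        "meta/root/foo/bar.group.json": b"{}",   # explicit group 'foo/bar'
        "meta/root/foo/baz.array.json": b"{}",   # array 'foo/baz'
    }

    def list_prefix(store, prefix):
        return [k for k in store if k.startswith(prefix)]

    def group_kind(store, path):
        if "meta/root/" + path + ".group.json" in store:
            return "explicit"                    # has its own metadata document
        if list_prefix(store, "meta/root/" + path + "/"):
            return "implicit"                    # no document, but children exist below it
        return "absent"

    assert group_kind(store, "foo/bar") == "explicit"
    assert group_kind(store, "foo") == "implicit"
    assert group_kind(store, "spam") == "absent"
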
@@ -428,13 +513,39 @@ def groups(self): foo """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path): - yield key, Group(self._store, path=path, read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path, explicit_only=False): + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version) + + else: + dir_name = 'meta/root/' + self._path + sfx = _get_hierarchy_metadata(self._store)['metadata_key_suffix'] + group_sfx = '.group' + sfx + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(group_sfx): + key = key[:-len(group_sfx)] + path = self._key_prefix + key + if path.endswith(".array" + sfx): + # skip array keys + continue + if contains_group(self._store, path, explicit_only=False): + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version) def array_keys(self, recurse=False): """Return an iterator over member names for arrays only. @@ -491,14 +602,36 @@ def arrays(self, recurse=False): recurse=recurse) def _array_iter(self, keys_only, method, recurse): - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_array(self._store, path): - yield key if keys_only else (key, self[key]) - elif recurse and contains_group(self._store, path): - group = self[key] - for i in getattr(group, method)(recurse=recurse): - yield i + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + assert not path.startswith("meta") + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + for i in getattr(group, method)(recurse=recurse): + yield i + else: + dir_name = 'meta/root/' + self._path + sfx = _get_hierarchy_metadata(self._store)['metadata_key_suffix'] + array_sfx = '.array' + sfx + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(array_sfx): + key = key[:-len(array_sfx)] + path = self._key_prefix + key + assert not path.startswith("meta") + if key.endswith('.group' + sfx): + # skip group metadata keys + continue + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + for i in getattr(group, method)(recurse=recurse): + yield i def visitvalues(self, func): """Run ``func`` on each object. 
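[Reviewer aside, not part of the patch: the v3 branches of __iter__, group_keys, groups and _array_iter above all derive member names the same way. This standalone sketch condenses that rule (`member_names` is a made-up helper and '.json' an assumed suffix): strip the '.group<sfx>' / '.array<sfx>' suffixes from children of meta/root/<path>/, and treat bare sub-prefixes as implicit groups.]

    def member_names(keys, path, sfx=".json"):
        dir_path = "meta/root/" + path + "/"
        len_suffix = len(".group" + sfx)     # '.group' and '.array' have the same length
        names = set()
        for k in keys:
            if not k.startswith(dir_path):
                continue
            trail = k[len(dir_path):]
            if "/" in trail:
                # deeper keys exist, so the first segment is at least an implicit group
                names.add(trail.split("/", 1)[0])
            elif trail.endswith((".group" + sfx, ".array" + sfx)):
                names.add(trail[:-len_suffix])
        return sorted(names)

    keys = [
        "meta/root/g/foo.array.json",
        "meta/root/g/bar.group.json",
        "meta/root/g/baz/deep.array.json",   # makes 'baz' an implicit group
    ]
    assert member_names(keys, "g") == ["bar", "baz", "foo"]
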
@@ -707,7 +840,7 @@ def _create_group_nosync(self, name, overwrite=False): return Group(self._store, path=path, read_only=self._read_only, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) def create_groups(self, *names, **kwargs): """Convenience method to create multiple groups in a single call.""" @@ -751,7 +884,7 @@ def _require_group_nosync(self, name, overwrite=False): return Group(self._store, path=path, read_only=self._read_only, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) def require_groups(self, *names): """Convenience method to require multiple groups in a single call.""" @@ -1039,9 +1172,10 @@ def move(self, source, dest): # Check that source exists. if not (contains_array(self._store, source) or - contains_group(self._store, source)): + contains_group(self._store, source, explicit_only=False)): raise ValueError('The source, "%s", does not exist.' % source) - if contains_array(self._store, dest) or contains_group(self._store, dest): + if (contains_array(self._store, dest) or + contains_group(self._store, dest, explicit_only=False)): raise ValueError('The dest, "%s", already exists.' % dest) # Ensure groups needed for `dest` exist. @@ -1051,15 +1185,19 @@ def move(self, source, dest): self._write_op(self._move_nosync, source, dest) -def _normalize_store_arg(store, *, clobber=False, storage_options=None, mode=None): +def _normalize_store_arg(store, *, clobber=False, storage_options=None, mode=None, + zarr_version=None): + if zarr_version is None: + zarr_version = getattr(store, '_store_version', 2) if store is None: - return MemoryStore() + return MemoryStore() if zarr_version == 2 else MemoryStoreV3() return normalize_store_arg(store, clobber=clobber, - storage_options=storage_options, mode=mode) + storage_options=storage_options, mode=mode, + zarr_version=zarr_version) def group(store=None, overwrite=False, chunk_store=None, - cache_attrs=True, synchronizer=None, path=None): + cache_attrs=True, synchronizer=None, path=None, *, zarr_version=None): """Create a group. Parameters @@ -1104,20 +1242,29 @@ def group(store=None, overwrite=False, chunk_store=None, """ # handle polymorphic store arg - store = _normalize_store_arg(store) + store = _normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', 2) + if zarr_version == 3 and path is None: + raise ValueError(f"path must be provided for a v{zarr_version} group") path = normalize_storage_path(path) - # require group - if overwrite or not contains_group(store): + if zarr_version == 2: + requires_init = overwrite or not contains_group(store) + elif zarr_version == 3: + requires_init = overwrite or not contains_group(store, path) + + if requires_init: init_group(store, overwrite=overwrite, chunk_store=chunk_store, path=path) return Group(store, read_only=False, chunk_store=chunk_store, - cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) + cache_attrs=cache_attrs, synchronizer=synchronizer, path=path, + zarr_version=zarr_version) def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None, - chunk_store=None, storage_options=None): + chunk_store=None, storage_options=None, *, zarr_version=None): """Open a group using file-mode-like semantics. 
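[Reviewer aside, not part of the patch: in practical terms the group() and open_group() changes in this file mean a v3 group always needs an explicit path. A minimal usage sketch, assuming this patch series is installed; 'data/group.zarr' is just an example path.]

    import zarr

    g2 = zarr.group()                                # v2 default: root group
    assert (g2.path, g2.name) == ('', '/')

    g3 = zarr.group(path='group1', zarr_version=3)   # v3: path is required
    assert (g3.path, g3.name) == ('group1', '/group1')

    # open_group() follows the same rule; omitting path with zarr_version=3
    # raises ValueError.
    g = zarr.open_group('data/group.zarr', mode='w', path='group1', zarr_version=3)
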
Parameters @@ -1166,11 +1313,22 @@ def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=N # handle polymorphic store arg clobber = mode != "r" store = _normalize_store_arg( - store, clobber=clobber, storage_options=storage_options, mode=mode - ) + store, clobber=clobber, storage_options=storage_options, mode=mode, + zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', 2) if chunk_store is not None: chunk_store = _normalize_store_arg(chunk_store, clobber=clobber, storage_options=storage_options) + if not getattr(chunk_store, '_store_version', 2) == zarr_version: + raise ValueError( + "zarr_version of store and chunk_store must match" + ) + + store_version = getattr(store, '_store_version', 2) + if store_version == 3 and path is None: + raise ValueError("path must be supplied to initialize a zarr v3 group") + path = normalize_storage_path(path) # ensure store is initialized @@ -1202,4 +1360,5 @@ def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=N read_only = mode == 'r' return Group(store, read_only=read_only, cache_attrs=cache_attrs, - synchronizer=synchronizer, path=path, chunk_store=chunk_store) + synchronizer=synchronizer, path=path, chunk_store=chunk_store, + zarr_version=zarr_version) diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 2830be8c38..046995fd3e 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -27,6 +27,10 @@ NestedDirectoryStore, SQLiteStore, ZipStore, array_meta_key, atexit_rmglob, atexit_rmtree, group_meta_key, init_array, init_group) +from zarr.storage import (KVStoreV3, DirectoryStoreV3, # MemoryStoreV3 + FSStoreV3, NestedDirectoryStoreV3, ZipStoreV3, + DBMStoreV3, LMDBStoreV3, SQLiteStoreV3, + LRUStoreCacheV3) from zarr.util import InfoReporter from zarr.tests.util import skip_test_env_var, have_fsspec, abs_container @@ -96,30 +100,51 @@ def test_group_init_errors_2(self): Group(store, chunk_store=chunk_store) store.close() + def _subgroup_path(self, group, path): + path = path.rstrip('/') + absolute = path.startswith('/') + if absolute: + group_path = path + else: + if path: + group_path = '/'.join([group.path, path]) + else: + group_path = path + group_path = group_path.lstrip('/') + group_name = '/' + group_path + return group_path, group_name + def test_create_group(self): g1 = self.create_group() + if g1._version == 2: + path, name = '', '/' + else: + path, name = 'group', '/group' # check root group - assert '' == g1.path - assert '/' == g1.name + assert path == g1.path + assert name == g1.name # create level 1 child group g2 = g1.create_group('foo') + path, name = self._subgroup_path(g1, 'foo') assert isinstance(g2, Group) - assert 'foo' == g2.path - assert '/foo' == g2.name + assert path == g2.path + assert name == g2.name # create level 2 child group g3 = g2.create_group('bar') + path, name = self._subgroup_path(g2, 'bar') assert isinstance(g3, Group) - assert 'foo/bar' == g3.path - assert '/foo/bar' == g3.name + assert path == g3.path + assert name == g3.name # create level 3 child group g4 = g1.create_group('foo/bar/baz') + path, name = self._subgroup_path(g1, 'foo/bar/baz') assert isinstance(g4, Group) - assert 'foo/bar/baz' == g4.path - assert '/foo/bar/baz' == g4.name + assert path == g4.path + assert name == g4.name # create level 3 group via root g5 = g4.create_group('/a/b/c/') @@ -138,17 +163,23 @@ def __str__(self): o = Foo('test/object') go = g1.create_group(o) + path, name = 
self._subgroup_path(g1, str(o)) assert isinstance(go, Group) - assert 'test/object' == go.path + assert path == go.path go = g1.create_group(b'test/bytes') + path, name = self._subgroup_path(g1, 'test/bytes') assert isinstance(go, Group) - assert 'test/bytes' == go.path + assert path == go.path # test bad keys with pytest.raises(ValueError): g1.create_group('foo') # already exists - with pytest.raises(ValueError): - g1.create_group('a/b/c') # already exists + if g1._version == 2: + with pytest.raises(ValueError): + g1.create_group('a/b/c') # already exists + elif g1._version == 3: + # for v3 'group/a/b/c' does not already exist + g1.create_group('a/b/c') with pytest.raises(ValueError): g4.create_group('/a/b/c') # already exists with pytest.raises(ValueError): @@ -161,9 +192,9 @@ def __str__(self): # multi g6, g7 = g1.create_groups('y', 'z') assert isinstance(g6, Group) - assert g6.path == 'y' + assert g6.path == self._subgroup_path(g1, 'y')[0] assert isinstance(g7, Group) - assert g7.path == 'z' + assert g7.path == self._subgroup_path(g1, 'z')[0] g1.store.close() @@ -172,14 +203,17 @@ def test_require_group(self): # test creation g2 = g1.require_group('foo') + path, name = self._subgroup_path(g1, 'foo') assert isinstance(g2, Group) - assert 'foo' == g2.path + assert path == g2.path g3 = g2.require_group('bar') + path, name = self._subgroup_path(g2, 'bar') assert isinstance(g3, Group) - assert 'foo/bar' == g3.path + assert path == g3.path g4 = g1.require_group('foo/bar/baz') + path, name = self._subgroup_path(g1, 'foo/bar/baz') assert isinstance(g4, Group) - assert 'foo/bar/baz' == g4.path + assert path == g4.path g5 = g4.require_group('/a/b/c/') assert isinstance(g5, Group) assert 'a/b/c' == g5.path @@ -199,33 +233,50 @@ def test_require_group(self): assert g5.store is g5a.store # test path normalization - assert g1.require_group('quux') == g1.require_group('/quux/') + if g1._version == 2: + # TODO: expected behavior for v3 + assert g1.require_group('quux') == g1.require_group('/quux/') # multi g6, g7 = g1.require_groups('y', 'z') assert isinstance(g6, Group) - assert g6.path == 'y' + assert g6.path == self._subgroup_path(g1, 'y')[0] assert isinstance(g7, Group) - assert g7.path == 'z' + assert g7.path == self._subgroup_path(g1, 'z')[0] g1.store.close() + def _dataset_path(self, group, path): + path = path.rstrip('/') + absolute = path.startswith('/') + if absolute: + dataset_path = path + else: + dataset_path = '/'.join([group.path, path]) + dataset_path = dataset_path.lstrip('/') + dataset_name = '/' + dataset_path + return dataset_path, dataset_name + def test_create_dataset(self): g = self.create_group() # create as immediate child - d1 = g.create_dataset('foo', shape=1000, chunks=100) + dpath = 'foo' + d1 = g.create_dataset(dpath, shape=1000, chunks=100) + path, name = self._dataset_path(g, dpath) assert isinstance(d1, Array) assert (1000,) == d1.shape assert (100,) == d1.chunks - assert 'foo' == d1.path - assert '/foo' == d1.name + assert path == d1.path + assert name == d1.name assert g.store is d1.store # create as descendant - d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1', + dpath = '/a/b/c/' + d2 = g.create_dataset(dpath, shape=2000, chunks=200, dtype='i1', compression='zlib', compression_opts=9, fill_value=42, order='F') + path, name = self._dataset_path(g, dpath) assert isinstance(d2, Array) assert (2000,) == d2.shape assert (200,) == d2.chunks @@ -234,20 +285,22 @@ def test_create_dataset(self): assert 9 == d2.compressor.level assert 42 == d2.fill_value 
assert 'F' == d2.order - assert 'a/b/c' == d2.path - assert '/a/b/c' == d2.name + assert path == d2.path + assert name == d2.name assert g.store is d2.store # create with data data = np.arange(3000, dtype='u2') - d3 = g.create_dataset('bar', data=data, chunks=300) + dpath = 'bar' + d3 = g.create_dataset(dpath, data=data, chunks=300) + path, name = self._dataset_path(g, dpath) assert isinstance(d3, Array) assert (3000,) == d3.shape assert (300,) == d3.chunks assert np.dtype('u2') == d3.dtype assert_array_equal(data, d3[:]) - assert 'bar' == d3.path - assert '/bar' == d3.name + assert path == d3.path + assert name == d3.name assert g.store is d3.store # compression arguments handling follows... @@ -290,25 +343,27 @@ def test_require_dataset(self): g = self.create_group() # create - d1 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + dpath = 'foo' + d1 = g.require_dataset(dpath, shape=1000, chunks=100, dtype='f4') d1[:] = np.arange(1000) + path, name = self._dataset_path(g, dpath) assert isinstance(d1, Array) assert (1000,) == d1.shape assert (100,) == d1.chunks assert np.dtype('f4') == d1.dtype - assert 'foo' == d1.path - assert '/foo' == d1.name + assert path == d1.path + assert name == d1.name assert g.store is d1.store assert_array_equal(np.arange(1000), d1[:]) # require - d2 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + d2 = g.require_dataset(dpath, shape=1000, chunks=100, dtype='f4') assert isinstance(d2, Array) assert (1000,) == d2.shape assert (100,) == d2.chunks assert np.dtype('f4') == d2.dtype - assert 'foo' == d2.path - assert '/foo' == d2.name + assert path == d2.path + assert name == d2.name assert g.store is d2.store assert_array_equal(np.arange(1000), d2[:]) assert d1 == d2 @@ -419,7 +474,12 @@ def test_getitem_contains_iterators(self): # setup g1 = self.create_group() g2 = g1.create_group('foo/bar') - d1 = g2.create_dataset('/a/b/c', shape=1000, chunks=100) + if g1._version == 2: + d1 = g2.create_dataset('/a/b/c', shape=1000, chunks=100) + else: + # v3: cannot create a dataset at the root by starting with / + # instead, need to create the dataset on g1 directly + d1 = g1.create_dataset('a/b/c', shape=1000, chunks=100) d1[:] = np.arange(1000) d2 = g1.create_dataset('foo/baz', shape=3000, chunks=300) d2[:] = np.arange(3000) @@ -428,7 +488,13 @@ def test_getitem_contains_iterators(self): assert isinstance(g1['foo'], Group) assert isinstance(g1['foo']['bar'], Group) assert isinstance(g1['foo/bar'], Group) - assert isinstance(g1['/foo/bar/'], Group) + if g1._version == 2: + assert isinstance(g1['/foo/bar/'], Group) + else: + # start or end with / raises KeyError + # TODO: should we fix allow stripping of these on v3? 
+ with pytest.raises(KeyError): + assert isinstance(g1['/foo/bar/'], Group) assert isinstance(g1['foo/baz'], Array) assert g2 == g1['foo/bar'] assert g1['foo']['bar'] == g1['foo/bar'] @@ -454,7 +520,9 @@ def test_getitem_contains_iterators(self): assert 'baz' not in g1 assert 'a/b/c/d' not in g1 assert 'a/z' not in g1 - assert 'quux' not in g1['foo'] + if g1._version == 2: + # TODO: handle implicit group for v3 spec + assert 'quux' not in g1['foo'] # test key errors with pytest.raises(KeyError): @@ -470,12 +538,19 @@ def test_getitem_contains_iterators(self): assert 1 == len(g1['a/b']) # test __iter__, keys() - # currently assumes sorted by key - assert ['a', 'foo'] == list(g1) - assert ['a', 'foo'] == list(g1.keys()) - assert ['bar', 'baz'] == list(g1['foo']) - assert ['bar', 'baz'] == list(g1['foo'].keys()) + if g1._version == 2: + # currently assumes sorted by key + assert ['a', 'foo'] == list(g1) + assert ['a', 'foo'] == list(g1.keys()) + assert ['bar', 'baz'] == list(g1['foo']) + assert ['bar', 'baz'] == list(g1['foo'].keys()) + else: + # v3 is not necessarily sorted by key + assert ['a', 'foo'] == sorted(list(g1)) + assert ['a', 'foo'] == sorted(list(g1.keys())) + assert ['bar', 'baz'] == sorted(list(g1['foo'])) + assert ['bar', 'baz'] == sorted(list(g1['foo'].keys())) assert [] == sorted(g1['foo/bar']) assert [] == sorted(g1['foo/bar'].keys()) @@ -484,6 +559,9 @@ def test_getitem_contains_iterators(self): items = list(g1.items()) values = list(g1.values()) + if g1._version == 3: + # v3 are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert 'a' == items[0][0] assert g1['a'] == items[0][1] assert g1['a'] == values[0] @@ -493,6 +571,9 @@ def test_getitem_contains_iterators(self): items = list(g1['foo'].items()) values = list(g1['foo'].values()) + if g1._version == 3: + # v3 are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert 'bar' == items[0][0] assert g1['foo']['bar'] == items[0][1] assert g1['foo']['bar'] == values[0] @@ -501,11 +582,16 @@ def test_getitem_contains_iterators(self): assert g1['foo']['baz'] == values[1] # test array_keys(), arrays(), group_keys(), groups() - # currently assumes sorted by key - assert ['a', 'foo'] == list(g1.group_keys()) groups = list(g1.groups()) arrays = list(g1.arrays()) + if g1._version == 2: + # currently assumes sorted by key + assert ['a', 'foo'] == list(g1.group_keys()) + else: + assert ['a', 'foo'] == sorted(list(g1.group_keys())) + groups = sorted(groups) + arrays = sorted(arrays) assert 'a' == groups[0][0] assert g1['a'] == groups[0][1] assert 'foo' == groups[1][0] @@ -517,6 +603,9 @@ def test_getitem_contains_iterators(self): assert ['baz'] == list(g1['foo'].array_keys()) groups = list(g1['foo'].groups()) arrays = list(g1['foo'].arrays()) + if g1._version == 3: + groups = sorted(groups) + arrays = sorted(arrays) assert 'bar' == groups[0][0] assert g1['foo']['bar'] == groups[0][1] assert 'baz' == arrays[0][0] @@ -537,21 +626,27 @@ def visitor4(name, obj): del items[:] g1.visitvalues(visitor2) - assert [ + expected_items = [ "a", "a/b", "a/b/c", "foo", "foo/bar", "foo/baz", - ] == items + ] + if g1._version == 3: + expected_items = [g1.path + '/' + i for i in expected_items] + assert expected_items == items del items[:] g1["foo"].visitvalues(visitor2) - assert [ + expected_items = [ "foo/bar", "foo/baz", - ] == items + ] + if g1._version == 3: + expected_items = [g1.path + '/' + i for i in expected_items] + assert 
expected_items == items del items[:] g1.visit(visitor3) @@ -627,6 +722,9 @@ def visitor0(val, *args): # noinspection PyUnusedLocal def visitor1(val, *args): name = getattr(val, "path", val) + if name.startswith('group/'): + # strip the group path for v3 + name = name[6:] if name == "a/b/c": return True @@ -664,6 +762,9 @@ def test_iterators_recurse(self): d3 = g2.create_dataset('zab', shape=2000, chunks=200) d3[:] = np.arange(2000) + if g1._version == 3: + pytest.skip("TODO: fix for V3") + # test recursive array_keys array_keys = list(g1['foo'].array_keys(recurse=False)) array_keys_recurse = list(g1['foo'].array_keys(recurse=True)) @@ -762,9 +863,13 @@ def test_move(self): g2.move("bar", "/bar") assert "foo2" in g assert "foo2/bar" not in g - assert "bar" in g + if g2._version == 2: + # TODO: how to access element created outside of group.path in v3? + assert "bar" in g assert isinstance(g["foo2"], Group) - assert_array_equal(data, g["bar"]) + if g2._version == 2: + # TODO: how to access element created outside of group.path in v3? + assert_array_equal(data, g["bar"]) with pytest.raises(ValueError): g2.move("bar", "bar2") @@ -841,6 +946,9 @@ def test_paths(self): g1 = self.create_group() g2 = g1.create_group('foo/bar') + if g1._version == 3: + pytest.skip("TODO: update class for v3") + assert g1 == g1['/'] assert g1 == g1['//'] assert g1 == g1['///'] @@ -893,7 +1001,9 @@ def test_pickle(self): assert name == g2.name assert n == len(g2) assert keys == list(g2) - assert isinstance(g2['foo'], Group) + if g2._version == 2: + # TODO: handle implicit group for v3 + assert isinstance(g2['foo'], Group) assert isinstance(g2['foo/bar'], Array) g2.store.close() @@ -921,6 +1031,57 @@ def test_group_init_from_dict(chunk_dict): assert chunk_store is not g.chunk_store +# noinspection PyStatementEffect +class TestGroupV3(TestGroup, unittest.TestCase): + + @staticmethod + def create_store(): + # can be overridden in sub-classes + return KVStoreV3(dict()), None + + def create_group(self, store=None, path='group', read_only=False, + chunk_store=None, synchronizer=None): + # can be overridden in sub-classes + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + g = Group(store, path=path, read_only=read_only, + chunk_store=chunk_store, synchronizer=synchronizer) + return g + + def test_group_init_1(self): + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + assert store is g.store + if chunk_store is None: + assert store is g.chunk_store + else: + assert chunk_store is g.chunk_store + assert not g.read_only + # different path/name in v3 case + assert 'group' == g.path + assert '/group' == g.name + assert 'group' == g.basename + + assert isinstance(g.attrs, Attributes) + g.attrs['foo'] = 'bar' + assert g.attrs['foo'] == 'bar' + + assert isinstance(g.info, InfoReporter) + assert isinstance(repr(g.info), str) + assert isinstance(g.info._repr_html_(), str) + store.close() + + def test_group_init_errors_2(self): + store, chunk_store = self.create_store() + path = 'tmp' + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + # array blocks group + with pytest.raises(ValueError): + Group(store, path=path, chunk_store=chunk_store) + store.close() + + class TestGroupWithMemoryStore(TestGroup): @staticmethod @@ -928,6 +1089,14 @@ def create_store(): return MemoryStore(), None +# TODO: fix MemoryStoreV3 _get_parent, etc. 
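[Reviewer aside, not part of the patch: the fixture pattern TestGroupV3 relies on reduces to the sketch below (assumes the patch is installed). A v3 group is rooted at an explicit path such as 'group', which is reflected in path, name and basename.]

    from zarr.hierarchy import Group
    from zarr.storage import KVStoreV3, init_group

    store = KVStoreV3(dict())
    init_group(store, path='group')
    g = Group(store, path='group')
    assert (g.path, g.name, g.basename) == ('group', '/group', 'group')
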
+# # noinspection PyStatementEffect +# class TestGroupV3WithMemoryStore(TestGroupWithMemoryStore, TestGroupV3): + +# @staticmethod +# def create_store(): +# return MemoryStoreV3(), None + class TestGroupWithDirectoryStore(TestGroup): @staticmethod @@ -938,6 +1107,16 @@ def create_store(): return store, None +class TestGroupV3WithDirectoryStore(TestGroupWithDirectoryStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path) + return store, None + + @skip_test_env_var("ZARR_TEST_ABS") class TestGroupWithABSStore(TestGroup): @@ -953,6 +1132,8 @@ def test_pickle(self): # internal attribute on ContainerClient isn't serializable for py36 and earlier super().test_pickle() +# TODO TestGroupV3WithABSStore(TestGroup): + class TestGroupWithNestedDirectoryStore(TestGroup): @@ -964,6 +1145,16 @@ def create_store(): return store, None +class TestGroupV3WithNestedDirectoryStore(TestGroupWithNestedDirectoryStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = NestedDirectoryStoreV3(path) + return store, None + + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestGroupWithFSStore(TestGroup): @@ -986,6 +1177,29 @@ def test_round_trip_nd(self): np.testing.assert_array_equal(h[name][:], data) +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupV3WithFSStore(TestGroupWithFSStore, TestGroupV3): + + @staticmethod + def create_store(): + pytest.skip("TODO: Fix for V3") + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path) + return store, None + + def test_round_trip_nd(self): + data = np.arange(1000).reshape(10, 10, 10) + name = 'raw' + + store, _ = self.create_store() + f = open_group(store, path='group', mode='w') + f.create_dataset(name, data=data, chunks=(5, 5, 5), + compressor=None) + h = open_group(store, path='group', mode='r') + np.testing.assert_array_equal(h[name][:], data) + + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestGroupWithNestedFSStore(TestGroupWithFSStore): @@ -1009,6 +1223,30 @@ def test_inconsistent_dimension_separator(self): compressor=None, dimension_separator='.') +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupV3WithNestedFSStore(TestGroupV3WithFSStore): + + @staticmethod + def create_store(): + pytest.skip("TODO: Fix for V3") + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path, key_separator='/', auto_mkdir=True) + return store, None + + def test_inconsistent_dimension_separator(self): + data = np.arange(1000).reshape(10, 10, 10) + name = 'raw' + + store, _ = self.create_store() + f = open_group(store, path='group', mode='w') + + # cannot specify dimension_separator that conflicts with the store + with pytest.raises(ValueError): + f.create_dataset(name, data=data, chunks=(5, 5, 5), + compressor=None, dimension_separator='.') + + class TestGroupWithZipStore(TestGroup): @staticmethod @@ -1036,6 +1274,16 @@ def test_move(self): pass +class TestGroupV3WithZipStore(TestGroupWithZipStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStoreV3(path) + return store, None + + class TestGroupWithDBMStore(TestGroup): @staticmethod @@ -1046,6 +1294,16 @@ def create_store(): return store, None +class 
TestGroupV3WithDBMStore(TestGroupWithDBMStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mktemp(suffix='.anydbm') + atexit.register(atexit_rmglob, path + '*') + store = DBMStoreV3(path, flag='n') + return store, None + + class TestGroupWithDBMStoreBerkeleyDB(TestGroup): @staticmethod @@ -1057,6 +1315,17 @@ def create_store(): return store, None +class TestGroupV3WithDBMStoreBerkeleyDB(TestGroupWithDBMStoreBerkeleyDB, TestGroupV3): + + @staticmethod + def create_store(): + bsddb3 = pytest.importorskip("bsddb3") + path = tempfile.mktemp(suffix='.dbm') + atexit.register(os.remove, path) + store = DBMStoreV3(path, flag='n', open=bsddb3.btopen) + return store, None + + class TestGroupWithLMDBStore(TestGroup): @staticmethod @@ -1068,6 +1337,17 @@ def create_store(): return store, None +class TestGroupV3WithLMDBStore(TestGroupWithLMDBStore, TestGroupV3): + + @staticmethod + def create_store(): + pytest.importorskip("lmdb") + path = tempfile.mktemp(suffix='.lmdb') + atexit.register(atexit_rmtree, path) + store = LMDBStoreV3(path) + return store, None + + class TestGroupWithSQLiteStore(TestGroup): def create_store(self): @@ -1078,6 +1358,16 @@ def create_store(self): return store, None +class TestGroupV3WithSQLiteStore(TestGroupWithSQLiteStore, TestGroupV3): + + def create_store(self): + pytest.importorskip("sqlite3") + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path) + return store, None + + class TestGroupWithChunkStore(TestGroup): @staticmethod @@ -1109,6 +1399,41 @@ def test_chunk_store(self): assert expect == actual +class TestGroupV3WithChunkStore(TestGroupWithChunkStore, TestGroupV3): + + @staticmethod + def create_store(): + return KVStoreV3(dict()), KVStoreV3(dict()) + + def test_chunk_store(self): + # setup + store, chunk_store = self.create_store() + path = 'group1' + g = self.create_group(store, path=path, chunk_store=chunk_store) + + # check attributes + assert store is g.store + assert chunk_store is g.chunk_store + + # create array + a = g.zeros('foo', shape=100, chunks=10) + assert store is a.store + assert chunk_store is a.chunk_store + a[:] = np.arange(100) + assert_array_equal(np.arange(100), a[:]) + + # check store keys + group_key = 'meta/root/' + path + '.group.json' + array_key = 'meta/root/' + path + '/foo' + '.array.json' + expect = sorted([group_key, array_key, 'zarr.json']) + actual = sorted(store.keys()) + assert expect == actual + expect = ['data/root/' + path + '/foo/c' + str(i) for i in range(10)] + expect += ['zarr.json'] + actual = sorted(chunk_store.keys()) + assert expect == actual + + class TestGroupWithStoreCache(TestGroup): @staticmethod @@ -1117,44 +1442,75 @@ def create_store(): return store, None -def test_group(): +class TestGroupV3WithStoreCache(TestGroupWithStoreCache, TestGroupV3): + + @staticmethod + def create_store(): + store = LRUStoreCacheV3(dict(), max_size=None) + return store, None + + +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group(zarr_version): # test the group() convenience function # basic usage - g = group() + if zarr_version == 2: + g = group() + assert '' == g.path + assert '/' == g.name + else: + g = group(path='group1', zarr_version=zarr_version) + assert 'group1' == g.path + assert '/group1' == g.name assert isinstance(g, Group) - assert '' == g.path - assert '/' == g.name # usage with custom store - store = KVStore(dict()) - g = group(store=store) + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = 
KVStoreV3(dict()) + path = 'foo' + g = group(store=store, path=path) assert isinstance(g, Group) assert store is g.store # overwrite behaviour - store = KVStore(dict()) - init_array(store, shape=100, chunks=10) + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = KVStoreV3(dict()) + path = 'foo' + init_array(store, path=path, shape=100, chunks=10) with pytest.raises(ValueError): - group(store) - g = group(store, overwrite=True) + group(store, path=path) + g = group(store, path=path, overwrite=True) assert isinstance(g, Group) assert store is g.store -def test_open_group(): +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_open_group(zarr_version): # test the open_group() convenience function store = 'data/group.zarr' + expected_store_type = DirectoryStore if zarr_version == 2 else DirectoryStoreV3 + # mode == 'w' - g = open_group(store, mode='w') + path = None if zarr_version == 2 else 'group1' + g = open_group(store, path=path, mode='w', zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) + # TODO: update the r, r+ test case here for zarr_version == 3 after + # open_array has StoreV3 support + # mode in 'r', 'r+' open_array('data/array.zarr', shape=100, chunks=10, mode='w') for mode in 'r', 'r+': @@ -1175,37 +1531,40 @@ def test_open_group(): # mode == 'a' shutil.rmtree(store) - g = open_group(store, mode='a') + g = open_group(store, path=path, mode='a', zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) with pytest.raises(ValueError): - open_group('data/array.zarr', mode='a') + open_group('data/array.zarr', mode='a', zarr_version=zarr_version) # mode in 'w-', 'x' for mode in 'w-', 'x': shutil.rmtree(store) - g = open_group(store, mode=mode) + g = open_group(store, path=path, mode=mode, zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) with pytest.raises(ValueError): - open_group(store, mode=mode) - with pytest.raises(ValueError): - open_group('data/array.zarr', mode=mode) + open_group(store, path=path, mode=mode, zarr_version=zarr_version) + if zarr_version == 2: + with pytest.raises(ValueError): + open_group('data/array.zarr', mode=mode) # open with path - g = open_group(store, path='foo/bar') + g = open_group(store, path='foo/bar', zarr_version=zarr_version) assert isinstance(g, Group) assert 'foo/bar' == g.path -def test_group_completions(): - g = group() +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group_completions(zarr_version): + path = None if zarr_version == 2 else 'group1' + g = group(path=path, zarr_version=zarr_version) d = dir(g) assert 'foo' not in d assert 'bar' not in d @@ -1233,8 +1592,10 @@ def test_group_completions(): assert '456' not in d # not valid identifier -def test_group_key_completions(): - g = group() +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group_key_completions(zarr_version): + path = None if zarr_version == 2 else 'group1' + g = group(path=path, zarr_version=zarr_version) d = dir(g) # noinspection PyProtectedMember k = g._ipython_key_completions_() @@ -1308,9 +1669,11 @@ def _check_tree(g, 
expect_bytes, expect_text): isinstance(widget, ipytree.Tree) -def test_tree(): +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_tree(zarr_version): # setup - g1 = group() + path = None if zarr_version == 2 else 'group1' + g1 = group(path=path, zarr_version=zarr_version) g2 = g1.create_group('foo') g3 = g1.create_group('bar') g3.create_group('baz') @@ -1318,20 +1681,38 @@ def test_tree(): g5.create_dataset('baz', shape=100, chunks=10) # test root group - expect_bytes = textwrap.dedent("""\ - / - +-- bar - | +-- baz - | +-- quux - | +-- baz (100,) float64 - +-- foo""").encode() - expect_text = textwrap.dedent("""\ - / - ├── bar - │ ├── baz - │ └── quux - │ └── baz (100,) float64 - └── foo""") + if zarr_version == 2: + expect_bytes = textwrap.dedent("""\ + / + +-- bar + | +-- baz + | +-- quux + | +-- baz (100,) float64 + +-- foo""").encode() + expect_text = textwrap.dedent("""\ + / + ├── bar + │ ├── baz + │ └── quux + │ └── baz (100,) float64 + └── foo""") + else: + # Almost the same as for v2, but has a path name and the + # subgroups are not necessarily sorted alphabetically. + expect_bytes = textwrap.dedent("""\ + group1 + +-- foo + +-- bar + +-- baz + +-- quux + +-- baz (100,) float64""").encode() + expect_text = textwrap.dedent("""\ + group1 + ├── foo + └── bar + ├── baz + └── quux + └── baz (100,) float64""") _check_tree(g1, expect_bytes, expect_text) # test different group
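[Reviewer aside, not part of the patch: putting the pieces together, the key layout the v3 tests above check for can be reproduced end to end roughly as follows. This is a sketch assuming the patch is installed and mirrors TestGroupV3WithChunkStore.test_chunk_store rather than adding new behaviour.]

    import numpy as np
    import zarr
    from zarr.storage import KVStoreV3

    store, chunk_store = KVStoreV3(dict()), KVStoreV3(dict())
    g = zarr.group(store, path='group1', chunk_store=chunk_store, zarr_version=3)
    a = g.zeros('foo', shape=100, chunks=10)
    a[:] = np.arange(100)

    # metadata documents land under meta/root/, chunk payloads under data/root/
    assert 'meta/root/group1.group.json' in store
    assert 'meta/root/group1/foo.array.json' in store
    data_keys = sorted(k for k in chunk_store.keys() if k.startswith('data/'))
    assert data_keys == ['data/root/group1/foo/c%d' % i for i in range(10)]
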