diff --git a/docs/api/convenience.rst b/docs/api/convenience.rst
index 51997a4dc2..a70a90ce7c 100644
--- a/docs/api/convenience.rst
+++ b/docs/api/convenience.rst
@@ -10,3 +10,5 @@ Convenience functions (``zarr.convenience``)
 .. autofunction:: copy_all
 .. autofunction:: copy_store
 .. autofunction:: tree
+.. autofunction:: consolidate_metadata
+.. autofunction:: open_consolidated
diff --git a/docs/api/storage.rst b/docs/api/storage.rst
index 2365359fa9..74801d3115 100644
--- a/docs/api/storage.rst
+++ b/docs/api/storage.rst
@@ -27,6 +27,8 @@ Storage (``zarr.storage``)
     .. automethod:: invalidate_values
     .. automethod:: invalidate_keys
 
+.. autoclass:: ConsolidatedMetadataStore
+
 .. autofunction:: init_array
 .. autofunction:: init_group
 .. autofunction:: contains_array
diff --git a/docs/release.rst b/docs/release.rst
index 7968840cb0..96ac7c8f2f 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -9,6 +9,13 @@ Release notes
 Enhancements
 ~~~~~~~~~~~~
 
+* Add "consolidated" metadata as an experimental feature: use
+  :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various
+  metadata keys within a dataset hierarchy under a single key, and
+  :func:`zarr.convenience.open_consolidated` to use this single key. This can greatly
+  cut down the number of calls to the storage backend, and so remove a lot of overhead
+  for reading remote data. By :user:`Martin Durant <martindurant>`, :issue:`268`.
+
 * Support has been added for structured arrays with sub-array shape and/or nested
   fields. By :user:`Tarik Onalan <onalant>`, :issue:`111`, :issue:`296`.
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 5c090669ce..606b5acef5 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -778,9 +778,11 @@ chunk size, which will reduce the number of chunks and thus reduce the number of
 round-trips required to retrieve data for an array (and thus reduce the impact
 of network latency). Another option is to try to increase the compression ratio
 by changing compression options or trying a different compressor (which will
 reduce the impact of
-limited network bandwidth). As of version 2.2, Zarr also provides the
-:class:`zarr.storage.LRUStoreCache` which can be used to implement a local in-memory cache
-layer over a remote store. E.g.::
+limited network bandwidth).
+
+As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache`
+which can be used to implement a local in-memory cache layer over a remote
+store. E.g.::
 
     >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))
     >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)
@@ -797,13 +799,51 @@ layer over a remote store. E.g.::
     b'Hello from the cloud!'
     0.0009490990014455747
 
-If you are still experiencing poor performance with distributed/cloud storage, please
-raise an issue on the GitHub issue tracker with any profiling data you can provide, as
-there may be opportunities to optimise further either within Zarr or within the mapping
-interface to the storage.
+If you are still experiencing poor performance with distributed/cloud storage,
+please raise an issue on the GitHub issue tracker with any profiling data you
+can provide, as there may be opportunities to optimise further either within
+Zarr or within the mapping interface to the storage.
 
-.. _tutorial_copy:
-
+Consolidating metadata
+~~~~~~~~~~~~~~~~~~~~~~
+
+(This is an experimental feature.)
+
+Since there is a significant overhead for every connection to a cloud object
+store such as S3, the pattern described in the previous section may incur
+significant latency while scanning the metadata of the dataset hierarchy, even
+though each individual metadata object is small. In cases such as these, where
+the data are static and can be regarded as read-only (at least as far as the
+metadata/structure of the dataset hierarchy is concerned), the many metadata
+objects can be consolidated into a single one via
+:func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase
+the speed of reading the dataset metadata, e.g.::
+
+    >>> zarr.consolidate_metadata(store)  # doctest: +SKIP
+
+This creates a special key with a copy of all of the metadata from all of the
+metadata objects in the store.
+
+Later, to open a Zarr store with consolidated metadata, use
+:func:`zarr.convenience.open_consolidated`, e.g.::
+
+    >>> root = zarr.open_consolidated(store)  # doctest: +SKIP
+
+This uses the special key to read all of the metadata in a single call to the
+backend storage.
+
+Note that the hierarchy can still be opened in the normal way and altered,
+causing the consolidated metadata to become out of sync with the real state of
+the dataset hierarchy. In this case,
+:func:`zarr.convenience.consolidate_metadata` would need to be called again.
+
+To protect against consolidated metadata accidentally getting out of sync, the
+root group returned by :func:`zarr.convenience.open_consolidated` is read-only
+for the metadata, meaning that no new groups or arrays can be created, and
+arrays cannot be resized. However, data values within arrays can still be
+updated.
+
+.. _tutorial_copy:
+
 Copying/migrating data
 ----------------------
diff --git a/requirements_dev.txt b/requirements_dev.txt
index d495e04bfd..23de426def 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -46,6 +46,7 @@ python-dateutil==2.7.3
 readme-renderer==22.0
 requests==2.19.1
 requests-toolbelt==0.8.0
+setuptools-scm==3.1.0
 s3fs==0.1.6
 s3transfer==0.1.13
 scandir==1.9.0
diff --git a/zarr/__init__.py b/zarr/__init__.py
index 56d060fdac..cf34d3d427 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -12,6 +12,7 @@ from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.codecs import *
 from zarr.convenience import (open, save, save_array, save_group, load, copy_store,
-                              copy, copy_all, tree)
+                              copy, copy_all, tree, consolidate_metadata,
+                              open_consolidated)
 from zarr.errors import CopyError, MetadataError, PermissionError
 from zarr.version import version as __version__
diff --git a/zarr/attrs.py b/zarr/attrs.py
index 6d74d6479a..21cb77bc10 100644
--- a/zarr/attrs.py
+++ b/zarr/attrs.py
@@ -4,8 +4,8 @@
 
 from collections import MutableMapping
 
-from zarr.compat import text_type
 from zarr.errors import PermissionError
+from zarr.meta import parse_metadata
 
 
 class Attributes(MutableMapping):
@@ -43,7 +43,7 @@ def _get_nosync(self):
         except KeyError:
             d = dict()
         else:
-            d = json.loads(text_type(data, 'ascii'))
+            d = parse_metadata(data)
         return d
 
     def asdict(self):
diff --git a/zarr/compat.py b/zarr/compat.py
index 9be3384123..117a8edf59 100644
--- a/zarr/compat.py
+++ b/zarr/compat.py
@@ -19,6 +19,8 @@ class PermissionError(Exception):
     def OrderedDict_move_to_end(od, key):
         od[key] = od.pop(key)
 
+    from collections import Mapping
+
 
 else:  # pragma: py2 no cover
 
@@ -29,3 +31,5 @@ def OrderedDict_move_to_end(od, key):
 
     def OrderedDict_move_to_end(od, key):
         od.move_to_end(key)
+
+    from collections.abc import Mapping
diff --git a/zarr/convenience.py b/zarr/convenience.py
index 19de7b2826..1bb99c92e4 100644
--- a/zarr/convenience.py
+++ b/zarr/convenience.py
@@ -15,15 +15,16 @@
 from zarr.errors import err_path_not_found, CopyError
 from zarr.util import normalize_storage_path, TreeViewer, buffer_size
 from zarr.compat import PY2, text_type
+from zarr.meta import ensure_str, json_dumps
 
 
 # noinspection PyShadowingBuiltins
-def open(store, mode='a', **kwargs):
+def open(store=None, mode='a', **kwargs):
     """Convenience function to open a group or array using file-mode-like
     semantics.
 
     Parameters
     ----------
-    store : MutableMapping or string
+    store : MutableMapping or string, optional
         Store or path to directory in file system or name of zip file.
     mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
         Persistence mode: 'r' means read only (must exist); 'r+' means
@@ -31,12 +32,17 @@
         exist); 'w' means create (overwrite if exists); 'w-' means create
         (fail if exists).
     **kwargs
-        Additional parameters are passed through to :func:`zarr.open_array` or
-        :func:`zarr.open_group`.
+        Additional parameters are passed through to :func:`zarr.creation.open_array` or
+        :func:`zarr.hierarchy.open_group`.
+
+    Returns
+    -------
+    z : :class:`zarr.core.Array` or :class:`zarr.hierarchy.Group`
+        Array or group, depending on what exists in the given store.
 
     See Also
     --------
-    zarr.open_array, zarr.open_group
+    zarr.creation.open_array, zarr.hierarchy.open_group
 
     Examples
     --------
@@ -68,7 +74,8 @@ def open(store, mode='a', **kwargs):
     path = kwargs.get('path', None)
 
     # handle polymorphic store arg
-    store = normalize_store_arg(store, clobber=(mode == 'w'))
+    clobber = mode == 'w'
+    store = normalize_store_arg(store, clobber=clobber)
     path = normalize_storage_path(path)
 
     if mode in {'w', 'w-', 'x'}:
@@ -1069,3 +1076,110 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None,
     _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied)
 
     return n_copied, n_skipped, n_bytes_copied
+
+
+def consolidate_metadata(store, metadata_key='.zmetadata'):
+    """
+    Consolidate all metadata for groups and arrays within the given store
+    into a single resource and put it under the given key.
+
+    This produces a single object in the backend store, containing all the
+    metadata read from all the zarr-related keys that can be found. After
+    metadata have been consolidated, use :func:`open_consolidated` to open
+    the root group in optimised, read-only mode, using the consolidated
+    metadata to reduce the number of read operations on the backend store.
+
+    Note that if the metadata in the store is changed after this
+    consolidation, then the metadata read by :func:`open_consolidated`
+    would be incorrect unless this function is called again.
+
+    .. note:: This is an experimental feature.
+
+    Parameters
+    ----------
+    store : MutableMapping or string
+        Store or path to directory in file system or name of zip file.
+    metadata_key : str
+        Key to put the consolidated metadata under.
+
+    Returns
+    -------
+    g : :class:`zarr.hierarchy.Group`
+        Group instance, opened with the new consolidated metadata.
+
+    See Also
+    --------
+    open_consolidated
+
+    """
+    import json
+
+    store = normalize_store_arg(store)
+
+    def is_zarr_key(key):
+        return (key.endswith('.zarray') or key.endswith('.zgroup') or
+                key.endswith('.zattrs'))
+
+    out = {
+        'zarr_consolidated_format': 1,
+        'metadata': {
+            key: json.loads(ensure_str(store[key]))
+            for key in store if is_zarr_key(key)
+        }
+    }
+    store[metadata_key] = json_dumps(out).encode()
+    return open_consolidated(store, metadata_key=metadata_key)
+
+
+def open_consolidated(store, metadata_key='.zmetadata', mode='r+'):
+    """Open group using metadata previously consolidated into a single key.
+
+    This is an optimised method for opening a Zarr group, where instead of
+    traversing the group/array hierarchy by accessing the metadata keys at
+    each level, a single key contains all of the metadata for everything.
+    For remote data sources, where the overhead of accessing a key is large
+    compared to the time to read the data itself, this can substantially
+    speed up opening the group.
+
+    The group accessed must have already had its metadata consolidated into a
+    single key using the function :func:`consolidate_metadata`.
+
+    This optimised method only works in modes which do not change the
+    metadata, although the data may still be written/updated.
+
+    Parameters
+    ----------
+    store : MutableMapping or string
+        Store or path to directory in file system or name of zip file.
+    metadata_key : str
+        Key to read the consolidated metadata from. The default (.zmetadata)
+        corresponds to the default used by :func:`consolidate_metadata`.
+    mode : {'r', 'r+'}, optional
+        Persistence mode: 'r' means read only (must exist); 'r+' means
+        read/write (must exist), although only writes to data are allowed;
+        changes to metadata, including creation of new arrays or groups,
+        are not allowed.
+
+    Returns
+    -------
+    g : :class:`zarr.hierarchy.Group`
+        Group instance, opened with the consolidated metadata.
+
+    See Also
+    --------
+    consolidate_metadata
+
+    """
+
+    from .storage import ConsolidatedMetadataStore
+
+    # normalize parameters
+    store = normalize_store_arg(store)
+    if mode not in {'r', 'r+'}:
+        raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}"
+                         .format(mode))
+
+    # setup metadata store
+    meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key)
+
+    # pass through
+    return open(store=meta_store, chunk_store=store, mode=mode)
diff --git a/zarr/core.py b/zarr/core.py
index 00ad269557..b4da45cd99 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -165,6 +165,9 @@ def _load_metadata_nosync(self):
         if config is None:
             self._compressor = None
         else:
+            # temporary workaround for
+            # https://github.com/zarr-developers/numcodecs/issues/78
+            config = dict(config)
             self._compressor = get_codec(config)
 
         # setup filters
diff --git a/zarr/creation.py b/zarr/creation.py
index 49b4a9d2ea..0184a4a5da 100644
--- a/zarr/creation.py
+++ b/zarr/creation.py
@@ -346,15 +346,15 @@ def array(data, **kwargs):
     return z
 
 
-def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default',
-               fill_value=0, order='C', synchronizer=None, filters=None,
-               cache_metadata=True, cache_attrs=True, path=None, object_codec=None,
-               **kwargs):
+def open_array(store=None, mode='a', shape=None, chunks=True, dtype=None,
+               compressor='default', fill_value=0, order='C', synchronizer=None,
+               filters=None, cache_metadata=True, cache_attrs=True, path=None,
+               object_codec=None, chunk_store=None, **kwargs):
     """Open an array using file-mode-like semantics.
 
     Parameters
     ----------
-    store : MutableMapping or string
+    store : MutableMapping or string, optional
         Store or path to directory in file system or name of zip file.
     mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
         Persistence mode: 'r' means read only (must exist); 'r+' means
@@ -391,6 +391,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
         Array path within store.
     object_codec : Codec, optional
         A codec to encode object arrays, only needed if dtype=object.
+    chunk_store : MutableMapping or string, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
 
     Returns
     -------
@@ -426,7 +428,10 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
     # a : read/write if exists, create otherwise (default)
 
     # handle polymorphic store arg
-    store = normalize_store_arg(store, clobber=(mode == 'w'))
+    clobber = mode == 'w'
+    store = normalize_store_arg(store, clobber=clobber)
+    if chunk_store is not None:
+        chunk_store = normalize_store_arg(chunk_store, clobber=clobber)
     path = normalize_storage_path(path)
 
     # API compatibility with h5py
@@ -448,7 +453,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
         init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                    compressor=compressor, fill_value=fill_value,
                    order=order, filters=filters, overwrite=True, path=path,
-                   object_codec=object_codec)
+                   object_codec=object_codec, chunk_store=chunk_store)
 
     elif mode == 'a':
         if contains_group(store, path=path):
@@ -457,7 +462,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
             init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                        compressor=compressor, fill_value=fill_value,
                        order=order, filters=filters, path=path,
-                       object_codec=object_codec)
+                       object_codec=object_codec, chunk_store=chunk_store)
 
     elif mode in ['w-', 'x']:
         if contains_group(store, path=path):
@@ -468,14 +473,15 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
             init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                        compressor=compressor, fill_value=fill_value,
                        order=order, filters=filters, path=path,
-                       object_codec=object_codec)
+                       object_codec=object_codec, chunk_store=chunk_store)
 
     # determine read only status
     read_only = mode == 'r'
 
     # instantiate array
     z = Array(store, read_only=read_only, synchronizer=synchronizer,
-              cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path)
+              cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path,
+              chunk_store=chunk_store)
 
     return z
diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py
index e9565caa13..17821130eb 100644
--- a/zarr/hierarchy.py
+++ b/zarr/hierarchy.py
@@ -91,7 +91,6 @@ class Group(MutableMapping):
 
     def __init__(self, store, path=None, read_only=False, chunk_store=None,
                  cache_attrs=True, synchronizer=None):
-
         self._store = store
         self._chunk_store = chunk_store
         self._path = normalize_storage_path(path)
@@ -1059,12 +1058,13 @@ def group(store=None, overwrite=False, chunk_store=None,
                  cache_attrs=cache_attrs, synchronizer=synchronizer, path=path)
 
 
-def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None):
+def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None,
+               chunk_store=None):
     """Open a group using file-mode-like semantics.
 
     Parameters
     ----------
-    store : MutableMapping or string
+    store : MutableMapping or string, optional
         Store or path to directory in file system or name of zip file.
     mode : {'r', 'r+', 'a', 'w', 'w-'}, optional
         Persistence mode: 'r' means read only (must exist); 'r+' means
@@ -1079,6 +1079,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None):
         Array synchronizer.
     path : string, optional
         Group path within store.
+    chunk_store : MutableMapping or string, optional
+        Separate storage for chunks. If not provided, `store` will be used
+        for storage of both chunks and metadata.
 
     Returns
     -------
@@ -1102,6 +1104,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None):
 
     # handle polymorphic store arg
     store = _normalize_store_arg(store)
+    if chunk_store is not None:
+        chunk_store = _normalize_store_arg(chunk_store)
     path = normalize_storage_path(path)
 
     # ensure store is initialized
@@ -1113,13 +1117,13 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None):
             err_group_not_found(path)
 
     elif mode == 'w':
-        init_group(store, overwrite=True, path=path)
+        init_group(store, overwrite=True, path=path, chunk_store=chunk_store)
 
     elif mode == 'a':
         if contains_array(store, path=path):
             err_contains_array(path)
         if not contains_group(store, path=path):
-            init_group(store, path=path)
+            init_group(store, path=path, chunk_store=chunk_store)
 
     elif mode in ['w-', 'x']:
         if contains_array(store, path=path):
@@ -1127,10 +1131,10 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None):
         elif contains_group(store, path=path):
             err_contains_group(path)
         else:
-            init_group(store, path=path)
+            init_group(store, path=path, chunk_store=chunk_store)
 
     # determine read only status
     read_only = mode == 'r'
 
     return Group(store, read_only=read_only, cache_attrs=cache_attrs,
-                 synchronizer=synchronizer, path=path)
+                 synchronizer=synchronizer, path=path, chunk_store=chunk_store)
diff --git a/zarr/meta.py b/zarr/meta.py
index 291e5c6643..9ce580eff2 100644
--- a/zarr/meta.py
+++ b/zarr/meta.py
@@ -7,14 +7,14 @@
 import numpy as np
 
-from zarr.compat import PY2, binary_type
+from zarr.compat import PY2, binary_type, Mapping
 from zarr.errors import MetadataError
 
 
 ZARR_FORMAT = 2
 
 
-def _ensure_str(s):
+def ensure_str(s):
     if PY2:  # pragma: py3 no cover
         # noinspection PyUnresolvedReferences
         if isinstance(s, buffer):  # noqa
@@ -27,12 +27,40 @@
     return s
 
 
+def json_dumps(o):
+    """Write JSON in a consistent, human-readable way."""
+    return json.dumps(o, indent=4, sort_keys=True, ensure_ascii=True,
+                      separators=(',', ': '))
+
+
+def parse_metadata(s):
+
+    # Here we allow that a store may return an already-parsed metadata object,
+    # or a string of JSON that we will parse here. We allow for an already-parsed
+    # object to accommodate a consolidated metadata store, where all the metadata for
+    # all groups and arrays will already have been parsed from JSON.
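+    # For example, a plain store yields raw JSON such as b'{"zarr_format": 2}',
+    # whereas ConsolidatedMetadataStore yields an already-parsed dict.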
+
+    if isinstance(s, Mapping):
+        # assume metadata has already been parsed into a mapping object
+        meta = s
+
+    else:
+        # assume metadata needs to be parsed as JSON
+        s = ensure_str(s)
+        meta = json.loads(s)
+
+    return meta
+
+
 def decode_array_metadata(s):
-    s = _ensure_str(s)
-    meta = json.loads(s)
+    meta = parse_metadata(s)
+
+    # check metadata format
     zarr_format = meta.get('zarr_format', None)
     if zarr_format != ZARR_FORMAT:
         raise MetadataError('unsupported zarr format: %s' % zarr_format)
+
+    # extract array metadata fields
     try:
         dtype = decode_dtype(meta['dtype'])
         fill_value = decode_fill_value(meta['fill_value'], dtype)
@@ -67,8 +95,7 @@ def encode_array_metadata(meta):
         order=meta['order'],
         filters=meta['filters'],
     )
-    s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True,
-                   separators=(',', ': '))
+    s = json_dumps(meta)
     b = s.encode('ascii')
     return b
 
@@ -98,14 +125,14 @@ def decode_dtype(d):
 
 
 def decode_group_metadata(s):
-    s = _ensure_str(s)
-    meta = json.loads(s)
+    meta = parse_metadata(s)
+
+    # check metadata format version
     zarr_format = meta.get('zarr_format', None)
     if zarr_format != ZARR_FORMAT:
         raise MetadataError('unsupported zarr format: %s' % zarr_format)
-    meta = dict(
-        zarr_format=ZARR_FORMAT,
-    )
+
+    meta = dict(zarr_format=zarr_format)
     return meta
 
 
@@ -115,7 +142,7 @@ def encode_group_metadata(meta=None):
     meta = dict(
         zarr_format=ZARR_FORMAT,
     )
-    s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True)
+    s = json_dumps(meta)
     b = s.encode('ascii')
     return b
 
diff --git a/zarr/storage.py b/zarr/storage.py
index a945b1a932..6720b42d12 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -24,6 +24,7 @@
 import atexit
 import re
 import sys
+import json
 import multiprocessing
 from threading import Lock, RLock
 import glob
@@ -40,7 +41,7 @@
 from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
 from numcodecs.registry import codec_registry
 from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
-                         err_fspath_exists_notdir, err_read_only)
+                         err_fspath_exists_notdir, err_read_only, MetadataError)
 
 
 array_meta_key = '.zarray'
@@ -1892,3 +1893,82 @@ def __delitem__(self, key):
         with self._mutex:
             self._invalidate_keys()
             self._invalidate_value(key)
+
+
+class ConsolidatedMetadataStore(MutableMapping):
+    """A layer over other storage, where the metadata has been consolidated into
+    a single key.
+
+    The purpose of this class is to be able to get all of the metadata for
+    a given dataset in a single read operation from the underlying storage.
+    See :func:`zarr.convenience.consolidate_metadata` for how to create this
+    single metadata key.
+
+    This class loads from the one key, and stores the data in a dict, so that
+    accessing the keys no longer requires operations on the backend store.
+
+    This class is read-only, and attempts to change the dataset metadata will
+    fail, but changing the data is possible. If the backend storage is changed
+    directly, then the metadata stored here could become obsolete, and
+    :func:`zarr.convenience.consolidate_metadata` should be called again and this
+    class re-instantiated. The intended use case is write once, read many times.
+
+    .. versionadded:: 2.3
+
+    .. note:: This is an experimental feature.
+
+    Parameters
+    ----------
+    store : MutableMapping
+        Store containing the zarr dataset.
+    metadata_key : str
+        The key in the store where all of the consolidated metadata is
+        stored; the value is assumed to be JSON-encoded.
+
+    See Also
+    --------
+    zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated
+
+    """
+    def __init__(self, store, metadata_key='.zmetadata'):
+        self.store = store
+
+        # retrieve consolidated metadata
+        # (json.loads only accepts bytes from Python 3.6 onwards)
+        if sys.version_info.major == 3 and sys.version_info.minor < 6:
+            d = store[metadata_key].decode()  # pragma: no cover
+        else:  # pragma: no cover
+            d = store[metadata_key]
+        meta = json.loads(d)
+
+        # check format of consolidated metadata
+        consolidated_format = meta.get('zarr_consolidated_format', None)
+        if consolidated_format != 1:
+            raise MetadataError('unsupported zarr consolidated metadata format: %s' %
+                                consolidated_format)
+
+        # decode metadata
+        self.meta_store = meta['metadata']
+
+    def __getitem__(self, key):
+        return self.meta_store[key]
+
+    def __contains__(self, item):
+        return item in self.meta_store
+
+    def __iter__(self):
+        return iter(self.meta_store)
+
+    def __len__(self):
+        return len(self.meta_store)
+
+    def __delitem__(self, key):
+        err_read_only()
+
+    def __setitem__(self, key, value):
+        err_read_only()
+
+    def getsize(self, path):
+        return getsize(self.meta_store, path)
+
+    def listdir(self, path):
+        return listdir(self.meta_store, path)
diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py
index c77006c4f6..12bfab4a5a 100644
--- a/zarr/tests/test_convenience.py
+++ b/zarr/tests/test_convenience.py
@@ -4,6 +4,7 @@
 import atexit
 import os
 import unittest
+from numbers import Integral
 
 import numpy as np
 
@@ -12,11 +13,12 @@
 import pytest
 
-from zarr.convenience import open, save, save_group, load, copy_store, copy
-from zarr.storage import atexit_rmtree
+from zarr.convenience import (open, save, save_group, load, copy_store, copy,
+                              consolidate_metadata, open_consolidated)
+from zarr.storage import atexit_rmtree, DictStore, getsize, ConsolidatedMetadataStore
 from zarr.core import Array
 from zarr.hierarchy import Group, group
-from zarr.errors import CopyError
+from zarr.errors import CopyError, PermissionError
 
 
 def test_open_array():
@@ -91,6 +93,77 @@ def test_lazy_loader():
     assert_array_equal(bar, loader['bar'])
 
 
+def test_consolidate_metadata():
+
+    # setup initial data
+    store = DictStore()
+    z = group(store)
+    z.create_group('g1')
+    g2 = z.create_group('g2')
+    g2.attrs['hello'] = 'world'
+    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
+    assert 16 == arr.nchunks
+    assert 0 == arr.nchunks_initialized
+    arr.attrs['data'] = 1
+    arr[:] = 1.0
+    assert 16 == arr.nchunks_initialized
+
+    # perform consolidation
+    out = consolidate_metadata(store)
+    assert isinstance(out, Group)
+    assert '.zmetadata' in store
+    for key in ['.zgroup',
+                'g1/.zgroup',
+                'g2/.zgroup',
+                'g2/.zattrs',
+                'g2/arr/.zarray',
+                'g2/arr/.zattrs']:
+        del store[key]
+
+    # open consolidated
+    z2 = open_consolidated(store)
+    assert ['g1', 'g2'] == list(z2)
+    assert 'world' == z2.g2.attrs['hello']
+    assert 1 == z2.g2.arr.attrs['data']
+    assert (z2.g2.arr[:] == 1.0).all()
+    assert 16 == z2.g2.arr.nchunks
+    assert 16 == z2.g2.arr.nchunks_initialized
+
+    # tests del/write on the store
+    cmd = ConsolidatedMetadataStore(store)
+    with pytest.raises(PermissionError):
+        del cmd['.zgroup']
+    with pytest.raises(PermissionError):
+        cmd['.zgroup'] = None
+
+    # test getsize on the store
+    assert isinstance(getsize(cmd), Integral)
+
+    # test new metadata are not writeable
+    with pytest.raises(PermissionError):
+        z2.create_group('g3')
+    with pytest.raises(PermissionError):
+        z2.create_dataset('spam', shape=42, chunks=7, dtype='i4')
+    with pytest.raises(PermissionError):
+        del z2['g2']
+
+    # test consolidated metadata are not writeable
+    with pytest.raises(PermissionError):
+        z2.g2.attrs['hello'] = 'universe'
+    with pytest.raises(PermissionError):
+        z2.g2.arr.attrs['foo'] = 'bar'
+
+    # test the data are writeable
+    z2.g2.arr[:] = 2
+    assert (z2.g2.arr[:] == 2).all()
+
+    # test invalid modes
+    with pytest.raises(ValueError):
+        open_consolidated(store, mode='a')
+    with pytest.raises(ValueError):
+        open_consolidated(store, mode='w')
+
+
 class TestCopyStore(unittest.TestCase):
 
     def setUp(self):
diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py
index 304714991e..ef2232c234 100644
--- a/zarr/tests/test_creation.py
+++ b/zarr/tests/test_creation.py
@@ -3,6 +3,7 @@
 import tempfile
 import shutil
 import atexit
+import os.path
 
 import numpy as np
 
@@ -240,6 +241,14 @@ def test_open_array():
     assert isinstance(z, Array)
     assert 'foo/bar' == z.path
 
+    # with chunk store
+    meta_store = 'data/meta.zarr'
+    chunk_store = 'data/chunks.zarr'
+    z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode='w')
+    z[:] = 42
+    assert os.path.abspath(meta_store) == z.store.path
+    assert os.path.abspath(chunk_store) == z.chunk_store.path
+
 
 def test_empty_like():
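A minimal sketch of the end-to-end workflow these changes enable (illustrative only: the 'data/example.zarr' path is arbitrary, and only functions added or re-exported by this diff are used):

    import zarr

    # write once: create a small hierarchy, then consolidate all of its
    # '.zgroup'/'.zarray'/'.zattrs' objects under the single '.zmetadata' key
    store = zarr.DirectoryStore('data/example.zarr')
    root = zarr.group(store=store, overwrite=True)
    arr = root.create_dataset('g1/arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    arr[:] = 1.0
    zarr.consolidate_metadata(store)

    # read many times: open using only the consolidated key; the metadata is
    # read-only, but array data can still be updated in mode 'r+'
    root = zarr.open_consolidated(store, mode='r+')
    root['g1/arr'][:] = 2.0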