diff --git a/docs/api/storage.rst b/docs/api/storage.rst
index 04b0c899fd..2365359fa9 100644
--- a/docs/api/storage.rst
+++ b/docs/api/storage.rst
@@ -21,6 +21,18 @@ Storage (``zarr.storage``)
     .. automethod:: close
     .. automethod:: flush
 
+.. autoclass:: LRUStoreCache
+
+    .. automethod:: invalidate
+    .. automethod:: invalidate_values
+    .. automethod:: invalidate_keys
+
 .. autofunction:: init_array
 .. autofunction:: init_group
+.. autofunction:: contains_array
+.. autofunction:: contains_group
+.. autofunction:: listdir
+.. autofunction:: rmdir
+.. autofunction:: getsize
+.. autofunction:: rename
 .. autofunction:: migrate_1to2
diff --git a/docs/release.rst b/docs/release.rst
index eaa07d49c3..00e29297c6 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -127,6 +127,11 @@ Enhancements
 * **Added support for ``datetime64`` and ``timedelta64`` data types**;
   :issue:`85`, :issue:`215`.
 
+* **New LRUStoreCache class**. The class :class:`zarr.storage.LRUStoreCache` has been
+  added, providing a means to cache data locally in memory when the underlying store is
+  slow, e.g., a store that retrieves data from a remote server via the network;
+  :issue:`223`.
+
 * **New copy functions**. The new functions :func:`zarr.convenience.copy` and
   :func:`zarr.convenience.copy_all` provide a way to copy groups and/or arrays
   between HDF5 and Zarr, or between two Zarr groups. The
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 96a0911e94..0fd2f1b88d 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -729,6 +729,9 @@ group (requires `lmdb `_ to be installed)::
     >>> z[:] = 42
     >>> store.close()
 
+Distributed/cloud storage
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
 It is also possible to use distributed storage systems. The Dask project has
 implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map `_),
 Hadoop
@@ -767,6 +770,37 @@ Here is an example using S3Map to read an array created previously::
     >>> z[:].tostring()
     b'Hello from the cloud!'
 
+Note that retrieving data from a remote service via the network can be significantly
+slower than retrieving data from a local file system, and performance will depend on
+the network latency and bandwidth between the client and server systems. If you are
+experiencing poor performance, there are several things you can try. One option is to
+increase the array chunk size, which will reduce the number of chunks and thus the
+number of network round-trips required to retrieve data for an array (reducing the
+impact of network latency). Another option is to try to increase the compression ratio
+by changing compression options or trying a different compressor (reducing the impact
+of limited network bandwidth). As of version 2.2, Zarr also provides the
+:class:`zarr.storage.LRUStoreCache` class, which can be used to implement a local
+in-memory cache layer over a remote store. E.g.::
+
+    >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))
+    >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)
+    >>> cache = zarr.LRUStoreCache(store, max_size=2**28)
+    >>> root = zarr.group(store=cache)
+    >>> z = root['foo/bar/baz']
+    >>> from timeit import timeit
+    >>> # first data access is relatively slow, retrieved from store
+    ... timeit('print(z[:].tostring())', number=1, globals=globals())  # doctest: +SKIP
+    b'Hello from the cloud!'
+    0.1081731989979744
+    >>> # second data access is faster, uses cache
+    ... timeit('print(z[:].tostring())', number=1, globals=globals())  # doctest: +SKIP
+    b'Hello from the cloud!'
+    0.0009490990014455747
+
+If you are still experiencing poor performance with distributed/cloud storage, please
+raise an issue on the GitHub issue tracker with any profiling data you can provide, as
+there may be opportunities to optimise further either within Zarr or within the mapping
+interface to the storage.
 
 .. _tutorial_copy:
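The S3 example above requires network access. The same caching behaviour can be
exercised against a local store, which is handy for experimentation; a minimal sketch
(the ``data/example.zarr`` path and the hit/miss printout are illustrative, not part of
the patch):

    import zarr
    from zarr.storage import DirectoryStore, LRUStoreCache

    # wrap any existing store with an in-memory LRU cache (here up to 2**28 bytes)
    store = DirectoryStore('data/example.zarr')  # hypothetical path
    cache = LRUStoreCache(store, max_size=2**28)
    root = zarr.group(store=cache, overwrite=True)
    z = root.zeros('foo', shape=(10000,), chunks=(1000,), dtype='i4')
    z[:] = 42

    # repeated reads are served from the values cache where possible
    _ = z[:]
    _ = z[:]
    print(cache.hits, cache.misses)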
diff --git a/zarr/__init__.py b/zarr/__init__.py
index 5368897482..56d060fdac 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -7,7 +7,7 @@
 from zarr.creation import (empty, zeros, ones, full, array, empty_like, zeros_like,
                            ones_like, full_like, open_array, open_like, create)
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
-                          NestedDirectoryStore, DBMStore, LMDBStore)
+                          NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache)
 from zarr.hierarchy import group, open_group, Group
 from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.codecs import *
diff --git a/zarr/compat.py b/zarr/compat.py
index 38ab245f58..9be3384123 100644
--- a/zarr/compat.py
+++ b/zarr/compat.py
@@ -16,9 +16,16 @@ class PermissionError(Exception):
         pass
 
+    def OrderedDict_move_to_end(od, key):
+        od[key] = od.pop(key)
+
+
 else:  # pragma: py2 no cover
     text_type = str
     binary_type = bytes
     from functools import reduce
     PermissionError = PermissionError
+
+    def OrderedDict_move_to_end(od, key):
+        od.move_to_end(key)
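The PY2 shim above relies on the fact that popping a key and reassigning it moves that
key to the end of an ``OrderedDict``, matching the PY3 ``move_to_end`` behaviour that
the cache uses for LRU ordering. A quick illustration (standard library only):

    from collections import OrderedDict

    od = OrderedDict([('a', 1), ('b', 2), ('c', 3)])
    od['a'] = od.pop('a')          # equivalent of od.move_to_end('a') on PY2
    assert list(od) == ['b', 'c', 'a']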
diff --git a/zarr/storage.py b/zarr/storage.py
index 4d25e6a634..7a5273c044 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -5,9 +5,18 @@
 :mod:`collections` module in the Python standard library can be used as a Zarr
 array store, as long as it accepts string (str) keys and bytes values.
 
+In addition to the :class:`MutableMapping` interface, store classes may also implement
+optional methods `listdir` (list members of a "directory") and `rmdir` (remove all
+members of a "directory"). These methods should be implemented if the store class is
+aware of the hierarchical organisation of resources within the store and can provide
+efficient implementations. If these methods are not available, Zarr will fall back to
+slower implementations that work via the :class:`MutableMapping` interface. Store
+classes may also optionally implement a `rename` method (rename all members under a
+given path) and a `getsize` method (return the size in bytes of a given value).
+
 """
 from __future__ import absolute_import, print_function, division
-from collections import MutableMapping
+from collections import MutableMapping, OrderedDict
 import os
 import tempfile
 import zipfile
@@ -28,10 +37,10 @@
                        normalize_storage_path, buffer_size,
                        normalize_fill_value, nolock, normalize_dtype)
 from zarr.meta import encode_array_metadata, encode_group_metadata
-from zarr.compat import PY2, binary_type
+from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
 from numcodecs.registry import codec_registry
-from zarr.errors import (err_contains_group, err_contains_array, err_path_not_found,
-                         err_bad_compressor, err_fspath_exists_notdir, err_read_only)
+from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
+                         err_fspath_exists_notdir, err_read_only)
 
 
 array_meta_key = '.zarray'
@@ -80,7 +89,9 @@ def _rmdir_from_keys(store, path=None):
 
 
 def rmdir(store, path=None):
-    """Remove all items under the given path."""
+    """Remove all items under the given path. If `store` provides a `rmdir` method,
+    this will be called, otherwise Zarr will fall back to an implementation via the
+    `MutableMapping` interface."""
     path = normalize_storage_path(path)
     if hasattr(store, 'rmdir'):
         # pass through
@@ -101,7 +112,9 @@ def _rename_from_keys(store, src_path, dst_path):
 
 
 def rename(store, src_path, dst_path):
-    """Rename all items under the given path."""
+    """Rename all items under the given path. If `store` provides a `rename` method,
+    this will be called, otherwise Zarr will fall back to an implementation via the
+    `MutableMapping` interface."""
    src_path = normalize_storage_path(src_path)
    dst_path = normalize_storage_path(dst_path)
    if hasattr(store, 'rename'):
@@ -125,7 +138,9 @@ def _listdir_from_keys(store, path=None):
 
 
 def listdir(store, path=None):
-    """Obtain a directory listing for the given path."""
+    """Obtain a directory listing for the given path. If `store` provides a `listdir`
+    method, this will be called, otherwise Zarr will fall back to an implementation
+    via the `MutableMapping` interface."""
     path = normalize_storage_path(path)
     if hasattr(store, 'listdir'):
         # pass through
@@ -136,25 +151,31 @@
 def getsize(store, path=None):
-    """Compute size of stored items for a given path."""
+    """Compute size of stored items for a given path. If `store` provides a `getsize`
+    method, this will be called; for plain ``dict`` stores the size is computed from
+    the values, otherwise -1 is returned."""
     path = normalize_storage_path(path)
     if hasattr(store, 'getsize'):
         # pass through
         return store.getsize(path)
     elif isinstance(store, dict):
         # compute from size of values
-        prefix = _path_to_prefix(path)
-        size = 0
-        for k in listdir(store, path):
-            try:
-                v = store[prefix + k]
-            except KeyError:
-                pass
-            else:
+        if path in store:
+            v = store[path]
+            size = buffer_size(v)
+        else:
+            members = listdir(store, path)
+            prefix = _path_to_prefix(path)
+            size = 0
+            for k in members:
                 try:
-                    size += buffer_size(v)
-                except TypeError:
-                    return -1
+                    v = store[prefix + k]
+                except KeyError:
+                    pass
+                else:
+                    try:
+                        size += buffer_size(v)
+                    except TypeError:
+                        return -1
         return size
     else:
         return -1
@@ -610,16 +631,21 @@ def getsize(self, path=None):
         path = normalize_storage_path(path)
 
         # obtain value to return size of
-        value = self.root
+        value = None
         if path:
             try:
                 parent, key = self._get_parent(path)
                 value = parent[key]
             except KeyError:
-                err_path_not_found(path)
+                pass
+        else:
+            value = self.root
 
         # obtain size of value
-        if isinstance(value, self.cls):
+        if value is None:
+            return 0
+
+        elif isinstance(value, self.cls):
             # total size for directory
             size = 0
             for v in value.values():
@@ -629,6 +655,7 @@ def getsize(self, path=None):
                 except TypeError:
                     return -1
             return size
+
         else:
             try:
                 return buffer_size(value)
@@ -843,7 +870,7 @@ def getsize(self, path=None):
                     size += os.path.getsize(child_fs_path)
             return size
         else:
-            err_path_not_found(path)
+            return 0
 
     def clear(self):
         shutil.rmtree(self.path)
@@ -857,6 +884,7 @@ def atexit_rmtree(path,
         rmtree(path)
 
 
+# noinspection PyShadowingNames
 def atexit_rmglob(path,
                   glob=glob.glob,
                   isdir=os.path.isdir,
@@ -1230,7 +1258,7 @@ def getsize(self, path=None):
             info = self.zf.getinfo(path)
             return info.compress_size
         except KeyError:
-            err_path_not_found(path)
+            return 0
         else:
             return 0
@@ -1676,3 +1704,185 @@ def __iter__(self):
 
     def __len__(self):
         return self.db.stat()['entries']
+
+
+class LRUStoreCache(MutableMapping):
+    """Storage class that implements a least-recently-used (LRU) cache layer over
+    some other store. Intended primarily for use with stores that can be slow to
+    access, e.g., remote stores that require network communication to store and
+    retrieve data.
+
+    Parameters
+    ----------
+    store : MutableMapping
+        The store containing the actual data to be cached.
+    max_size : int
+        The maximum size that the cache may grow to, in number of bytes. Provide `None`
+        if you would like the cache to have unlimited size.
+
+    Examples
+    --------
+    The example below wraps an S3 store with an LRU cache::
+
+        >>> import s3fs
+        >>> import zarr
+        >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))
+        >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)
+        >>> cache = zarr.LRUStoreCache(store, max_size=2**28)
+        >>> root = zarr.group(store=cache)
+        >>> z = root['foo/bar/baz']
+        >>> from timeit import timeit
+        >>> # first data access is relatively slow, retrieved from store
+        ... timeit('print(z[:].tostring())', number=1, globals=globals())  # doctest: +SKIP
+        b'Hello from the cloud!'
+        0.1081731989979744
+        >>> # second data access is faster, uses cache
+        ... timeit('print(z[:].tostring())', number=1, globals=globals())  # doctest: +SKIP
+        b'Hello from the cloud!'
+        0.0009490990014455747
+
+    """
+
+    def __init__(self, store, max_size):
+        self._store = store
+        self._max_size = max_size
+        self._current_size = 0
+        self._keys_cache = None
+        self._contains_cache = None
+        self._listdir_cache = dict()
+        self._values_cache = OrderedDict()
+        self._mutex = Lock()
+        self.hits = self.misses = 0
+
+    def __getstate__(self):
+        return (self._store, self._max_size, self._current_size, self._keys_cache,
+                self._contains_cache, self._listdir_cache, self._values_cache, self.hits,
+                self.misses)
+
+    def __setstate__(self, state):
+        (self._store, self._max_size, self._current_size, self._keys_cache,
+         self._contains_cache, self._listdir_cache, self._values_cache, self.hits,
+         self.misses) = state
+        self._mutex = Lock()
+
+    def __len__(self):
+        return len(self._keys())
+
+    def __iter__(self):
+        return self.keys()
+
+    def __contains__(self, key):
+        with self._mutex:
+            if self._contains_cache is None:
+                self._contains_cache = set(self._keys())
+            return key in self._contains_cache
+
+    def clear(self):
+        self._store.clear()
+        self.invalidate()
+
+    def keys(self):
+        with self._mutex:
+            return iter(self._keys())
+
+    def _keys(self):
+        if self._keys_cache is None:
+            self._keys_cache = list(self._store.keys())
+        return self._keys_cache
+
+    def listdir(self, path=None):
+        with self._mutex:
+            try:
+                return self._listdir_cache[path]
+            except KeyError:
+                listing = listdir(self._store, path)
+                self._listdir_cache[path] = listing
+                return listing
+
+    def getsize(self, path=None):
+        return getsize(self._store, path=path)
+
+    def _pop_value(self):
+        # remove the first value from the cache, as this will be the least recently
+        # used value
+        _, v = self._values_cache.popitem(last=False)
+        return v
+
+    def _accommodate_value(self, value_size):
+        if self._max_size is None:
+            return
+        # ensure there is enough space in the cache for a new value
+        while self._current_size + value_size > self._max_size:
+            v = self._pop_value()
+            self._current_size -= buffer_size(v)
+
+    def _cache_value(self, key, value):
+        # cache a value
+        value_size = buffer_size(value)
+        # check the size of the value against the max size; if the value itself
+        # exceeds the max size then we are never going to cache it
+        if self._max_size is None or value_size <= self._max_size:
+            self._accommodate_value(value_size)
+            self._values_cache[key] = value
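+            # note: the running byte total below is what _accommodate_value consults
+            # when deciding how many least-recently-used entries to evict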
+            self._current_size += value_size
+
+    def invalidate(self):
+        """Completely clear the cache."""
+        with self._mutex:
+            self._values_cache.clear()
+            self._invalidate_keys()
+
+    def invalidate_values(self):
+        """Clear the values cache."""
+        with self._mutex:
+            self._values_cache.clear()
+
+    def invalidate_keys(self):
+        """Clear the keys cache."""
+        with self._mutex:
+            self._invalidate_keys()
+
+    def _invalidate_keys(self):
+        self._keys_cache = None
+        self._contains_cache = None
+        self._listdir_cache.clear()
+
+    def _invalidate_value(self, key):
+        if key in self._values_cache:
+            value = self._values_cache.pop(key)
+            self._current_size -= buffer_size(value)
+
+    def __getitem__(self, key):
+        try:
+            # first try to obtain the value from the cache
+            with self._mutex:
+                value = self._values_cache[key]
+                # cache hit if no KeyError is raised
+                self.hits += 1
+                # treat the end as most recently used
+                OrderedDict_move_to_end(self._values_cache, key)
+
+        except KeyError:
+            # cache miss, retrieve value from the store
+            value = self._store[key]
+            with self._mutex:
+                self.misses += 1
+                # need to check if key is not in the cache, as it may have been cached
+                # while we were retrieving the value from the store
+                if key not in self._values_cache:
+                    self._cache_value(key, value)
+
+        return value
+
+    def __setitem__(self, key, value):
+        self._store[key] = value
+        with self._mutex:
+            self._invalidate_keys()
+            self._invalidate_value(key)
+            self._cache_value(key, value)
+
+    def __delitem__(self, key):
+        del self._store[key]
+        with self._mutex:
+            self._invalidate_keys()
+            self._invalidate_value(key)
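One consequence of the ``__getstate__``/``__setstate__`` pair above is that the cache
remains picklable even though locks are not: the mutex is omitted from the pickled
state and recreated on unpickling. A minimal sketch of this behaviour (assumes zarr
with this patch applied):

    import pickle
    import zarr

    cache = zarr.LRUStoreCache(dict(), max_size=None)
    cache['foo'] = b'xxx'
    clone = pickle.loads(pickle.dumps(cache))
    assert clone['foo'] == b'xxx'  # cached state survives the round-trip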
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index c50328422e..3867befdec 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -18,7 +18,8 @@
 from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore,
-                          DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob)
+                          DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob,
+                          LRUStoreCache)
 from zarr.core import Array
 from zarr.errors import PermissionError
 from zarr.compat import PY2, text_type, binary_type
@@ -1615,3 +1616,13 @@ def test_cache_metadata(self):
     def test_object_arrays_danger(self):
         # skip this one as it only works if metadata are cached
         pass
+
+
+class TestArrayWithStoreCache(TestArray):
+
+    @staticmethod
+    def create_array(read_only=False, **kwargs):
+        store = LRUStoreCache(dict(), max_size=None)
+        kwargs.setdefault('compressor', Zlib(level=1))
+        init_array(store, **kwargs)
+        return Array(store, read_only=read_only)
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index 2273568a0e..e19472d214 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -20,7 +20,8 @@
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array,
                           array_meta_key, group_meta_key, atexit_rmtree,
-                          NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob)
+                          NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob,
+                          LRUStoreCache)
 from zarr.core import Array
 from zarr.compat import PY2, text_type
 from zarr.hierarchy import Group, group, open_group
@@ -945,6 +946,14 @@ def test_chunk_store(self):
         eq(expect, actual)
 
 
+class TestGroupWithStoreCache(TestGroup):
+
+    @staticmethod
+    def create_store():
+        store = LRUStoreCache(dict(), max_size=None)
+        return store, None
+
+
 def test_group():
 
     # test the group() convenience function
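The behavioural change exercised by the test updates below: ``getsize`` now reports 0
for non-existent paths instead of raising an error, and the module-level function
computes sizes for plain ``dict`` stores, including direct keys. A sketch of the new
semantics (assumes this patch applied):

    from zarr.storage import getsize

    store = dict()
    store['a/b'] = b'xxx'
    assert getsize(store, 'a') == 3      # directory size computed from members
    assert getsize(store, 'a/b') == 3    # direct keys are now handled
    assert getsize(store, 'x') == 0      # missing path reports 0, no error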
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index d5c7c1e067..ec8443e778 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -14,19 +14,21 @@
 from numpy.testing import assert_array_equal, assert_array_almost_equal
 from nose import SkipTest
 from nose.tools import assert_raises, eq_ as eq, assert_is_none
+import pytest
 
 from zarr.storage import (init_array, array_meta_key, attrs_key, DictStore,
                           DirectoryStore, ZipStore, init_group, group_meta_key,
                           getsize, migrate_1to2, TempStore, atexit_rmtree,
                           NestedDirectoryStore, default_compressor, DBMStore,
-                          LMDBStore, atexit_rmglob)
+                          LMDBStore, atexit_rmglob, LRUStoreCache)
 from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT,
                        decode_group_metadata, encode_group_metadata)
 from zarr.compat import PY2
 from zarr.codecs import Zlib, Blosc, BZ2
 from zarr.errors import PermissionError
 from zarr.hierarchy import group
+from zarr.tests.util import CountingDict
 
 
 class StoreTests(object):
@@ -139,23 +141,23 @@ def test_pickle(self):
 
     def test_getsize(self):
         store = self.create_store()
-        if hasattr(store, 'getsize'):
-            eq(0, store.getsize())
+        if isinstance(store, dict) or hasattr(store, 'getsize'):
+            eq(0, getsize(store))
             store['foo'] = b'x'
-            eq(1, store.getsize())
-            eq(1, store.getsize('foo'))
+            eq(1, getsize(store))
+            eq(1, getsize(store, 'foo'))
             store['bar'] = b'yy'
-            eq(3, store.getsize())
-            eq(2, store.getsize('bar'))
+            eq(3, getsize(store))
+            eq(2, getsize(store, 'bar'))
             store['baz'] = bytearray(b'zzz')
-            eq(6, store.getsize())
-            eq(3, store.getsize('baz'))
+            eq(6, getsize(store))
+            eq(3, getsize(store, 'baz'))
             store['quux'] = array.array('B', b'zzzz')
-            eq(10, store.getsize())
-            eq(4, store.getsize('quux'))
+            eq(10, getsize(store))
+            eq(4, getsize(store, 'quux'))
             store['spong'] = np.frombuffer(b'zzzzz', dtype='u1')
-            eq(15, store.getsize())
-            eq(5, store.getsize('spong'))
+            eq(15, getsize(store))
+            eq(5, getsize(store, 'spong'))
 
     # noinspection PyStatementEffect
     def test_hierarchy(self):
@@ -197,18 +199,13 @@ def test_hierarchy(self):
             eq(6, store.getsize('c/e'))
             eq(3, store.getsize('c/e/f'))
             eq(3, store.getsize('c/e/g'))
-            with assert_raises(ValueError):
-                store.getsize('x')
-            with assert_raises(ValueError):
-                store.getsize('a/x')
-            with assert_raises(ValueError):
-                store.getsize('c/x')
-            with assert_raises(ValueError):
-                store.getsize('c/x/y')
-            with assert_raises(ValueError):
-                store.getsize('c/d/y')
-            with assert_raises(ValueError):
-                store.getsize('c/d/y/z')
+            # non-existent paths
+            eq(0, store.getsize('x'))
+            eq(0, store.getsize('a/x'))
+            eq(0, store.getsize('c/x'))
+            eq(0, store.getsize('c/x/y'))
+            eq(0, store.getsize('c/d/y'))
+            eq(0, store.getsize('c/d/y/z'))
 
         # test listdir (optional)
         if hasattr(store, 'listdir'):
@@ -846,6 +843,230 @@ def test_context_manager(self):
             eq(2, len(store))
 
 
+class TestLRUStoreCache(StoreTests, unittest.TestCase):
+
+    def create_store(self):
+        return LRUStoreCache(dict(), max_size=2**27)
+
+    def test_cache_values_no_max_size(self):
+
+        # setup store
+        store = CountingDict()
+        store['foo'] = b'xxx'
+        store['bar'] = b'yyy'
+        assert 0 == store.counter['__getitem__', 'foo']
+        assert 1 == store.counter['__setitem__', 'foo']
+        assert 0 == store.counter['__getitem__', 'bar']
+        assert 1 == store.counter['__setitem__', 'bar']
+
+        # setup cache
+        cache = LRUStoreCache(store, max_size=None)
+        assert 0 == cache.hits
+        assert 0 == cache.misses
+
+        # test first __getitem__, cache miss
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 1 == store.counter['__setitem__', 'foo']
+        assert 0 == cache.hits
+        assert 1 == cache.misses
+
+        # test second __getitem__, cache hit
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 1 == store.counter['__setitem__', 'foo']
+        assert 1 == cache.hits
+        assert 1 == cache.misses
+
+        # test __setitem__, __getitem__
+        cache['foo'] = b'zzz'
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 2 == store.counter['__setitem__', 'foo']
+        # should be a cache hit
+        assert b'zzz' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 2 == store.counter['__setitem__', 'foo']
+        assert 2 == cache.hits
+        assert 1 == cache.misses
+
+        # manually invalidate all cached values
+        cache.invalidate_values()
+        assert b'zzz' == cache['foo']
+        assert 2 == store.counter['__getitem__', 'foo']
+        assert 2 == store.counter['__setitem__', 'foo']
+        cache.invalidate()
+        assert b'zzz' == cache['foo']
+        assert 3 == store.counter['__getitem__', 'foo']
+        assert 2 == store.counter['__setitem__', 'foo']
+
+        # test __delitem__
+        del cache['foo']
+        with pytest.raises(KeyError):
+            # noinspection PyStatementEffect
+            cache['foo']
+        with pytest.raises(KeyError):
+            # noinspection PyStatementEffect
+            store['foo']
+
+        # verify other keys untouched
+        assert 0 == store.counter['__getitem__', 'bar']
+        assert 1 == store.counter['__setitem__', 'bar']
+
+    def test_cache_values_with_max_size(self):
+
+        # setup store
+        store = CountingDict()
+        store['foo'] = b'xxx'
+        store['bar'] = b'yyy'
+        assert 0 == store.counter['__getitem__', 'foo']
+        assert 0 == store.counter['__getitem__', 'bar']
+        # setup cache - can only hold one item
+        cache = LRUStoreCache(store, max_size=5)
+        assert 0 == cache.hits
+        assert 0 == cache.misses
+
+        # test first 'foo' __getitem__, cache miss
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 0 == cache.hits
+        assert 1 == cache.misses
+
+        # test second 'foo' __getitem__, cache hit
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 1 == cache.hits
+        assert 1 == cache.misses
+
+        # test first 'bar' __getitem__, cache miss
+        assert b'yyy' == cache['bar']
+        assert 1 == store.counter['__getitem__', 'bar']
+        assert 1 == cache.hits
+        assert 2 == cache.misses
+
+        # test second 'bar' __getitem__, cache hit
+        assert b'yyy' == cache['bar']
+        assert 1 == store.counter['__getitem__', 'bar']
+        assert 2 == cache.hits
+        assert 2 == cache.misses
+
+        # test 'foo' __getitem__, should have been evicted, cache miss
+        assert b'xxx' == cache['foo']
+        assert 2 == store.counter['__getitem__', 'foo']
+        assert 2 == cache.hits
+        assert 3 == cache.misses
+
+        # test 'bar' __getitem__, should have been evicted, cache miss
+        assert b'yyy' == cache['bar']
+        assert 2 == store.counter['__getitem__', 'bar']
+        assert 2 == cache.hits
+        assert 4 == cache.misses
+
+        # setup store
+        store = CountingDict()
+        store['foo'] = b'xxx'
+        store['bar'] = b'yyy'
+        assert 0 == store.counter['__getitem__', 'foo']
+        assert 0 == store.counter['__getitem__', 'bar']
+        # setup cache - can hold two items
+        cache = LRUStoreCache(store, max_size=6)
+        assert 0 == cache.hits
+        assert 0 == cache.misses
+
+        # test first 'foo' __getitem__, cache miss
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 0 == cache.hits
+        assert 1 == cache.misses
+
+        # test second 'foo' __getitem__, cache hit
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 1 == cache.hits
+        assert 1 == cache.misses
+
+        # test first 'bar' __getitem__, cache miss
+        assert b'yyy' == cache['bar']
+        assert 1 == store.counter['__getitem__', 'bar']
+        assert 1 == cache.hits
+        assert 2 == cache.misses
+
+        # test second 'bar' __getitem__, cache hit
+        assert b'yyy' == cache['bar']
+        assert 1 == store.counter['__getitem__', 'bar']
+        assert 2 == cache.hits
+        assert 2 == cache.misses
+
+        # test 'foo' __getitem__, should still be cached
+        assert b'xxx' == cache['foo']
+        assert 1 == store.counter['__getitem__', 'foo']
+        assert 3 == cache.hits
+        assert 2 == cache.misses
+
+        # test 'bar' __getitem__, should still be cached
+        assert b'yyy' == cache['bar']
+        assert 1 == store.counter['__getitem__', 'bar']
+        assert 4 == cache.hits
+        assert 2 == cache.misses
+
+    def test_cache_keys(self):
+
+        # setup
+        store = CountingDict()
+        store['foo'] = b'xxx'
+        store['bar'] = b'yyy'
+        assert 0 == store.counter['__contains__', 'foo']
+        assert 0 == store.counter['__iter__']
+        assert 0 == store.counter['keys']
+        cache = LRUStoreCache(store, max_size=None)
+
+        # keys should be cached on first call
+        keys = sorted(cache.keys())
+        assert keys == ['bar', 'foo']
+        assert 1 == store.counter['keys']
+        # keys should now be cached
+        assert keys == sorted(cache.keys())
+        assert 1 == store.counter['keys']
+        assert 'foo' in cache
+        assert 0 == store.counter['__contains__', 'foo']
+        assert keys == sorted(cache)
+        assert 0 == store.counter['__iter__']
+        assert 1 == store.counter['keys']
+
+        # cache should be cleared if store is modified - crude but simple for now
+        cache['baz'] = b'zzz'
+        keys = sorted(cache.keys())
+        assert keys == ['bar', 'baz', 'foo']
+        assert 2 == store.counter['keys']
+        # keys should now be cached
+        assert keys == sorted(cache.keys())
+        assert 2 == store.counter['keys']
+
+        # manually invalidate keys
+        cache.invalidate_keys()
+        keys = sorted(cache.keys())
+        assert keys == ['bar', 'baz', 'foo']
+        assert 3 == store.counter['keys']
+        assert 0 == store.counter['__contains__', 'foo']
+        assert 0 == store.counter['__iter__']
+        cache.invalidate_keys()
+        keys = sorted(cache)
+        assert keys == ['bar', 'baz', 'foo']
+        assert 4 == store.counter['keys']
+        assert 0 == store.counter['__contains__', 'foo']
+        assert 0 == store.counter['__iter__']
+        cache.invalidate_keys()
+        assert 'foo' in cache
+        assert 5 == store.counter['keys']
+        assert 0 == store.counter['__contains__', 'foo']
+        assert 0 == store.counter['__iter__']
+
+        # check these would get counted if called directly
+        assert 'foo' in store
+        assert 1 == store.counter['__contains__', 'foo']
+        assert keys == sorted(store)
+        assert 1 == store.counter['__iter__']
+
+
 def test_getsize():
     store = dict()
     store['foo'] = b'aaa'
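The ``CountingDict`` test helper (defined in the new file below) is what makes the
cache assertions above possible: it records how many times each mapping operation is
invoked, so the tests can prove that cache hits never touch the underlying store.
Typical usage, as a sketch:

    from zarr.tests.util import CountingDict

    store = CountingDict()
    store['foo'] = b'xxx'
    _ = store['foo']
    assert store.counter['__setitem__', 'foo'] == 1
    assert store.counter['__getitem__', 'foo'] == 1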
diff --git a/zarr/tests/util.py b/zarr/tests/util.py
new file mode 100644
index 0000000000..def9c61132
--- /dev/null
+++ b/zarr/tests/util.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+import collections
+
+
+class CountingDict(collections.MutableMapping):
+
+    def __init__(self):
+        self.wrapped = dict()
+        self.counter = collections.Counter()
+
+    def __len__(self):
+        self.counter['__len__'] += 1
+        return len(self.wrapped)
+
+    def keys(self):
+        self.counter['keys'] += 1
+        return self.wrapped.keys()
+
+    def __iter__(self):
+        self.counter['__iter__'] += 1
+        return iter(self.wrapped)
+
+    def __contains__(self, item):
+        self.counter['__contains__', item] += 1
+        return item in self.wrapped
+
+    def __getitem__(self, item):
+        self.counter['__getitem__', item] += 1
+        return self.wrapped[item]
+
+    def __setitem__(self, key, value):
+        self.counter['__setitem__', key] += 1
+        self.wrapped[key] = value
+
+    def __delitem__(self, key):
+        self.counter['__delitem__', key] += 1
+        del self.wrapped[key]
diff --git a/zarr/util.py b/zarr/util.py
index 3b43e3a0d6..aedff4ab94 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -318,7 +318,10 @@ def buffer_size(v):
         return v.buffer_info()[1] * v.itemsize
     else:  # pragma: py2 no cover
         v = memoryview(v)
-        return reduce(operator.mul, v.shape) * v.itemsize
+        if v.shape:
+            return reduce(operator.mul, v.shape) * v.itemsize
+        else:
+            return v.itemsize
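The ``buffer_size`` fix above handles 0-dimensional buffers: ``memoryview(v).shape``
is ``()`` for a scalar, and ``reduce(operator.mul, ())`` raises ``TypeError``, so the
function now falls back to ``itemsize``. A quick check (assumes numpy is installed):

    import numpy as np
    from zarr.util import buffer_size

    assert buffer_size(np.zeros((), dtype='i4')) == 4   # 0-d array -> itemsize
    assert buffer_size(np.zeros(3, dtype='i4')) == 12   # unchanged for nd arrays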