diff --git a/docs/api/storage.rst b/docs/api/storage.rst
index 04b0c899fd..2365359fa9 100644
--- a/docs/api/storage.rst
+++ b/docs/api/storage.rst
@@ -21,6 +21,18 @@ Storage (``zarr.storage``)
.. automethod:: close
.. automethod:: flush
+.. autoclass:: LRUStoreCache
+
+ .. automethod:: invalidate
+ .. automethod:: invalidate_values
+ .. automethod:: invalidate_keys
+
.. autofunction:: init_array
.. autofunction:: init_group
+.. autofunction:: contains_array
+.. autofunction:: contains_group
+.. autofunction:: listdir
+.. autofunction:: rmdir
+.. autofunction:: getsize
+.. autofunction:: rename
.. autofunction:: migrate_1to2
diff --git a/docs/release.rst b/docs/release.rst
index eaa07d49c3..00e29297c6 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -127,6 +127,11 @@ Enhancements
* **Added support for ``datetime64`` and ``timedelta64`` data types**;
:issue:`85`, :issue:`215`.
+* **New LRUStoreCache class**. The class :class:`zarr.storage.LRUStoreCache` has been
+ added and provides a means to locally cache data in memory from a store that may be
+ slow, e.g., a store that retrieves data from a remote server via the network;
+ :issue:`223`.
+
* **New copy functions**. The new functions :func:`zarr.convenience.copy` and
:func:`zarr.convenience.copy_all` provide a way to copy groups and/or arrays
between HDF5 and Zarr, or between two Zarr groups. The
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 96a0911e94..0fd2f1b88d 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -729,6 +729,9 @@ group (requires `lmdb `_ to be installed)::
>>> z[:] = 42
>>> store.close()
+Distributed/cloud storage
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
It is also possible to use distributed storage systems. The Dask project has
implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map
`_), Hadoop
@@ -767,6 +770,37 @@ Here is an example using S3Map to read an array created previously::
>>> z[:].tostring()
b'Hello from the cloud!'
+Note that retrieving data from a remote service via the network can be significantly
+slower than retrieving data from a local file system, and will depend on network latency
+and bandwidth between the client and server systems. If you are experiencing poor
+performance, there are several things you can try. One option is to increase the array
+chunk size, which will reduce the number of chunks and thus the number of network
+round-trips required to retrieve data for an array (reducing the impact of network
+latency). Another option is to try to increase the compression ratio by changing
+compression options or trying a different compressor (reducing the impact of limited
+network bandwidth). As of version 2.2, Zarr also provides the
+:class:`zarr.storage.LRUStoreCache` class, which can be used to implement a local
+in-memory cache layer over a remote store, e.g.::
+
+ >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))
+ >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)
+ >>> cache = zarr.LRUStoreCache(store, max_size=2**28)
+ >>> root = zarr.group(store=cache)
+ >>> z = root['foo/bar/baz']
+ >>> from timeit import timeit
+ >>> # first data access is relatively slow, retrieved from store
+ ... timeit('print(z[:].tostring())', number=1, globals=globals()) # doctest: +SKIP
+ b'Hello from the cloud!'
+ 0.1081731989979744
+ >>> # second data access is faster, uses cache
+ ... timeit('print(z[:].tostring())', number=1, globals=globals()) # doctest: +SKIP
+ b'Hello from the cloud!'
+ 0.0009490990014455747
+
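+The cache also counts ``hits`` and ``misses``, which can help when choosing a value
+for ``max_size``. If the underlying store may be modified by another process or
+client, the cached data can also be dropped manually, e.g.::
+
+    >>> cache.invalidate()         # drop all cached keys and values
+    >>> cache.invalidate_values()  # drop cached values only
+    >>> cache.invalidate_keys()    # drop cached keys and directory listings only
+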
+If you are still experiencing poor performance with distributed/cloud storage, please
+raise an issue on the GitHub issue tracker with any profiling data you can provide, as
+there may be opportunities to optimise further, either within Zarr or within the
+mapping interface to the storage.
.. _tutorial_copy:
diff --git a/zarr/__init__.py b/zarr/__init__.py
index 5368897482..56d060fdac 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -7,7 +7,7 @@
from zarr.creation import (empty, zeros, ones, full, array, empty_like, zeros_like,
ones_like, full_like, open_array, open_like, create)
from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
- NestedDirectoryStore, DBMStore, LMDBStore)
+ NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache)
from zarr.hierarchy import group, open_group, Group
from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
from zarr.codecs import *
diff --git a/zarr/compat.py b/zarr/compat.py
index 38ab245f58..9be3384123 100644
--- a/zarr/compat.py
+++ b/zarr/compat.py
@@ -16,9 +16,16 @@
class PermissionError(Exception):
pass
+ def OrderedDict_move_to_end(od, key):
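+        # Python 2's OrderedDict has no move_to_end() method; popping and
+        # re-inserting the key moves it to the end of the iteration order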
+ od[key] = od.pop(key)
+
+
else: # pragma: py2 no cover
text_type = str
binary_type = bytes
from functools import reduce
PermissionError = PermissionError
+
+ def OrderedDict_move_to_end(od, key):
+ od.move_to_end(key)
diff --git a/zarr/storage.py b/zarr/storage.py
index 4d25e6a634..7a5273c044 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -5,9 +5,18 @@
:mod:`collections` module in the Python standard library can be used as a Zarr
array store, as long as it accepts string (str) keys and bytes values.
+In addition to the :class:`MutableMapping` interface, store classes may also implement
+optional methods `listdir` (list members of a "directory") and `rmdir` (remove all
+members of a "directory"). These methods should be implemented if the store class is
+aware of the hierarchical organisation of resources within the store and can provide
+efficient implementations. If these methods are not available, Zarr will fall back to
+slower implementations that work via the :class:`MutableMapping` interface. Store
+classes may also optionally implement a `rename` method (rename all members under a given
+path) and a `getsize` method (return the size in bytes of a given value).
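+
+For example, a plain `dict` provides none of these methods, so Zarr uses the fallback
+implementations, which work by iterating over all keys in the store::
+
+    >>> from zarr.storage import listdir
+    >>> store = {'a/b': b'xxx', 'a/c': b'yyy'}
+    >>> sorted(listdir(store, 'a'))
+    ['b', 'c']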
+
"""
from __future__ import absolute_import, print_function, division
-from collections import MutableMapping
+from collections import MutableMapping, OrderedDict
import os
import tempfile
import zipfile
@@ -28,10 +37,10 @@
normalize_storage_path, buffer_size,
normalize_fill_value, nolock, normalize_dtype)
from zarr.meta import encode_array_metadata, encode_group_metadata
-from zarr.compat import PY2, binary_type
+from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
from numcodecs.registry import codec_registry
-from zarr.errors import (err_contains_group, err_contains_array, err_path_not_found,
- err_bad_compressor, err_fspath_exists_notdir, err_read_only)
+from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
+ err_fspath_exists_notdir, err_read_only)
array_meta_key = '.zarray'
@@ -80,7 +89,9 @@ def _rmdir_from_keys(store, path=None):
def rmdir(store, path=None):
- """Remove all items under the given path."""
+ """Remove all items under the given path. If `store` provides a `rmdir` method,
+ this will be called, otherwise will fall back to implementation via the
+ `MutableMapping` interface."""
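+
+    For example, using a plain `dict` as the store::
+
+        >>> store = {'foo/bar': b'xxx', 'foo/baz': b'yyy', 'quux': b'zzz'}
+        >>> rmdir(store, 'foo')
+        >>> sorted(store)
+        ['quux']
+
+    """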
path = normalize_storage_path(path)
if hasattr(store, 'rmdir'):
# pass through
@@ -101,7 +112,9 @@ def _rename_from_keys(store, src_path, dst_path):
def rename(store, src_path, dst_path):
- """Rename all items under the given path."""
+ """Rename all items under the given path. If `store` provides a `rename` method,
+ this will be called, otherwise will fall back to implementation via the
+ `MutableMapping` interface."""
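+
+    For example, using a plain `dict` as the store::
+
+        >>> store = {'a/b': b'xxx', 'a/c': b'yyy'}
+        >>> rename(store, 'a', 'c')
+        >>> sorted(store)
+        ['c/b', 'c/c']
+
+    """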
src_path = normalize_storage_path(src_path)
dst_path = normalize_storage_path(dst_path)
if hasattr(store, 'rename'):
@@ -125,7 +138,9 @@ def _listdir_from_keys(store, path=None):
def listdir(store, path=None):
- """Obtain a directory listing for the given path."""
+ """Obtain a directory listing for the given path. If `store` provides a `listdir`
+ method, this will be called, otherwise will fall back to implementation via the
+ `MutableMapping` interface."""
path = normalize_storage_path(path)
if hasattr(store, 'listdir'):
# pass through
@@ -136,25 +151,31 @@ def listdir(store, path=None):
def getsize(store, path=None):
- """Compute size of stored items for a given path."""
+ """Compute size of stored items for a given path. If `store` provides a `getsize`
+ method, this will be called, otherwise will return -1."""
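+
+    For example, using a plain `dict` as the store::
+
+        >>> store = {'foo': b'xxx', 'bar/baz': b'yyyy'}
+        >>> getsize(store, 'foo')
+        3
+        >>> getsize(store, 'bar')
+        4
+
+    """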
path = normalize_storage_path(path)
if hasattr(store, 'getsize'):
# pass through
return store.getsize(path)
elif isinstance(store, dict):
# compute from size of values
- prefix = _path_to_prefix(path)
- size = 0
- for k in listdir(store, path):
- try:
- v = store[prefix + k]
- except KeyError:
- pass
- else:
+ if path in store:
+ v = store[path]
+ size = buffer_size(v)
+ else:
+ members = listdir(store, path)
+ prefix = _path_to_prefix(path)
+ size = 0
+ for k in members:
try:
-                    size += buffer_size(v)
-                except TypeError:
-                    return -1
- return -1
+ v = store[prefix + k]
+ except KeyError:
+ pass
+ else:
+ try:
+ size += buffer_size(v)
+ except TypeError:
+ return -1
return size
else:
return -1
@@ -610,16 +631,21 @@ def getsize(self, path=None):
path = normalize_storage_path(path)
# obtain value to return size of
- value = self.root
+ value = None
if path:
try:
parent, key = self._get_parent(path)
value = parent[key]
except KeyError:
- err_path_not_found(path)
+ pass
+ else:
+ value = self.root
# obtain size of value
- if isinstance(value, self.cls):
+ if value is None:
+ return 0
+
+ elif isinstance(value, self.cls):
# total size for directory
size = 0
for v in value.values():
@@ -629,6 +655,7 @@ def getsize(self, path=None):
except TypeError:
return -1
return size
+
else:
try:
return buffer_size(value)
@@ -843,7 +870,7 @@ def getsize(self, path=None):
size += os.path.getsize(child_fs_path)
return size
else:
- err_path_not_found(path)
+ return 0
def clear(self):
shutil.rmtree(self.path)
@@ -857,6 +884,7 @@ def atexit_rmtree(path,
rmtree(path)
+# noinspection PyShadowingNames
def atexit_rmglob(path,
glob=glob.glob,
isdir=os.path.isdir,
@@ -1230,7 +1258,7 @@ def getsize(self, path=None):
info = self.zf.getinfo(path)
return info.compress_size
except KeyError:
- err_path_not_found(path)
+ return 0
else:
return 0
@@ -1676,3 +1704,185 @@ def __iter__(self):
def __len__(self):
return self.db.stat()['entries']
+
+
+class LRUStoreCache(MutableMapping):
+ """Storage class that implements a least-recently-used (LRU) cache layer over
+ some other store. Intended primarily for use with stores that can be slow to
+ access, e.g., remote stores that require network communication to store and
+ retrieve data.
+
+ Parameters
+ ----------
+ store : MutableMapping
+ The store containing the actual data to be cached.
+ max_size : int
+        The maximum size that the cache may grow to, in bytes. Provide `None` if you
+        would like the cache to have unlimited size.
+
+ Examples
+ --------
+ The example below wraps an S3 store with an LRU cache::
+
+ >>> import s3fs
+ >>> import zarr
+ >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2'))
+ >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False)
+ >>> cache = zarr.LRUStoreCache(store, max_size=2**28)
+ >>> root = zarr.group(store=cache)
+ >>> z = root['foo/bar/baz']
+ >>> from timeit import timeit
+ >>> # first data access is relatively slow, retrieved from store
+ ... timeit('print(z[:].tostring())', number=1, globals=globals()) # doctest: +SKIP
+ b'Hello from the cloud!'
+ 0.1081731989979744
+ >>> # second data access is faster, uses cache
+ ... timeit('print(z[:].tostring())', number=1, globals=globals()) # doctest: +SKIP
+ b'Hello from the cloud!'
+ 0.0009490990014455747
+
+ """
+
+ def __init__(self, store, max_size):
+ self._store = store
+ self._max_size = max_size
+ self._current_size = 0
+ self._keys_cache = None
+ self._contains_cache = None
+ self._listdir_cache = dict()
+ self._values_cache = OrderedDict()
+ self._mutex = Lock()
+ self.hits = self.misses = 0
+
+ def __getstate__(self):
+ return (self._store, self._max_size, self._current_size, self._keys_cache,
+ self._contains_cache, self._listdir_cache, self._values_cache, self.hits,
+ self.misses)
+
+ def __setstate__(self, state):
+ (self._store, self._max_size, self._current_size, self._keys_cache,
+ self._contains_cache, self._listdir_cache, self._values_cache, self.hits,
+ self.misses) = state
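+        # the lock is not picklable, so it is excluded from the state and recreated
+        # here after unpickling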
+ self._mutex = Lock()
+
+ def __len__(self):
+ return len(self._keys())
+
+ def __iter__(self):
+ return self.keys()
+
+ def __contains__(self, key):
+ with self._mutex:
+ if self._contains_cache is None:
+ self._contains_cache = set(self._keys())
+ return key in self._contains_cache
+
+ def clear(self):
+ self._store.clear()
+ self.invalidate()
+
+ def keys(self):
+ with self._mutex:
+ return iter(self._keys())
+
+ def _keys(self):
+ if self._keys_cache is None:
+ self._keys_cache = list(self._store.keys())
+ return self._keys_cache
+
+ def listdir(self, path=None):
+ with self._mutex:
+ try:
+ return self._listdir_cache[path]
+ except KeyError:
+ listing = listdir(self._store, path)
+ self._listdir_cache[path] = listing
+ return listing
+
+ def getsize(self, path=None):
+ return getsize(self._store, path=path)
+
+ def _pop_value(self):
+ # remove the first value from the cache, as this will be the least recently
+ # used value
+ _, v = self._values_cache.popitem(last=False)
+ return v
+
+ def _accommodate_value(self, value_size):
+ if self._max_size is None:
+ return
+ # ensure there is enough space in the cache for a new value
+ while self._current_size + value_size > self._max_size:
+ v = self._pop_value()
+ self._current_size -= buffer_size(v)
+
+ def _cache_value(self, key, value):
+ # cache a value
+ value_size = buffer_size(value)
+        # do not cache values that are themselves larger than the cache max size, as
+        # they would be evicted again immediately
+ if self._max_size is None or value_size <= self._max_size:
+ self._accommodate_value(value_size)
+ self._values_cache[key] = value
+ self._current_size += value_size
+
+ def invalidate(self):
+ """Completely clear the cache."""
+ with self._mutex:
+ self._values_cache.clear()
+ self._invalidate_keys()
+
+ def invalidate_values(self):
+ """Clear the values cache."""
+ with self._mutex:
+ self._values_cache.clear()
+
+ def invalidate_keys(self):
+ """Clear the keys cache."""
+ with self._mutex:
+ self._invalidate_keys()
+
+ def _invalidate_keys(self):
+ self._keys_cache = None
+ self._contains_cache = None
+ self._listdir_cache.clear()
+
+ def _invalidate_value(self, key):
+ if key in self._values_cache:
+ value = self._values_cache.pop(key)
+ self._current_size -= buffer_size(value)
+
+ def __getitem__(self, key):
+ try:
+ # first try to obtain the value from the cache
+ with self._mutex:
+ value = self._values_cache[key]
+ # cache hit if no KeyError is raised
+ self.hits += 1
+ # treat the end as most recently used
+ OrderedDict_move_to_end(self._values_cache, key)
+
+ except KeyError:
+ # cache miss, retrieve value from the store
+ value = self._store[key]
+ with self._mutex:
+ self.misses += 1
+ # need to check if key is not in the cache, as it may have been cached
+ # while we were retrieving the value from the store
+ if key not in self._values_cache:
+ self._cache_value(key, value)
+
+ return value
+
+ def __setitem__(self, key, value):
+ self._store[key] = value
+ with self._mutex:
+ self._invalidate_keys()
+ self._invalidate_value(key)
+ self._cache_value(key, value)
+
+ def __delitem__(self, key):
+ del self._store[key]
+ with self._mutex:
+ self._invalidate_keys()
+ self._invalidate_value(key)
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index c50328422e..3867befdec 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -18,7 +18,8 @@
from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore,
- DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob)
+ DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob,
+ LRUStoreCache)
from zarr.core import Array
from zarr.errors import PermissionError
from zarr.compat import PY2, text_type, binary_type
@@ -1615,3 +1616,13 @@ def test_cache_metadata(self):
def test_object_arrays_danger(self):
# skip this one as it only works if metadata are cached
pass
+
+
+class TestArrayWithStoreCache(TestArray):
+
+ @staticmethod
+ def create_array(read_only=False, **kwargs):
+ store = LRUStoreCache(dict(), max_size=None)
+ kwargs.setdefault('compressor', Zlib(level=1))
+ init_array(store, **kwargs)
+ return Array(store, read_only=read_only)
diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index 2273568a0e..e19472d214 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -20,7 +20,8 @@
from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array,
array_meta_key, group_meta_key, atexit_rmtree,
- NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob)
+ NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob,
+ LRUStoreCache)
from zarr.core import Array
from zarr.compat import PY2, text_type
from zarr.hierarchy import Group, group, open_group
@@ -945,6 +946,14 @@ def test_chunk_store(self):
eq(expect, actual)
+class TestGroupWithStoreCache(TestGroup):
+
+ @staticmethod
+ def create_store():
+ store = LRUStoreCache(dict(), max_size=None)
+ return store, None
+
+
def test_group():
# test the group() convenience function
diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index d5c7c1e067..ec8443e778 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -14,19 +14,21 @@
from numpy.testing import assert_array_equal, assert_array_almost_equal
from nose import SkipTest
from nose.tools import assert_raises, eq_ as eq, assert_is_none
+import pytest
from zarr.storage import (init_array, array_meta_key, attrs_key, DictStore,
DirectoryStore, ZipStore, init_group, group_meta_key,
getsize, migrate_1to2, TempStore, atexit_rmtree,
NestedDirectoryStore, default_compressor, DBMStore,
- LMDBStore, atexit_rmglob)
+ LMDBStore, atexit_rmglob, LRUStoreCache)
from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT,
decode_group_metadata, encode_group_metadata)
from zarr.compat import PY2
from zarr.codecs import Zlib, Blosc, BZ2
from zarr.errors import PermissionError
from zarr.hierarchy import group
+from zarr.tests.util import CountingDict
class StoreTests(object):
@@ -139,23 +141,23 @@ def test_pickle(self):
def test_getsize(self):
store = self.create_store()
- if hasattr(store, 'getsize'):
- eq(0, store.getsize())
+ if isinstance(store, dict) or hasattr(store, 'getsize'):
+ eq(0, getsize(store))
store['foo'] = b'x'
- eq(1, store.getsize())
- eq(1, store.getsize('foo'))
+ eq(1, getsize(store))
+ eq(1, getsize(store, 'foo'))
store['bar'] = b'yy'
- eq(3, store.getsize())
- eq(2, store.getsize('bar'))
+ eq(3, getsize(store))
+ eq(2, getsize(store, 'bar'))
store['baz'] = bytearray(b'zzz')
- eq(6, store.getsize())
- eq(3, store.getsize('baz'))
+ eq(6, getsize(store))
+ eq(3, getsize(store, 'baz'))
store['quux'] = array.array('B', b'zzzz')
- eq(10, store.getsize())
- eq(4, store.getsize('quux'))
+ eq(10, getsize(store))
+ eq(4, getsize(store, 'quux'))
store['spong'] = np.frombuffer(b'zzzzz', dtype='u1')
- eq(15, store.getsize())
- eq(5, store.getsize('spong'))
+ eq(15, getsize(store))
+ eq(5, getsize(store, 'spong'))
# noinspection PyStatementEffect
def test_hierarchy(self):
@@ -197,18 +199,13 @@ def test_hierarchy(self):
eq(6, store.getsize('c/e'))
eq(3, store.getsize('c/e/f'))
eq(3, store.getsize('c/e/g'))
- with assert_raises(ValueError):
- store.getsize('x')
- with assert_raises(ValueError):
- store.getsize('a/x')
- with assert_raises(ValueError):
- store.getsize('c/x')
- with assert_raises(ValueError):
- store.getsize('c/x/y')
- with assert_raises(ValueError):
- store.getsize('c/d/y')
- with assert_raises(ValueError):
- store.getsize('c/d/y/z')
+ # non-existent paths
+ eq(0, store.getsize('x'))
+ eq(0, store.getsize('a/x'))
+ eq(0, store.getsize('c/x'))
+ eq(0, store.getsize('c/x/y'))
+ eq(0, store.getsize('c/d/y'))
+ eq(0, store.getsize('c/d/y/z'))
# test listdir (optional)
if hasattr(store, 'listdir'):
@@ -846,6 +843,230 @@ def test_context_manager(self):
eq(2, len(store))
+class TestLRUStoreCache(StoreTests, unittest.TestCase):
+
+ def create_store(self):
+ return LRUStoreCache(dict(), max_size=2**27)
+
+ def test_cache_values_no_max_size(self):
+
+ # setup store
+ store = CountingDict()
+ store['foo'] = b'xxx'
+ store['bar'] = b'yyy'
+ assert 0 == store.counter['__getitem__', 'foo']
+ assert 1 == store.counter['__setitem__', 'foo']
+ assert 0 == store.counter['__getitem__', 'bar']
+ assert 1 == store.counter['__setitem__', 'bar']
+
+ # setup cache
+ cache = LRUStoreCache(store, max_size=None)
+ assert 0 == cache.hits
+ assert 0 == cache.misses
+
+ # test first __getitem__, cache miss
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 1 == store.counter['__setitem__', 'foo']
+ assert 0 == cache.hits
+ assert 1 == cache.misses
+
+ # test second __getitem__, cache hit
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 1 == store.counter['__setitem__', 'foo']
+ assert 1 == cache.hits
+ assert 1 == cache.misses
+
+ # test __setitem__, __getitem__
+ cache['foo'] = b'zzz'
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 2 == store.counter['__setitem__', 'foo']
+ # should be a cache hit
+ assert b'zzz' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 2 == store.counter['__setitem__', 'foo']
+ assert 2 == cache.hits
+ assert 1 == cache.misses
+
+ # manually invalidate all cached values
+ cache.invalidate_values()
+ assert b'zzz' == cache['foo']
+ assert 2 == store.counter['__getitem__', 'foo']
+ assert 2 == store.counter['__setitem__', 'foo']
+ cache.invalidate()
+ assert b'zzz' == cache['foo']
+ assert 3 == store.counter['__getitem__', 'foo']
+ assert 2 == store.counter['__setitem__', 'foo']
+
+ # test __delitem__
+ del cache['foo']
+ with pytest.raises(KeyError):
+ # noinspection PyStatementEffect
+ cache['foo']
+ with pytest.raises(KeyError):
+ # noinspection PyStatementEffect
+ store['foo']
+
+ # verify other keys untouched
+ assert 0 == store.counter['__getitem__', 'bar']
+ assert 1 == store.counter['__setitem__', 'bar']
+
+ def test_cache_values_with_max_size(self):
+
+ # setup store
+ store = CountingDict()
+ store['foo'] = b'xxx'
+ store['bar'] = b'yyy'
+ assert 0 == store.counter['__getitem__', 'foo']
+ assert 0 == store.counter['__getitem__', 'bar']
+ # setup cache - can only hold one item
+ cache = LRUStoreCache(store, max_size=5)
+ assert 0 == cache.hits
+ assert 0 == cache.misses
+
+ # test first 'foo' __getitem__, cache miss
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 0 == cache.hits
+ assert 1 == cache.misses
+
+ # test second 'foo' __getitem__, cache hit
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 1 == cache.hits
+ assert 1 == cache.misses
+
+ # test first 'bar' __getitem__, cache miss
+ assert b'yyy' == cache['bar']
+ assert 1 == store.counter['__getitem__', 'bar']
+ assert 1 == cache.hits
+ assert 2 == cache.misses
+
+ # test second 'bar' __getitem__, cache hit
+ assert b'yyy' == cache['bar']
+ assert 1 == store.counter['__getitem__', 'bar']
+ assert 2 == cache.hits
+ assert 2 == cache.misses
+
+ # test 'foo' __getitem__, should have been evicted, cache miss
+ assert b'xxx' == cache['foo']
+ assert 2 == store.counter['__getitem__', 'foo']
+ assert 2 == cache.hits
+ assert 3 == cache.misses
+
+ # test 'bar' __getitem__, should have been evicted, cache miss
+ assert b'yyy' == cache['bar']
+ assert 2 == store.counter['__getitem__', 'bar']
+ assert 2 == cache.hits
+ assert 4 == cache.misses
+
+ # setup store
+ store = CountingDict()
+ store['foo'] = b'xxx'
+ store['bar'] = b'yyy'
+ assert 0 == store.counter['__getitem__', 'foo']
+ assert 0 == store.counter['__getitem__', 'bar']
+ # setup cache - can hold two items
+ cache = LRUStoreCache(store, max_size=6)
+ assert 0 == cache.hits
+ assert 0 == cache.misses
+
+ # test first 'foo' __getitem__, cache miss
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 0 == cache.hits
+ assert 1 == cache.misses
+
+ # test second 'foo' __getitem__, cache hit
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 1 == cache.hits
+ assert 1 == cache.misses
+
+ # test first 'bar' __getitem__, cache miss
+ assert b'yyy' == cache['bar']
+ assert 1 == store.counter['__getitem__', 'bar']
+ assert 1 == cache.hits
+ assert 2 == cache.misses
+
+ # test second 'bar' __getitem__, cache hit
+ assert b'yyy' == cache['bar']
+ assert 1 == store.counter['__getitem__', 'bar']
+ assert 2 == cache.hits
+ assert 2 == cache.misses
+
+ # test 'foo' __getitem__, should still be cached
+ assert b'xxx' == cache['foo']
+ assert 1 == store.counter['__getitem__', 'foo']
+ assert 3 == cache.hits
+ assert 2 == cache.misses
+
+ # test 'bar' __getitem__, should still be cached
+ assert b'yyy' == cache['bar']
+ assert 1 == store.counter['__getitem__', 'bar']
+ assert 4 == cache.hits
+ assert 2 == cache.misses
+
+ def test_cache_keys(self):
+
+ # setup
+ store = CountingDict()
+ store['foo'] = b'xxx'
+ store['bar'] = b'yyy'
+ assert 0 == store.counter['__contains__', 'foo']
+ assert 0 == store.counter['__iter__']
+ assert 0 == store.counter['keys']
+ cache = LRUStoreCache(store, max_size=None)
+
+ # keys should be cached on first call
+ keys = sorted(cache.keys())
+ assert keys == ['bar', 'foo']
+ assert 1 == store.counter['keys']
+ # keys should now be cached
+ assert keys == sorted(cache.keys())
+ assert 1 == store.counter['keys']
+ assert 'foo' in cache
+ assert 0 == store.counter['__contains__', 'foo']
+ assert keys == sorted(cache)
+ assert 0 == store.counter['__iter__']
+ assert 1 == store.counter['keys']
+
+ # cache should be cleared if store is modified - crude but simple for now
+ cache['baz'] = b'zzz'
+ keys = sorted(cache.keys())
+ assert keys == ['bar', 'baz', 'foo']
+ assert 2 == store.counter['keys']
+ # keys should now be cached
+ assert keys == sorted(cache.keys())
+ assert 2 == store.counter['keys']
+
+ # manually invalidate keys
+ cache.invalidate_keys()
+ keys = sorted(cache.keys())
+ assert keys == ['bar', 'baz', 'foo']
+ assert 3 == store.counter['keys']
+ assert 0 == store.counter['__contains__', 'foo']
+ assert 0 == store.counter['__iter__']
+ cache.invalidate_keys()
+ keys = sorted(cache)
+ assert keys == ['bar', 'baz', 'foo']
+ assert 4 == store.counter['keys']
+ assert 0 == store.counter['__contains__', 'foo']
+ assert 0 == store.counter['__iter__']
+ cache.invalidate_keys()
+ assert 'foo' in cache
+ assert 5 == store.counter['keys']
+ assert 0 == store.counter['__contains__', 'foo']
+ assert 0 == store.counter['__iter__']
+
+ # check these would get counted if called directly
+ assert 'foo' in store
+ assert 1 == store.counter['__contains__', 'foo']
+ assert keys == sorted(store)
+ assert 1 == store.counter['__iter__']
+
+
def test_getsize():
store = dict()
store['foo'] = b'aaa'
diff --git a/zarr/tests/util.py b/zarr/tests/util.py
new file mode 100644
index 0000000000..def9c61132
--- /dev/null
+++ b/zarr/tests/util.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, print_function, division
+import collections
+
+
+class CountingDict(collections.MutableMapping):
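+    """Wrapper around a `dict` that counts how many times each method is called,
+    for use in tests that verify cache behaviour."""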
+
+ def __init__(self):
+ self.wrapped = dict()
+ self.counter = collections.Counter()
+
+ def __len__(self):
+ self.counter['__len__'] += 1
+ return len(self.wrapped)
+
+ def keys(self):
+ self.counter['keys'] += 1
+ return self.wrapped.keys()
+
+ def __iter__(self):
+ self.counter['__iter__'] += 1
+ return iter(self.wrapped)
+
+ def __contains__(self, item):
+ self.counter['__contains__', item] += 1
+ return item in self.wrapped
+
+ def __getitem__(self, item):
+ self.counter['__getitem__', item] += 1
+ return self.wrapped[item]
+
+ def __setitem__(self, key, value):
+ self.counter['__setitem__', key] += 1
+ self.wrapped[key] = value
+
+ def __delitem__(self, key):
+ self.counter['__delitem__', key] += 1
+ del self.wrapped[key]
diff --git a/zarr/util.py b/zarr/util.py
index 3b43e3a0d6..aedff4ab94 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -318,7 +318,10 @@ def buffer_size(v):
return v.buffer_info()[1] * v.itemsize
else: # pragma: py2 no cover
v = memoryview(v)
- return reduce(operator.mul, v.shape) * v.itemsize
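+        # a 0-dimensional value (e.g., a scalar) has an empty shape tuple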
+ if v.shape:
+ return reduce(operator.mul, v.shape) * v.itemsize
+ else:
+ return v.itemsize
def info_text_report(items):