From 8153810c4918d3ed65d516c76b268397b7a4a761 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 16 Dec 2018 01:29:53 -0500 Subject: [PATCH 01/54] Consolidate encode/store in _chunk_setitem_nosync Matches how these lines are written in `_set_basic_selection_zd`. --- zarr/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 80d1830c07..1c2013aec1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1716,10 +1716,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non else: chunk[chunk_selection] = value - # encode chunk + # encode and store cdata = self._encode_chunk(chunk) - - # store self.chunk_store[ckey] = cdata def _chunk_key(self, chunk_coords): From c84839e53374b39b3b6f18ae2ab3f71573e7e553 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 16 Dec 2018 02:46:36 -0500 Subject: [PATCH 02/54] Clear key-value pair if chunk is just fill value Add a simple check to see if the key-value pair is just being set with a chunk equal to the fill value. If so, simply delete the key-value pair instead of storing a chunk that only contains the fill value. The Array will behave the same externally. However this will cutdown on the space require to store the Array. Also will make sure that copying one Array to another Array won't dramatically effect the storage size. --- zarr/core.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/zarr/core.py b/zarr/core.py index 1c2013aec1..33734aa304 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1476,6 +1476,17 @@ def _set_basic_selection_zd(self, selection, value, fields=None): else: chunk[selection] = value + # clear chunk if it only contains the fill value + if np.all(np.equal(chunk, self._fill_value)): + try: + del self.chunk_store[ckey] + return + except KeyError: + return + except Exception: + # deleting failed, fallback to overwriting + pass + # encode and store cdata = self._encode_chunk(chunk) self.chunk_store[ckey] = cdata @@ -1716,6 +1727,17 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non else: chunk[chunk_selection] = value + # clear chunk if it only contains the fill value + if np.all(np.equal(chunk, self._fill_value)): + try: + del self.chunk_store[ckey] + return + except KeyError: + return + except Exception: + # deleting failed, fallback to overwriting + pass + # encode and store cdata = self._encode_chunk(chunk) self.chunk_store[ckey] = cdata From 6ac2349b8abf21ddd5dcb1f05c9dd54ea6a9d688 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 10 May 2021 15:31:15 -0400 Subject: [PATCH 03/54] set empty chunk write behavior via array constructor --- zarr/core.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index dc2c6b5596..af8fe82ef1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -74,7 +74,13 @@ class Array: If True and while the chunk_store is a FSStore and the compresion used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. - + write_empty_chunks: bool, optional + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for + the chunk key of an empty chunk will be deleted. Note that setting this option to False + will incur additional overhead per chunk write. + .. versionadded:: 2.7 Attributes @@ -106,7 +112,7 @@ class Array: info vindex oindex - + Methods ------- __getitem__ @@ -138,6 +144,7 @@ def __init__( cache_metadata=True, cache_attrs=True, partial_decompress=False, + write_empty_chunks=True, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized @@ -154,6 +161,7 @@ def __init__( self._cache_metadata = cache_metadata self._is_view = False self._partial_decompress = partial_decompress + self._write_empty_chunks = write_empty_chunks # initialize metadata self._load_metadata() @@ -1960,7 +1968,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): chunk[chunk_selection] = value # clear chunk if it only contains the fill value - if np.all(np.equal(chunk, self._fill_value)): + if self._write_empty_chunks and np.all(np.equal(chunk, self._fill_value)): try: del self.chunk_store[ckey] return From eb367138bdba6a842a001a2af6b2644c17ebb247 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 10 May 2021 16:24:35 -0400 Subject: [PATCH 04/54] add rudimentary tests, np.equal -> np.array_equal --- zarr/core.py | 2 +- zarr/tests/test_core.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index af8fe82ef1..32fc3c9ce8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1968,7 +1968,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): chunk[chunk_selection] = value # clear chunk if it only contains the fill value - if self._write_empty_chunks and np.all(np.equal(chunk, self._fill_value)): + if not self._write_empty_chunks and np.all(np.array_equal(chunk, self._fill_value)): try: del self.chunk_store[ckey] return diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 6914cff87e..c61ffd7746 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1596,6 +1596,20 @@ def test_nbytes_stored(self): assert -1 == z.nbytes_stored +class TestArrayWithoutEmptyWrites(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = DirectoryStore(path) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=False) + class TestArrayWithDirectoryStore(TestArray): @staticmethod From 053ad4c9dc0470e5fd4875ada0ab1f398d3d360e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 10 May 2021 16:48:25 -0400 Subject: [PATCH 05/54] add test for chunk deletion --- zarr/tests/test_core.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c61ffd7746..4acb5eeb03 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1610,6 +1610,22 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=False) + def test_nchunks_initialized(self): + for fill_value in -1, 0, 1, 10: + z = self.create_array(shape=100, chunks=10, fill_value=fill_value) + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = fill_value + assert 0 == z.nchunks_initialized + z[0] = 42 + assert 1 == z.nchunks_initialized + if hasattr(z.store, 'close'): + z.store.close() + class TestArrayWithDirectoryStore(TestArray): @staticmethod From 3375bf018a937a70f2aadc0bf623ebaee443cfdd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 11 May 2021 15:50:36 -0400 Subject: [PATCH 06/54] add flattening function --- zarr/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/zarr/util.py b/zarr/util.py index e3e9fca2f9..d60c83056c 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -9,12 +9,19 @@ import numpy as np from asciitree import BoxStyle, LeftAligned from asciitree.traversal import Traversal +from collections.abc import Iterable from numcodecs.compat import ensure_ndarray, ensure_text from numcodecs.registry import codec_registry from numcodecs.blosc import cbuffer_sizes, cbuffer_metainfo from typing import Any, Callable, Dict, Optional, Tuple, Union +def flatten(arg: Iterable) -> Iterable: + for element in arg: + if isinstance(element, Iterable) and not isinstance(element, (str, bytes)): + yield from flatten(element) + else: + yield element # codecs to use for object dtype convenience API object_codecs = { From 30c3a30810e11a6ca294237c013c70c791b0abe3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 11 May 2021 15:51:12 -0400 Subject: [PATCH 07/54] add kwarg for empty writes to array creators --- zarr/creation.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index 73e10adff1..71cebf7d9a 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -20,7 +20,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, dimension_separator=None, **kwargs): + object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs): """Create an array. Parameters @@ -70,6 +70,12 @@ def create(shape, chunks=True, dtype=None, compressor='default', dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. .. versionadded:: 2.8 + write_empty_chunks : bool, optional + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for + the chunk key of an empty chunk will be deleted. Note that setting this option to False + will incur additional overhead per chunk write. Returns ------- @@ -140,7 +146,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', # instantiate array z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only, write_empty_chunks=write_empty_chunks) return z @@ -396,6 +402,7 @@ def open_array( chunk_store=None, storage_options=None, partial_decompress=False, + write_empty_chunks=True, **kwargs ): """Open an array using file-mode-like semantics. @@ -450,8 +457,12 @@ def open_array( If True and while the chunk_store is a FSStore and the compresion used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. - - .. versionadded:: 2.7 + write_empty_chunks : bool, optional + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for + the chunk key of an empty chunk will be deleted. Note that setting this option to False + will incur additional overhead per chunk write. Returns ------- @@ -541,7 +552,7 @@ def open_array( # instantiate array z = Array(store, read_only=read_only, synchronizer=synchronizer, cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path, - chunk_store=chunk_store) + chunk_store=chunk_store, write_empty_chunks=write_empty_chunks) return z From d2fc396f10d968819d4a013de2d0cdb4a5532ad1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 11 May 2021 15:51:41 -0400 Subject: [PATCH 08/54] fix check for chunk equality to fill value --- zarr/core.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 32fc3c9ce8..0ac7637bd1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -33,6 +33,7 @@ from zarr.util import ( InfoReporter, check_array_shape, + flatten, human_readable_size, is_total_slice, nolock, @@ -74,12 +75,12 @@ class Array: If True and while the chunk_store is a FSStore and the compresion used is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. - write_empty_chunks: bool, optional + write_empty_chunks : bool, optional Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). If True (default), all chunks will be written regardless of their contents. If False, empty chunks will not be written, and the `store` entry for the chunk key of an empty chunk will be deleted. Note that setting this option to False - will incur additional overhead per chunk write. + will incur additional overhead per chunk write. .. versionadded:: 2.7 @@ -1911,7 +1912,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) # store - self.chunk_store[ckey] = cdata + if cdata is not None: + self.chunk_store[ckey] = cdata def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): if is_total_slice(chunk_selection, self._chunks) and not fields: @@ -1968,15 +1970,23 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): chunk[chunk_selection] = value # clear chunk if it only contains the fill value - if not self._write_empty_chunks and np.all(np.array_equal(chunk, self._fill_value)): - try: - del self.chunk_store[ckey] - return - except KeyError: - return - except Exception: - # deleting failed, fallback to overwriting - pass + if not self._write_empty_chunks: + if self.dtype == 'object': + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) + else: + is_empty = np.all(chunk == self._fill_value) + + if is_empty: + try: + del self.chunk_store[ckey] + return + except KeyError: + return + except Exception: + # deleting failed, fallback to overwriting + pass # encode chunk return self._encode_chunk(chunk) From e4e40125c64884d5a13927e3afd9a6bd39862730 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 11 May 2021 16:14:56 -0400 Subject: [PATCH 09/54] flake8 --- zarr/core.py | 15 +++++++-------- zarr/creation.py | 15 ++++++++------- zarr/tests/test_core.py | 3 ++- zarr/util.py | 2 ++ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 0ac7637bd1..a7f38ca1b4 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -76,12 +76,12 @@ class Array: is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for the chunk key of an empty chunk will be deleted. Note that setting this option to False will incur additional overhead per chunk write. - + .. versionadded:: 2.7 Attributes @@ -113,7 +113,7 @@ class Array: info vindex oindex - + Methods ------- __getitem__ @@ -1972,12 +1972,12 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): # clear chunk if it only contains the fill value if not self._write_empty_chunks: if self.dtype == 'object': - # we have to flatten the result of np.equal to handle outputs like + # we have to flatten the result of np.equal to handle outputs like # [np.array([True,True]), True, True] is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) else: is_empty = np.all(chunk == self._fill_value) - + if is_empty: try: del self.chunk_store[ckey] @@ -1991,7 +1991,6 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): # encode chunk return self._encode_chunk(chunk) - def _chunk_key(self, chunk_coords): return self._key_prefix + '.'.join(map(str, chunk_coords)) diff --git a/zarr/creation.py b/zarr/creation.py index 71cebf7d9a..824a2de3e9 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -71,9 +71,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', Separator placed between the dimensions of a chunk. .. versionadded:: 2.8 write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for the chunk key of an empty chunk will be deleted. Note that setting this option to False will incur additional overhead per chunk write. @@ -146,7 +146,8 @@ def create(shape, chunks=True, dtype=None, compressor='default', # instantiate array z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only, write_empty_chunks=write_empty_chunks) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only, + write_empty_chunks=write_empty_chunks) return z @@ -458,9 +459,9 @@ def open_array( is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for + Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). + If True (default), all chunks will be written regardless of their contents. + If False, empty chunks will not be written, and the `store` entry for the chunk key of an empty chunk will be deleted. Note that setting this option to False will incur additional overhead per chunk write. diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 4acb5eeb03..d3ff193fb5 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1608,7 +1608,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) + cache_attrs=cache_attrs, write_empty_chunks=False) def test_nchunks_initialized(self): for fill_value in -1, 0, 1, 10: @@ -1626,6 +1626,7 @@ def test_nchunks_initialized(self): if hasattr(z.store, 'close'): z.store.close() + class TestArrayWithDirectoryStore(TestArray): @staticmethod diff --git a/zarr/util.py b/zarr/util.py index d60c83056c..c91ed5df44 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -16,6 +16,7 @@ from typing import Any, Callable, Dict, Optional, Tuple, Union + def flatten(arg: Iterable) -> Iterable: for element in arg: if isinstance(element, Iterable) and not isinstance(element, (str, bytes)): @@ -23,6 +24,7 @@ def flatten(arg: Iterable) -> Iterable: else: yield element + # codecs to use for object dtype convenience API object_codecs = { str.__name__: 'vlen-utf8', From bd27b9af1ce4ea873d5f7748f49b6e1ecdcadcaa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 May 2021 14:58:20 -0400 Subject: [PATCH 10/54] add None check to setitems --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index a7f38ca1b4..ac7db2d206 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1879,7 +1879,7 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): ckeys = [self._chunk_key(co) for co in lchunk_coords] cdatas = [self._process_for_setitem(key, sel, val, fields=fields) for key, sel, val in zip(ckeys, lchunk_selection, values)] - values = {k: v for k, v in zip(ckeys, cdatas)} + values = {k: v for k, v in zip(ckeys, cdatas) if v is not None} self.chunk_store.setitems(values) def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): From 814d0090915d87008be8a972cf9f244d2f41e410 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 May 2021 16:28:00 -0400 Subject: [PATCH 11/54] add write_empty_chunks to output of __getstate__ --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index ac7db2d206..f328fdcbae 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2209,7 +2209,7 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, - self._synchronizer, self._cache_metadata, self._attrs.cache) + self._synchronizer, self._cache_metadata, self._attrs.cache, self._write_empty_chunks) def __setstate__(self, state): self.__init__(*state) From 769f5a635a5e5dda4f4dae0be8afeb175ddf3aa6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 May 2021 16:47:25 -0400 Subject: [PATCH 12/54] flake8 --- zarr/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index f328fdcbae..22f23a3736 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2209,7 +2209,8 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, - self._synchronizer, self._cache_metadata, self._attrs.cache, self._write_empty_chunks) + self._synchronizer, self._cache_metadata, self._attrs.cache, + self._write_empty_chunks) def __setstate__(self, state): self.__init__(*state) From cd56b35a8bdc1bea5876152364cdc305f9e4f7ad Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 12 May 2021 16:52:05 -0400 Subject: [PATCH 13/54] add partial decompress to __get_state__ --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index 22f23a3736..6d6325fbc8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2210,7 +2210,7 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, self._cache_metadata, self._attrs.cache, - self._write_empty_chunks) + self._partial_decompress, self._write_empty_chunks) def __setstate__(self, state): self.__init__(*state) From 044a9b8c64a37d3ba9bfe247ad23114bd3411611 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 May 2021 13:53:48 -0400 Subject: [PATCH 14/54] functionalize emptiness checks and key deletion --- zarr/core.py | 68 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 6d6325fbc8..97562e1f88 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1596,7 +1596,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): chunk[selection] = value # clear chunk if it only contains the fill value - if np.all(np.equal(chunk, self._fill_value)): + if self._chunk_isempty(chunk): try: del self.chunk_store[ckey] return @@ -1878,10 +1878,40 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): ckeys = [self._chunk_key(co) for co in lchunk_coords] cdatas = [self._process_for_setitem(key, sel, val, fields=fields) - for key, sel, val in zip(ckeys, lchunk_selection, values)] - values = {k: v for k, v in zip(ckeys, cdatas) if v is not None} + for key, sel, val in zip(ckeys, lchunk_selection, values)] + values = {} + if not self._write_empty_chunks: + for ckey, cdata in zip(ckeys, cdatas): + if self._chunk_isempty(cdata) and not self._chunk_delitem(ckey): + values[ckey] = self._encode_chunk(cdata) + else: + values = dict(zip(ckeys, map(self._encode_chunk, cdatas))) self.chunk_store.setitems(values) + def _chunk_isempty(self, chunk): + if self.dtype == 'object': + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) + else: + is_empty = np.all(chunk == self._fill_value) + return is_empty + + def _chunk_delitem(self, ckey): + """ + Attempt to delete the value associated with ckey. + Returns True if deletion succeeds or KeyError is raised. + Returns False if any other exception is raised. + """ + try: + del self.chunk_store[ckey] + return True + except KeyError: + return True + except Exception: + return False + + def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. @@ -1909,11 +1939,17 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): + do_store = True ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) + + # clear chunk if it only contains the fill value + if (not self._write_empty_chunks) and self._chunk_isempty(cdata): + do_store = not self._chunk_delitem(ckey) + # store - if cdata is not None: - self.chunk_store[ckey] = cdata + if do_store: + self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): if is_total_slice(chunk_selection, self._chunks) and not fields: @@ -1969,27 +2005,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): else: chunk[chunk_selection] = value - # clear chunk if it only contains the fill value - if not self._write_empty_chunks: - if self.dtype == 'object': - # we have to flatten the result of np.equal to handle outputs like - # [np.array([True,True]), True, True] - is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) - else: - is_empty = np.all(chunk == self._fill_value) - - if is_empty: - try: - del self.chunk_store[ckey] - return - except KeyError: - return - except Exception: - # deleting failed, fallback to overwriting - pass - - # encode chunk - return self._encode_chunk(chunk) + return chunk def _chunk_key(self, chunk_coords): return self._key_prefix + '.'.join(map(str, chunk_coords)) From 74e08526ee086b6c74a3e3698bb6b3e35bbde252 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 25 May 2021 14:10:01 -0400 Subject: [PATCH 15/54] flake8 --- zarr/core.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 97562e1f88..dde1fdff02 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1878,7 +1878,7 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): ckeys = [self._chunk_key(co) for co in lchunk_coords] cdatas = [self._process_for_setitem(key, sel, val, fields=fields) - for key, sel, val in zip(ckeys, lchunk_selection, values)] + for key, sel, val in zip(ckeys, lchunk_selection, values)] values = {} if not self._write_empty_chunks: for ckey, cdata in zip(ckeys, cdatas): @@ -1899,7 +1899,7 @@ def _chunk_isempty(self, chunk): def _chunk_delitem(self, ckey): """ - Attempt to delete the value associated with ckey. + Attempt to delete the value associated with ckey. Returns True if deletion succeeds or KeyError is raised. Returns False if any other exception is raised. """ @@ -1911,7 +1911,6 @@ def _chunk_delitem(self, ckey): except Exception: return False - def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. @@ -1942,11 +1941,11 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non do_store = True ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) - + # clear chunk if it only contains the fill value if (not self._write_empty_chunks) and self._chunk_isempty(cdata): do_store = not self._chunk_delitem(ckey) - + # store if do_store: self.chunk_store[ckey] = self._encode_chunk(cdata) From 62a55ab2c24ea392817735d4079ae7660198cc39 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 May 2021 17:48:12 -0400 Subject: [PATCH 16/54] add path for delitems, and add some failing tests --- zarr/core.py | 25 +++++++++-------- zarr/tests/test_core.py | 62 +++++++++++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index dde1fdff02..b16de60077 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1881,9 +1881,17 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): for key, sel, val in zip(ckeys, lchunk_selection, values)] values = {} if not self._write_empty_chunks: + to_delete = [] for ckey, cdata in zip(ckeys, cdatas): - if self._chunk_isempty(cdata) and not self._chunk_delitem(ckey): + if self._chunk_isempty(cdata): + to_delete.append(ckey) + else: values[ckey] = self._encode_chunk(cdata) + if len(to_delete) > 0: + if hasattr(self.chunk_store.map, 'delitems'): + self.chunk_store.map.delitems(to_delete) + else: + [self._chunk_delitem(k) for k in to_delete] else: values = dict(zip(ckeys, map(self._encode_chunk, cdatas))) self.chunk_store.setitems(values) @@ -1900,16 +1908,12 @@ def _chunk_isempty(self, chunk): def _chunk_delitem(self, ckey): """ Attempt to delete the value associated with ckey. - Returns True if deletion succeeds or KeyError is raised. - Returns False if any other exception is raised. + Silently handle keyerror. """ try: del self.chunk_store[ckey] - return True except KeyError: - return True - except Exception: - return False + pass def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. @@ -1938,16 +1942,13 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): - do_store = True ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) # clear chunk if it only contains the fill value if (not self._write_empty_chunks) and self._chunk_isempty(cdata): - do_store = not self._chunk_delitem(ckey) - - # store - if do_store: + self._chunk_delitem(ckey) + else: self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9118e91ae3..26b0dbf824 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1611,21 +1611,21 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=False) - def test_nchunks_initialized(self): - for fill_value in -1, 0, 1, 10: - z = self.create_array(shape=100, chunks=10, fill_value=fill_value) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - z[:] = fill_value - assert 0 == z.nchunks_initialized - z[0] = 42 - assert 1 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() + def test_nchunks_initialized(self): + for fill_value in -1, 0, 1, 10: + z = self.create_array(shape=100, chunks=10, fill_value=fill_value) + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = fill_value + assert 0 == z.nchunks_initialized + z[0] = 42 + assert 1 == z.nchunks_initialized + if hasattr(z.store, 'close'): + z.store.close() class TestArrayWithDirectoryStore(TestArray): @@ -2760,3 +2760,35 @@ def test_read_from_all_blocks(self): z[2:99_000] = 1 b = Array(z.store, read_only=True, partial_decompress=True) assert (b[2:99_000] == 1).all() + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStoreNoEmptyWrites(TestArray): + @staticmethod + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = kwargs.pop('key_separator', ".") + store = FSStore(path, key_separator=key_separator, auto_mkdir=True) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Blosc()) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=False) + + + def test_nchunks_initialized(self): + for fill_value in -1, 0, 1, 10: + z = self.create_array(shape=100, chunks=10, fill_value=fill_value) + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = fill_value + assert 0 == z.nchunks_initialized + z[0] = 42 + assert 1 == z.nchunks_initialized + if hasattr(z.store, 'close'): + z.store.close() \ No newline at end of file From dbc32fd8278cf5c61b75cd0ae46317e38479715f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 May 2021 18:07:23 -0400 Subject: [PATCH 17/54] flake8 --- zarr/tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 26b0dbf824..b8a2801f5a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -2761,6 +2761,7 @@ def test_read_from_all_blocks(self): b = Array(z.store, read_only=True, partial_decompress=True) assert (b[2:99_000] == 1).all() + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestArrayWithFSStoreNoEmptyWrites(TestArray): @staticmethod @@ -2776,7 +2777,6 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=False) - def test_nchunks_initialized(self): for fill_value in -1, 0, 1, 10: z = self.create_array(shape=100, chunks=10, fill_value=fill_value) @@ -2791,4 +2791,4 @@ def test_nchunks_initialized(self): z[0] = 42 assert 1 == z.nchunks_initialized if hasattr(z.store, 'close'): - z.store.close() \ No newline at end of file + z.store.close() From cd28affb9492e96ed6c0d6a90e1d20c9713dc7c2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 May 2021 19:15:26 -0400 Subject: [PATCH 18/54] add delitems method to FSStore, and correspondingly change zarr.Array behavior --- zarr/core.py | 4 ++-- zarr/storage.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index b16de60077..cf0fc72cbc 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1888,8 +1888,8 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): else: values[ckey] = self._encode_chunk(cdata) if len(to_delete) > 0: - if hasattr(self.chunk_store.map, 'delitems'): - self.chunk_store.map.delitems(to_delete) + if hasattr(self.chunk_store, 'delitems'): + self.chunk_store.delitems(to_delete) else: [self._chunk_delitem(k) for k in to_delete] else: diff --git a/zarr/storage.py b/zarr/storage.py index 82394a41fd..0a03edac7b 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1123,6 +1123,12 @@ def __delitem__(self, key): else: del self.map[key] + def delitems(self, keys): + if self.mode == 'r': + raise ReadOnlyError + nkeys = [self._normalize_key(key) for key in keys] + self.map.delitems(nkeys) + def __contains__(self, key): key = self._normalize_key(key) return key in self.map From 160c2dc490f1a222ebc3de9454fb76d201ffca7e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 27 May 2021 19:15:53 -0400 Subject: [PATCH 19/54] add nested + empty writes test --- zarr/tests/test_core.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b8a2801f5a..4397556377 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -2792,3 +2792,35 @@ def test_nchunks_initialized(self): assert 1 == z.nchunks_initialized if hasattr(z.store, 'close'): z.store.close() + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStoreNestedNoEmptyWrites(TestArray): + @staticmethod + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = kwargs.pop('key_separator', "/") + store = FSStore(path, key_separator=key_separator, auto_mkdir=True) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Blosc()) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=False) + + def test_nchunks_initialized(self): + for fill_value in -1, 0, 1, 10: + z = self.create_array(shape=100, chunks=10, fill_value=fill_value) + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = fill_value + assert 0 == z.nchunks_initialized + z[0] = 42 + assert 1 == z.nchunks_initialized + if hasattr(z.store, 'close'): + z.store.close() From af715fe58d7c2848768c8b761a3b5ef4e0ba5468 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 12:22:56 -0400 Subject: [PATCH 20/54] set write_empty_chunks to True by default --- zarr/core.py | 27 +++++++++++++++------------ zarr/creation.py | 15 +++++++++------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index cf0fc72cbc..9398106138 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -145,7 +145,7 @@ def __init__( cache_metadata=True, cache_attrs=True, partial_decompress=False, - write_empty_chunks=True, + write_empty_chunks=False, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized @@ -1881,17 +1881,11 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): for key, sel, val in zip(ckeys, lchunk_selection, values)] values = {} if not self._write_empty_chunks: - to_delete = [] for ckey, cdata in zip(ckeys, cdatas): if self._chunk_isempty(cdata): - to_delete.append(ckey) + self._chunk_delitem(ckey) else: values[ckey] = self._encode_chunk(cdata) - if len(to_delete) > 0: - if hasattr(self.chunk_store, 'delitems'): - self.chunk_store.delitems(to_delete) - else: - [self._chunk_delitem(k) for k in to_delete] else: values = dict(zip(ckeys, map(self._encode_chunk, cdatas))) self.chunk_store.setitems(values) @@ -1905,15 +1899,21 @@ def _chunk_isempty(self, chunk): is_empty = np.all(chunk == self._fill_value) return is_empty + def _chunk_delitems(self, ckeys): + if isinstance(ckeys, str): + ckeys = [ckeys] + # todo: use the delitem method of fsstore.FSMap when it is better tested. + return tuple(map(self._chunk_delitem, ckeys)) + def _chunk_delitem(self, ckey): """ Attempt to delete the value associated with ckey. - Silently handle keyerror. """ try: del self.chunk_store[ckey] + return except KeyError: - pass + return def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): """Replace part or whole of a chunk. @@ -1942,13 +1942,16 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): + do_store = True ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) # clear chunk if it only contains the fill value if (not self._write_empty_chunks) and self._chunk_isempty(cdata): - self._chunk_delitem(ckey) - else: + do_store = not self._chunk_delitem(ckey) + + # store + if do_store: self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): diff --git a/zarr/creation.py b/zarr/creation.py index 824a2de3e9..826e723893 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -20,7 +20,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs): + object_codec=None, dimension_separator=None, write_empty_chunks=False, **kwargs): """Create an array. Parameters @@ -72,10 +72,13 @@ def create(shape, chunks=True, dtype=None, compressor='default', .. versionadded:: 2.8 write_empty_chunks : bool, optional Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for - the chunk key of an empty chunk will be deleted. Note that setting this option to False - will incur additional overhead per chunk write. + If False (default), empty chunks will not be written, and the `store` entry for + the chunk key of an empty chunk will be deleted. Note that writings chunk with + `write_empty_chunks=False` will incur overhead associated with checking the contents + of each chunk. Use `write_empty_chunks=True` to avoid this overhead, at the expense of + potentially creating unnecessary objects in storage. + + Returns ------- @@ -403,7 +406,7 @@ def open_array( chunk_store=None, storage_options=None, partial_decompress=False, - write_empty_chunks=True, + write_empty_chunks=False, **kwargs ): """Open an array using file-mode-like semantics. From 59328f099e96c9bd1a0942da0d8ad97e2e25a03d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 13:15:16 -0400 Subject: [PATCH 21/54] rename chunk_is_empty method and clean up logic in _chunk_setitem --- zarr/core.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 9398106138..cac3cba7ca 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1596,7 +1596,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): chunk[selection] = value # clear chunk if it only contains the fill value - if self._chunk_isempty(chunk): + if self._chunk_is_empty(chunk): try: del self.chunk_store[ckey] return @@ -1882,7 +1882,7 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): values = {} if not self._write_empty_chunks: for ckey, cdata in zip(ckeys, cdatas): - if self._chunk_isempty(cdata): + if self._chunk_is_empty(cdata): self._chunk_delitem(ckey) else: values[ckey] = self._encode_chunk(cdata) @@ -1890,7 +1890,7 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): values = dict(zip(ckeys, map(self._encode_chunk, cdatas))) self.chunk_store.setitems(values) - def _chunk_isempty(self, chunk): + def _chunk_is_empty(self, chunk): if self.dtype == 'object': # we have to flatten the result of np.equal to handle outputs like # [np.array([True,True]), True, True] @@ -1942,16 +1942,13 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None): fields=fields) def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None): - do_store = True ckey = self._chunk_key(chunk_coords) cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) - # clear chunk if it only contains the fill value - if (not self._write_empty_chunks) and self._chunk_isempty(cdata): - do_store = not self._chunk_delitem(ckey) - - # store - if do_store: + # attempt to delete chunk if it only contains the fill value + if (not self._write_empty_chunks) and self._chunk_is_empty(cdata): + self._chunk_delitem(ckey) + else: self.chunk_store[ckey] = self._encode_chunk(cdata) def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): From 72488a84bab289ab52c6158d2dd885c5a3cbe88b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 13:15:31 -0400 Subject: [PATCH 22/54] rename test --- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9d99dc37c4..43ed096402 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1598,7 +1598,7 @@ def test_nbytes_stored(self): assert -1 == z.nbytes_stored -class TestArrayWithoutEmptyWrites(TestArray): +class TestArrayNoEmptyWrites(TestArray): @staticmethod def create_array(read_only=False, **kwargs): From 7489ae974d40242b903b7819499bcfabebe81058 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 13:50:00 -0400 Subject: [PATCH 23/54] add test for flatten --- zarr/tests/test_util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index fa1f18fa63..3c435caeb2 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from zarr.util import (guess_chunks, human_readable_size, info_html_report, +from zarr.util import (flatten, guess_chunks, human_readable_size, info_html_report, info_text_report, is_total_slice, normalize_chunks, normalize_dimension_separator, normalize_fill_value, normalize_order, @@ -211,3 +211,9 @@ def fail(x): for x in range(11, 15): pytest.raises(PermissionError, fail, x) + + +def test_flatten(): + assert list(flatten(['0', ['1', ['2', ['3', [4, ]]]]])) == ['0', '1', '2', '3', 4] + assert list(flatten('foo')) == ['f', 'o', 'o'] + assert list(flatten(['foo'])) == ['foo'] From 10199b3adfcaba6fe91a8090eb97c4369ea4ac9c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 16:49:29 -0400 Subject: [PATCH 24/54] initial support for using delitems api in chunk_setitems --- zarr/core.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index cac3cba7ca..c562f45db9 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1876,19 +1876,22 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection, out[out_select] = fill_value def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): - ckeys = [self._chunk_key(co) for co in lchunk_coords] - cdatas = [self._process_for_setitem(key, sel, val, fields=fields) - for key, sel, val in zip(ckeys, lchunk_selection, values)] - values = {} - if not self._write_empty_chunks: - for ckey, cdata in zip(ckeys, cdatas): - if self._chunk_is_empty(cdata): + ckeys = map(self._chunk_key, lchunk_coords) + cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) + for key, sel, val in zip(ckeys, lchunk_selection, values)} + to_store = {} + if not self._write_empty_chunks: + empty_chunks = {k: v for k,v in cdatas.items() if self._chunk_is_empty(v)} + if hasattr(self.store, 'delitems'): + self.store.delitems(tuple(empty_chunks.keys())) + else: + for ckey in empty_chunks.keys(): self._chunk_delitem(ckey) - else: - values[ckey] = self._encode_chunk(cdata) + nonempty_keys = cdatas.keys() - empty_chunks.keys() + to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} else: - values = dict(zip(ckeys, map(self._encode_chunk, cdatas))) - self.chunk_store.setitems(values) + to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} + self.chunk_store.setitems(to_store) def _chunk_is_empty(self, chunk): if self.dtype == 'object': @@ -1902,8 +1905,12 @@ def _chunk_is_empty(self, chunk): def _chunk_delitems(self, ckeys): if isinstance(ckeys, str): ckeys = [ckeys] - # todo: use the delitem method of fsstore.FSMap when it is better tested. - return tuple(map(self._chunk_delitem, ckeys)) + + if hasattr(self.store, "delitems"): + del_op = self.store.delitems(ckeys) + else: + del_op = tuple(map(self._chunk_delitem, ckeys)) + return None def _chunk_delitem(self, ckey): """ From 88b48118e4cc9b21f3b1237e4a34ec49f4cb60d6 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 21:48:25 -0400 Subject: [PATCH 25/54] flake8 --- zarr/core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index c562f45db9..58769cdaa5 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1880,15 +1880,15 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) for key, sel, val in zip(ckeys, lchunk_selection, values)} to_store = {} - if not self._write_empty_chunks: - empty_chunks = {k: v for k,v in cdatas.items() if self._chunk_is_empty(v)} + if not self._write_empty_chunks: + empty_chunks = {k: v for k, v in cdatas.items() if self._chunk_is_empty(v)} if hasattr(self.store, 'delitems'): self.store.delitems(tuple(empty_chunks.keys())) else: for ckey in empty_chunks.keys(): self._chunk_delitem(ckey) nonempty_keys = cdatas.keys() - empty_chunks.keys() - to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} + to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} else: to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} self.chunk_store.setitems(to_store) @@ -1907,9 +1907,9 @@ def _chunk_delitems(self, ckeys): ckeys = [ckeys] if hasattr(self.store, "delitems"): - del_op = self.store.delitems(ckeys) - else: - del_op = tuple(map(self._chunk_delitem, ckeys)) + self.store.delitems(ckeys) + else: + tuple(map(self._chunk_delitem, ckeys)) return None def _chunk_delitem(self, ckey): From 3c6971920803f175f0ebcf23775682fb3dbfb902 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 21:48:58 -0400 Subject: [PATCH 26/54] strip path separator that was screwing up store.listdir --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index de54ef6e9c..662ee100e4 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1168,7 +1168,7 @@ def listdir(self, path=None): if _prog_number.match(entry) and self.fs.isdir(entry_path): for file_name in self.fs.find(entry_path): file_path = os.path.join(dir_path, file_name) - rel_path = file_path.split(root_path)[1] + rel_path = file_path.split(root_path)[1].strip(os.path.sep) new_children.append(rel_path.replace(os.path.sep, '.')) else: new_children.append(entry) From 8aa93fa1d2cff5898865704044461d02c0f05aeb Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 14 Jul 2021 21:50:19 -0400 Subject: [PATCH 27/54] change tests to check for empty writing behavior --- zarr/tests/test_core.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 43ed096402..784dea74a6 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -2765,7 +2765,7 @@ def test_read_from_all_blocks(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNoEmptyWrites(TestArray): +class TestArrayWithFSStoreEmptyWrites(TestArray): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2777,7 +2777,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) + cache_attrs=cache_attrs, write_empty_chunks=True) def test_nchunks_initialized(self): for fill_value in -1, 0, 1, 10: @@ -2789,15 +2789,13 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized z[:] = fill_value - assert 0 == z.nchunks_initialized - z[0] = 42 - assert 1 == z.nchunks_initialized + assert 10 == z.nchunks_initialized if hasattr(z.store, 'close'): z.store.close() @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNestedNoEmptyWrites(TestArray): +class TestArrayWithFSStoreNestedEmptyWrites(TestArray): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2809,7 +2807,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) + cache_attrs=cache_attrs, write_empty_chunks=True) def test_nchunks_initialized(self): for fill_value in -1, 0, 1, 10: @@ -2821,8 +2819,6 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized z[:] = fill_value - assert 0 == z.nchunks_initialized - z[0] = 42 - assert 1 == z.nchunks_initialized + assert 10 == z.nchunks_initialized if hasattr(z.store, 'close'): z.store.close() From 0dae9da2c5aa4eb1ef817654dc94dcf35c0c47cf Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 15 Jul 2021 09:08:42 -0400 Subject: [PATCH 28/54] bump fsspec and s3fs versions --- requirements_dev_optional.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index 8584e7bbad..3bd8d10bf2 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -17,6 +17,6 @@ flake8==3.9.2 pytest-cov==2.12.1 pytest-doctestplus==0.10.0 h5py==3.2.1 -s3fs==2021.6.0 -fsspec==2021.6.0 +s3fs==2021.7.0 +fsspec==2021.7.0 moto[server]>=1.3.14 From 40c3f147ff18242fee7f39161a222cc6a0a3ba38 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 15 Jul 2021 10:46:40 -0400 Subject: [PATCH 29/54] delitems: only attempt to delete keys that exist --- zarr/storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 662ee100e4..d3b3b27ddd 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1126,7 +1126,8 @@ def __delitem__(self, key): def delitems(self, keys): if self.mode == 'r': raise ReadOnlyError - nkeys = [self._normalize_key(key) for key in keys] + # only remove the keys that exist in the store + nkeys = [self._normalize_key(key) for key in keys if key in self] self.map.delitems(nkeys) def __contains__(self, key): From 7dde846f05e298e0bddaacd7bae9d823dd73400b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Aug 2021 10:49:54 -0400 Subject: [PATCH 30/54] cdon't pass empty collections to self.map.delitems --- zarr/storage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index d3b3b27ddd..b49bef3a99 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1128,7 +1128,9 @@ def delitems(self, keys): raise ReadOnlyError # only remove the keys that exist in the store nkeys = [self._normalize_key(key) for key in keys if key in self] - self.map.delitems(nkeys) + # rm errors if you pass an empty collection + if len(nkeys) > 0: + self.map.delitems(nkeys) def __contains__(self, key): key = self._normalize_key(key) From 7a45fd20ec5e958191c6170a27fe4ca1e7a74b70 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 17 Aug 2021 10:50:38 -0400 Subject: [PATCH 31/54] flake8 --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index b49bef3a99..6620057158 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1128,7 +1128,7 @@ def delitems(self, keys): raise ReadOnlyError # only remove the keys that exist in the store nkeys = [self._normalize_key(key) for key in keys if key in self] - # rm errors if you pass an empty collection + # rm errors if you pass an empty collection if len(nkeys) > 0: self.map.delitems(nkeys) From 99f59ef360f7456ff5133e17217645c30cd62f9c Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 19 Aug 2021 19:43:44 -0400 Subject: [PATCH 32/54] use main branch of fsspec until a new release is cut --- requirements_dev_optional.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index 3bd8d10bf2..8947216630 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -18,5 +18,5 @@ pytest-cov==2.12.1 pytest-doctestplus==0.10.0 h5py==3.2.1 s3fs==2021.7.0 -fsspec==2021.7.0 +git+https://github.com/intake/filesystem_spec.git/@master#egg=fsspec moto[server]>=1.3.14 From a6ba3c7d7550e2f5a65131b7630a6c54245766e9 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Aug 2021 11:20:43 -0400 Subject: [PATCH 33/54] add support for checking if a chunk is all nans in _chunk_is_empty --- zarr/core.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 58769cdaa5..f823147663 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1893,13 +1893,21 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} self.chunk_store.setitems(to_store) - def _chunk_is_empty(self, chunk): - if self.dtype == 'object': - # we have to flatten the result of np.equal to handle outputs like - # [np.array([True,True]), True, True] - is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) + def _chunk_is_empty(self, chunk) -> bool: + if self._fill_value is None: + is_empty = False else: - is_empty = np.all(chunk == self._fill_value) + if self.dtype == 'object': + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) + else: + # Numpy errors if you call np.isnan on custom dtypes, so ensure + # we are working with floats before calling isnan + if isinstance(self._fill_value, float) and np.isnan(self._fill_value): + is_empty = np.all(np.isnan(chunk)) + else: + is_empty = np.all(chunk == self._fill_value) return is_empty def _chunk_delitems(self, ckeys): From 6f8b6c4200a622861545bddc535476a7c79de558 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Aug 2021 11:40:02 -0400 Subject: [PATCH 34/54] docstring tweaks --- zarr/core.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index f823147663..b624482a62 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -76,11 +76,14 @@ class Array: is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for - the chunk key of an empty chunk will be deleted. Note that setting this option to False - will incur additional overhead per chunk write. + Sets chunk writing behavior for chunks filled with `fill_value` + ("empty" chunks). If False (default), empty chunks will not be written, + and the `store` entry for the chunk key of an empty chunk will be deleted. + This setting enables sparser storage, because only chunks with + non-fill-value data are be written to disk, at the expense of + computational overhead, because each chunk must be compared with + the array's fill_value before writing. + If True, all chunks will be written regardless of their contents. .. versionadded:: 2.7 From 7025d19d9dcc30be25d785c13830c4f13b7f29b1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Aug 2021 12:42:34 -0400 Subject: [PATCH 35/54] clean up empty_write tests --- zarr/tests/test_core.py | 83 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 784dea74a6..af0789bbac 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -983,14 +983,27 @@ def test_array_0d(self): z.store.close() def test_nchunks_initialized(self): + for fill_value in (0, 1.0, np.nan): + if isinstance(fill_value, int): + dtype = 'int' + else: + dtype = 'float' + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype) - z = self.create_array(shape=100, chunks=10) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = z.fill_value + if z._write_empty_chunks: + assert 10 == z.nchunks_initialized + else: + assert 0 == z.nchunks_initialized if hasattr(z.store, 'close'): z.store.close() @@ -1598,7 +1611,7 @@ def test_nbytes_stored(self): assert -1 == z.nbytes_stored -class TestArrayNoEmptyWrites(TestArray): +class TestArrayEmptyWrites(TestArray): @staticmethod def create_array(read_only=False, **kwargs): @@ -1610,23 +1623,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) - - def test_nchunks_initialized(self): - for fill_value in -1, 0, 1, 10: - z = self.create_array(shape=100, chunks=10, fill_value=fill_value) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - z[:] = fill_value - assert 0 == z.nchunks_initialized - z[0] = 42 - assert 1 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() + cache_attrs=cache_attrs, write_empty_chunks=True) class TestArrayWithDirectoryStore(TestArray): @@ -1786,6 +1783,24 @@ def test_array_1d_fill_value(self): z = self.create_array(shape=(nvalues,), chunks=100, dtype=dtype, fill_value=1) + def test_nchunks_initialized(self): + fill_value = 0 + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value) + + assert 0 == z.nchunks_initialized + # manually put something into the store to confuse matters + z.store['foo'] = b'bar' + assert 0 == z.nchunks_initialized + z[:] = 42 + assert 10 == z.nchunks_initialized + z[:] = z.fill_value + assert 0 == z.nchunks_initialized + + if hasattr(z.store, 'close'): + z.store.close() + def test_array_order(self): # N5 only supports 'C' at the moment @@ -2765,7 +2780,7 @@ def test_read_from_all_blocks(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreEmptyWrites(TestArray): +class TestArrayWithFSStoreEmptyWrites(TestArrayEmptyWrites): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2795,7 +2810,7 @@ def test_nchunks_initialized(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNestedEmptyWrites(TestArray): +class TestArrayWithFSStoreNestedEmptyWrites(TestArrayEmptyWrites): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2808,17 +2823,3 @@ def create_array(read_only=False, **kwargs): init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=True) - - def test_nchunks_initialized(self): - for fill_value in -1, 0, 1, 10: - z = self.create_array(shape=100, chunks=10, fill_value=fill_value) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - z[:] = fill_value - assert 10 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() From 3abcbc3ebc0b2f270a743f35cfb3f9695ce7e3f3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 20 Aug 2021 13:14:15 -0400 Subject: [PATCH 36/54] fix hexdigests for FSStore + empty writes, and remove redundant nchunks_initialized test --- zarr/tests/test_core.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 239656e2ba..0867cebc5f 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -2794,19 +2794,14 @@ def create_array(read_only=False, **kwargs): return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=True) - def test_nchunks_initialized(self): - for fill_value in -1, 0, 1, 10: - z = self.create_array(shape=100, chunks=10, fill_value=fill_value) - assert 0 == z.nchunks_initialized - # manually put something into the store to confuse matters - z.store['foo'] = b'bar' - assert 0 == z.nchunks_initialized - z[:] = 42 - assert 10 == z.nchunks_initialized - z[:] = fill_value - assert 10 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() + def expected(self): + return [ + "ab753fc81df0878589535ca9bad2816ba88d91bc", + "c16261446f9436b1e9f962e57ce3e8f6074abe8a", + "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", + "6e52f95ac15b164a8e96843a230fcee0e610729b", + "091fa99bc60706095c9ce30b56ce2503e0223f56", + ] @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") @@ -2823,3 +2818,12 @@ def create_array(read_only=False, **kwargs): init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=True) + + def expected(self): + return [ + "94884f29b41b9beb8fc99ad7bf9c0cbf0f2ab3c9", + "077aa3bd77b8d354f8f6c15dce5ae4f545788a72", + "22be95d83c097460adb339d80b2d7fe19c513c16", + "85131cec526fa46938fd2c4a6083a58ee11037ea", + "c3167010c162c6198cb2bf3c1da2c46b047c69a1", + ] From b81c14ae4b4fd749cb6f2ab7651446ed2c7b5aba Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sat, 2 Oct 2021 11:47:27 -0400 Subject: [PATCH 37/54] resolve merge conflicts in favor of master --- zarr/storage.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 22c1c61e4a..804deea13c 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1202,14 +1202,9 @@ def listdir(self, path=None): if _prog_number.match(entry) and self.fs.isdir(entry_path): for file_name in self.fs.find(entry_path): file_path = os.path.join(dir_path, file_name) -<<<<<<< HEAD - rel_path = file_path.split(root_path)[1].strip(os.path.sep) - new_children.append(rel_path.replace(os.path.sep, '.')) -======= rel_path = file_path.split(root_path)[1] rel_path = rel_path.lstrip('/') new_children.append(rel_path.replace('/', '.')) ->>>>>>> 0200365bf96fe829e2106b2888ea19d80d9e19ca else: new_children.append(entry) return sorted(new_children) From 05716a3652888bad0ddcfd1ba8827823a54c0bec Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 20:16:55 -0400 Subject: [PATCH 38/54] set write_empty_chunks to True by default; put chunk emptiness checking in a function in util.py; optimize chunk emptiness checking --- zarr/core.py | 52 +++++++++++++++++------------------------ zarr/creation.py | 16 ++++++------- zarr/tests/test_core.py | 23 +++++++++--------- zarr/tests/test_util.py | 23 +++++++++++++++++- zarr/util.py | 33 ++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 52 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index f7aa232dd3..9a11239db1 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -31,9 +31,9 @@ from zarr.meta import decode_array_metadata, encode_array_metadata from zarr.storage import array_meta_key, attrs_key, getsize, listdir from zarr.util import ( + all_equal, InfoReporter, check_array_shape, - flatten, human_readable_size, is_total_slice, nolock, @@ -76,14 +76,13 @@ class Array: is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. write_empty_chunks : bool, optional - Sets chunk writing behavior for chunks filled with `fill_value` - ("empty" chunks). If False (default), empty chunks will not be written, - and the `store` entry for the chunk key of an empty chunk will be deleted. - This setting enables sparser storage, because only chunks with - non-fill-value data are be written to disk, at the expense of - computational overhead, because each chunk must be compared with - the array's fill_value before writing. - If True, all chunks will be written regardless of their contents. + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If the chunk a uniformly equal to the fill + value, then that chunk is not be stored, and the store entry for + that chunk's key is deleted. This setting enables sparser storage + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. .. versionadded:: 2.7 @@ -116,6 +115,7 @@ class Array: info vindex oindex + write_empty_chunks Methods ------- @@ -148,7 +148,7 @@ def __init__( cache_metadata=True, cache_attrs=True, partial_decompress=False, - write_empty_chunks=False, + write_empty_chunks=True, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized @@ -455,6 +455,13 @@ def vindex(self): :func:`set_mask_selection` for documentation and examples.""" return self._vindex + @property + def write_empty_chunks(self) -> bool: + """A Boolean, True if chunks composed of the array's fill value + will be stored. If False, such chunks will not be stored. + """ + return self._write_empty_chunks + def __eq__(self, other): return ( isinstance(other, Array) and @@ -1599,7 +1606,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): chunk[selection] = value # clear chunk if it only contains the fill value - if self._chunk_is_empty(chunk): + if all_equal(self.fill_value, chunk): try: del self.chunk_store[ckey] return @@ -1883,8 +1890,8 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields) for key, sel, val in zip(ckeys, lchunk_selection, values)} to_store = {} - if not self._write_empty_chunks: - empty_chunks = {k: v for k, v in cdatas.items() if self._chunk_is_empty(v)} + if not self.write_empty_chunks: + empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} if hasattr(self.store, 'delitems'): self.store.delitems(tuple(empty_chunks.keys())) else: @@ -1896,23 +1903,6 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()} self.chunk_store.setitems(to_store) - def _chunk_is_empty(self, chunk) -> bool: - if self._fill_value is None: - is_empty = False - else: - if self.dtype == 'object': - # we have to flatten the result of np.equal to handle outputs like - # [np.array([True,True]), True, True] - is_empty = all(flatten(np.equal(chunk, self.fill_value, dtype='object'))) - else: - # Numpy errors if you call np.isnan on custom dtypes, so ensure - # we are working with floats before calling isnan - if isinstance(self._fill_value, float) and np.isnan(self._fill_value): - is_empty = np.all(np.isnan(chunk)) - else: - is_empty = np.all(chunk == self._fill_value) - return is_empty - def _chunk_delitems(self, ckeys): if isinstance(ckeys, str): ckeys = [ckeys] @@ -1964,7 +1954,7 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields) # attempt to delete chunk if it only contains the fill value - if (not self._write_empty_chunks) and self._chunk_is_empty(cdata): + if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata): self._chunk_delitem(ckey) else: self.chunk_store[ckey] = self._encode_chunk(cdata) diff --git a/zarr/creation.py b/zarr/creation.py index a1b1e8d1ff..bb59f81b81 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, dimension_separator=None, write_empty_chunks=False, **kwargs): + object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs): """Create an array. Parameters @@ -72,13 +72,13 @@ def create(shape, chunks=True, dtype=None, compressor='default', Separator placed between the dimensions of a chunk. .. versionadded:: 2.8 write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If False (default), empty chunks will not be written, and the `store` entry for - the chunk key of an empty chunk will be deleted. Note that writings chunk with - `write_empty_chunks=False` will incur overhead associated with checking the contents - of each chunk. Use `write_empty_chunks=True` to avoid this overhead, at the expense of - potentially creating unnecessary objects in storage. - + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If the chunk a uniformly equal to the fill + value, then that chunk is not be stored, and the store entry for + that chunk's key is deleted. This setting enables sparser storage + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. Returns diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 57dbe109d2..f9ba505ba3 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1658,7 +1658,7 @@ def test_nbytes_stored(self): assert -1 == z.nbytes_stored -class TestArrayEmptyWrites(TestArray): +class TestArrayNoEmptyWrites(TestArray): @staticmethod def create_array(read_only=False, **kwargs): @@ -1670,7 +1670,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=True) + cache_attrs=cache_attrs, write_empty_chunks=False) class TestArrayWithDirectoryStore(TestArray): @@ -1831,10 +1831,9 @@ def test_array_1d_fill_value(self): fill_value=1) def test_nchunks_initialized(self): - fill_value = 0 z = self.create_array(shape=100, chunks=10, - fill_value=fill_value) + fill_value=0) assert 0 == z.nchunks_initialized # manually put something into the store to confuse matters @@ -1843,10 +1842,10 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized z[:] = z.fill_value - assert 0 == z.nchunks_initialized - - if hasattr(z.store, 'close'): - z.store.close() + if z.write_empty_chunks: + assert 10 == z.nchunks_initialized + else: + assert 0 == z.nchunks_initialized def test_array_order(self): @@ -2855,7 +2854,7 @@ def test_read_from_all_blocks(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreEmptyWrites(TestArrayEmptyWrites): +class TestArrayWithFSStoreNoEmptyWrites(TestArrayNoEmptyWrites): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2867,7 +2866,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=True) + cache_attrs=cache_attrs, write_empty_chunks=False) def expected(self): return [ @@ -2880,7 +2879,7 @@ def expected(self): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNestedEmptyWrites(TestArrayEmptyWrites): +class TestArrayWithFSStoreNestedNoEmptyWrites(TestArrayNoEmptyWrites): @staticmethod def create_array(read_only=False, **kwargs): path = mkdtemp() @@ -2892,7 +2891,7 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=True) + cache_attrs=cache_attrs, write_empty_chunks=False) def expected(self): return [ diff --git a/zarr/tests/test_util.py b/zarr/tests/test_util.py index 3c435caeb2..a65b26bae8 100644 --- a/zarr/tests/test_util.py +++ b/zarr/tests/test_util.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from zarr.util import (flatten, guess_chunks, human_readable_size, info_html_report, +from zarr.util import (all_equal, flatten, guess_chunks, human_readable_size, info_html_report, info_text_report, is_total_slice, normalize_chunks, normalize_dimension_separator, normalize_fill_value, normalize_order, @@ -217,3 +217,24 @@ def test_flatten(): assert list(flatten(['0', ['1', ['2', ['3', [4, ]]]]])) == ['0', '1', '2', '3', 4] assert list(flatten('foo')) == ['f', 'o', 'o'] assert list(flatten(['foo'])) == ['foo'] + + +def test_all_equal(): + assert all_equal(0, np.zeros((10, 10, 10))) + assert not all_equal(1, np.zeros((10, 10, 10))) + + assert all_equal(1, np.ones((10, 10, 10))) + assert not all_equal(1, 1 + np.ones((10, 10, 10))) + + assert all_equal(np.nan, np.array([np.nan, np.nan])) + assert not all_equal(np.nan, np.array([np.nan, 1.0])) + + assert all_equal({'a': -1}, np.array([{'a': -1}, {'a': -1}], dtype='object')) + assert not all_equal({'a': -1}, np.array([{'a': -1}, {'a': 2}], dtype='object')) + + assert all_equal(np.timedelta64(999, 'D'), np.array([999, 999], dtype='timedelta64[D]')) + assert not all_equal(np.timedelta64(999, 'D'), np.array([999, 998], dtype='timedelta64[D]')) + + # all_equal(None, *) always returns False + assert not all_equal(None, np.array([None, None])) + assert not all_equal(None, np.array([None, 10])) diff --git a/zarr/util.py b/zarr/util.py index b67911ab7d..c2769de205 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -14,6 +14,7 @@ from numcodecs.registry import codec_registry from numcodecs.blosc import cbuffer_sizes, cbuffer_metainfo +from numpy.typing import ArrayLike from typing import Any, Callable, Dict, Optional, Tuple, Union @@ -659,3 +660,35 @@ def retry_call(callabl: Callable, time.sleep(wait) else: raise + + +def all_equal(value: Any, array: ArrayLike) -> bool: + """ + Test if all the elements of an array are equivalent to a value. + If `value` is None, then this function does not do any comparison and + returns False. + """ + + if value is None: + return False + if not value: + # if `value` is falsey, then just 1 truthy value in `array` + # is sufficient to return False. We assume here that np.any is + # optimized to return on the first truthy value in `array`. + try: + return not np.any(array) + except TypeError: + pass + if np.issubdtype(array.dtype, np.object_): + # we have to flatten the result of np.equal to handle outputs like + # [np.array([True,True]), True, True] + return all(flatten(np.equal(value, array, dtype=array.dtype))) + else: + # Numpy errors if you call np.isnan on custom dtypes, so ensure + # we are working with floats before calling isnan + if np.issubdtype(array.dtype, np.floating) and np.isnan(value): + return np.all(np.isnan(array)) + else: + # using == raises warnings from numpy deprecated pattern, but + # using np.equal() raises type errors for structured dtypes... + return np.all(value == array) From b921b34f7907391344d9273a6fc4b976310b94df Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 20:36:23 -0400 Subject: [PATCH 39/54] remove np.typing import --- zarr/util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zarr/util.py b/zarr/util.py index c2769de205..d9b0ce7c99 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -14,7 +14,6 @@ from numcodecs.registry import codec_registry from numcodecs.blosc import cbuffer_sizes, cbuffer_metainfo -from numpy.typing import ArrayLike from typing import Any, Callable, Dict, Optional, Tuple, Union @@ -662,7 +661,7 @@ def retry_call(callabl: Callable, raise -def all_equal(value: Any, array: ArrayLike) -> bool: +def all_equal(value: Any, array: Any) -> bool: """ Test if all the elements of an array are equivalent to a value. If `value` is None, then this function does not do any comparison and From 48c38c7ba36f4db551acc6e52a59b091c68a6322 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 21:46:36 -0400 Subject: [PATCH 40/54] use public attribute in test_nchunks_initialized --- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index f9ba505ba3..a488829315 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1001,7 +1001,7 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized z[:] = z.fill_value - if z._write_empty_chunks: + if z.write_empty_chunks: assert 10 == z.nchunks_initialized else: assert 0 == z.nchunks_initialized From 7e4fbadbca0242fade3b615f1d2251929b310fe3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 22:54:51 -0400 Subject: [PATCH 41/54] remove duplicated logic in _chunk_setitems, instead using _chunk_delitems; clean up 0d empty writes; add coverage exemptions --- zarr/core.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 9a11239db1..2b5115114c 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1605,20 +1605,18 @@ def _set_basic_selection_zd(self, selection, value, fields=None): else: chunk[selection] = value - # clear chunk if it only contains the fill value - if all_equal(self.fill_value, chunk): + # remove chunk if write_empty_chunks is false and it only contains the fill value + if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk): try: del self.chunk_store[ckey] return - except KeyError: - return - except Exception: + except Exception: # pragma: no cover # deleting failed, fallback to overwriting pass - - # encode and store - cdata = self._encode_chunk(chunk) - self.chunk_store[ckey] = cdata + else: + # encode and store + cdata = self._encode_chunk(chunk) + self.chunk_store[ckey] = cdata def _set_basic_selection_nd(self, selection, value, fields=None): # implementation of __setitem__ for array with at least one dimension @@ -1892,11 +1890,7 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): to_store = {} if not self.write_empty_chunks: empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)} - if hasattr(self.store, 'delitems'): - self.store.delitems(tuple(empty_chunks.keys())) - else: - for ckey in empty_chunks.keys(): - self._chunk_delitem(ckey) + self._chunk_delitems(empty_chunks.keys()) nonempty_keys = cdatas.keys() - empty_chunks.keys() to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys} else: @@ -1904,12 +1898,12 @@ def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None): self.chunk_store.setitems(to_store) def _chunk_delitems(self, ckeys): - if isinstance(ckeys, str): - ckeys = [ckeys] - if hasattr(self.store, "delitems"): self.store.delitems(ckeys) - else: + else: # pragma: no cover + # exempting this branch from coverage as there are no extant stores + # that will trigger this condition, but it's possible that they + # will be developed in the future. tuple(map(self._chunk_delitem, ckeys)) return None From b063f5290a7eb7802637591aea6aec5636a541da Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 23:00:30 -0400 Subject: [PATCH 42/54] expand 0d tests and nchunks_initialized tests to hit more parts of the write_empty_chunks logic --- zarr/tests/test_core.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index a488829315..60630ca572 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -973,6 +973,8 @@ def test_array_0d(self): assert 42 == z[()] z[()] = 43 assert 43 == z[()] + z[()] = z.fill_value + assert z.fill_value == z[()] with pytest.raises(IndexError): z[0] = 42 with pytest.raises(IndexError): @@ -1000,6 +1002,9 @@ def test_nchunks_initialized(self): assert 0 == z.nchunks_initialized z[:] = 42 assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[os.path.join(z.path, '0')] + assert 9 == z.nchunks_initialized z[:] = z.fill_value if z.write_empty_chunks: assert 10 == z.nchunks_initialized @@ -1841,6 +1846,8 @@ def test_nchunks_initialized(self): assert 0 == z.nchunks_initialized z[:] = 42 assert 10 == z.nchunks_initialized + del z.chunk_store[os.path.join(z.path, '0')] + assert 9 == z.nchunks_initialized z[:] = z.fill_value if z.write_empty_chunks: assert 10 == z.nchunks_initialized From 020475bfe17c6e0efa2de1824593e1539b041418 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Sun, 3 Oct 2021 23:16:02 -0400 Subject: [PATCH 43/54] remove return type annotation for all_equal that was breaking CI --- zarr/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index d9b0ce7c99..710f1f21b8 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -661,7 +661,7 @@ def retry_call(callabl: Callable, raise -def all_equal(value: Any, array: Any) -> bool: +def all_equal(value: Any, array: Any): """ Test if all the elements of an array are equivalent to a value. If `value` is None, then this function does not do any comparison and From a81ac833f348e86f198cccfc47f9730ed808cbc4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Oct 2021 10:33:16 -0400 Subject: [PATCH 44/54] refactor write_empty_chunks tests by expanding the create_array logic in the base test class, remove standalone write_empty_chunks tests --- zarr/tests/test_core.py | 201 ++++++++++++++++++++-------------------- zarr/tests/test_sync.py | 7 +- 2 files changed, 105 insertions(+), 103 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 60630ca572..0d7b233452 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -86,9 +86,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_text_keys(self): # Initialize array @@ -939,7 +940,7 @@ def test_array_0d(self): # setup a = np.zeros(()) - z = self.create_array(shape=(), dtype=a.dtype, fill_value=0) + z = self.create_array(shape=(), dtype=a.dtype, fill_value=0, write_empty_chunks=False) # check properties assert a.ndim == z.ndim @@ -994,7 +995,8 @@ def test_nchunks_initialized(self): z = self.create_array(shape=100, chunks=10, fill_value=fill_value, - dtype=dtype) + dtype=dtype, + write_empty_chunks=True) assert 0 == z.nchunks_initialized # manually put something into the store to confuse matters @@ -1005,14 +1007,27 @@ def test_nchunks_initialized(self): # manually remove a chunk from the store del z.chunk_store[os.path.join(z.path, '0')] assert 9 == z.nchunks_initialized + + if hasattr(z.store, 'close'): + z.store.close() + + # second round of similar tests with write_empty_chunks set to + # False + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=False) + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[os.path.join(z.path, '0')] + assert 9 == z.nchunks_initialized z[:] = z.fill_value - if z.write_empty_chunks: - assert 10 == z.nchunks_initialized - else: - assert 0 == z.nchunks_initialized + assert 0 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() + if hasattr(z.store, 'close'): + z.store.close() def test_array_dtype_shape(self): @@ -1563,9 +1578,11 @@ def create_array(read_only=False, **kwargs): store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, path='foo/bar', **kwargs) return Array(store, path='foo/bar', read_only=read_only, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -1618,9 +1635,11 @@ def create_array(read_only=False, **kwargs): chunk_store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, chunk_store=chunk_store, **kwargs) return Array(store, read_only=read_only, chunk_store=chunk_store, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -1663,21 +1682,6 @@ def test_nbytes_stored(self): assert -1 == z.nbytes_stored -class TestArrayNoEmptyWrites(TestArray): - - @staticmethod - def create_array(read_only=False, **kwargs): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - store = DirectoryStore(path) - cache_metadata = kwargs.pop('cache_metadata', True) - cache_attrs = kwargs.pop('cache_attrs', True) - kwargs.setdefault('compressor', Zlib(1)) - init_array(store, **kwargs) - return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) - - class TestArrayWithDirectoryStore(TestArray): @staticmethod @@ -1687,10 +1691,11 @@ def create_array(read_only=False, **kwargs): store = DirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): @@ -1718,9 +1723,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) @pytest.mark.xfail def test_nbytes_stored(self): @@ -1741,10 +1747,11 @@ def create_array(read_only=False, **kwargs): store = NestedDirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -1765,10 +1772,11 @@ def create_array(read_only=False, **kwargs): store = N5Store(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_array_0d(self): # test behaviour for array with 0 dimensions @@ -1836,9 +1844,13 @@ def test_array_1d_fill_value(self): fill_value=1) def test_nchunks_initialized(self): + fill_value = 0 + dtype = 'int' z = self.create_array(shape=100, chunks=10, - fill_value=0) + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=True) assert 0 == z.nchunks_initialized # manually put something into the store to confuse matters @@ -1846,13 +1858,30 @@ def test_nchunks_initialized(self): assert 0 == z.nchunks_initialized z[:] = 42 assert 10 == z.nchunks_initialized + # manually remove a chunk from the store + del z.chunk_store[os.path.join(z.path, '0')] + assert 9 == z.nchunks_initialized + + if hasattr(z.store, 'close'): + z.store.close() + + # second round of similar tests with write_empty_chunks set to + # False + z = self.create_array(shape=100, + chunks=10, + fill_value=fill_value, + dtype=dtype, + write_empty_chunks=False) + z[:] = 42 + assert 10 == z.nchunks_initialized + # manually remove a chunk from the store del z.chunk_store[os.path.join(z.path, '0')] assert 9 == z.nchunks_initialized z[:] = z.fill_value - if z.write_empty_chunks: - assert 10 == z.nchunks_initialized - else: - assert 0 == z.nchunks_initialized + assert 0 == z.nchunks_initialized + + if hasattr(z.store, 'close'): + z.store.close() def test_array_order(self): @@ -2081,9 +2110,10 @@ def create_array(read_only=False, **kwargs): cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) kwargs.setdefault('compressor', Zlib(1)) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) class TestArrayWithDBMStore(TestArray): @@ -2095,10 +2125,11 @@ def create_array(read_only=False, **kwargs): store = DBMStore(path, flag='n') cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_attrs=cache_attrs, - cache_metadata=cache_metadata) + cache_metadata=cache_metadata, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2114,10 +2145,11 @@ def create_array(read_only=False, **kwargs): store = DBMStore(path, flag='n', open=bsddb3.btopen) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2133,10 +2165,11 @@ def create_array(read_only=False, **kwargs): store = LMDBStore(path, buffers=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_bytes_values(self): pass # returns values as memoryviews/buffers instead of bytes @@ -2155,10 +2188,11 @@ def create_array(read_only=False, **kwargs): store = LMDBStore(path, buffers=False) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2174,10 +2208,11 @@ def create_array(read_only=False, **kwargs): store = SQLiteStore(path) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Zlib(1)) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): pass # not implemented @@ -2190,9 +2225,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', None) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2226,9 +2262,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2262,9 +2299,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2305,9 +2343,10 @@ def create_array(self, read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2348,9 +2387,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', compressor) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_attrs=cache_attrs, - cache_metadata=cache_metadata) + cache_metadata=cache_metadata, write_empty_chunks=write_empty_chunks) def test_hexdigest(self): # Check basic 1-D array @@ -2493,9 +2533,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_nbytes_stored(self): z = self.create_array(shape=1000, chunks=100) @@ -2512,9 +2553,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_cache_metadata(self): a1 = self.create_array(shape=100, chunks=10, dtype='i1', cache_metadata=False) @@ -2584,9 +2626,10 @@ def create_array(read_only=False, **kwargs): kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def test_store_has_bytes_values(self): # skip as the cache has no control over how the store provides values @@ -2603,10 +2646,11 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -2654,6 +2698,7 @@ def create_array(read_only=False, **kwargs): store = FSStore(path) cache_metadata = kwargs.pop("cache_metadata", True) cache_attrs = kwargs.pop("cache_attrs", True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault("compressor", Blosc()) init_array(store, **kwargs) return Array( @@ -2662,6 +2707,7 @@ def create_array(read_only=False, **kwargs): cache_metadata=cache_metadata, cache_attrs=cache_attrs, partial_decompress=True, + write_empty_chunks=write_empty_chunks ) def test_hexdigest(self): @@ -2730,10 +2776,11 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault('compressor', Blosc()) init_array(store, **kwargs) return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ @@ -2782,6 +2829,7 @@ def create_array(read_only=False, **kwargs): store = FSStore(path, key_separator=key_separator, auto_mkdir=True) cache_metadata = kwargs.pop("cache_metadata", True) cache_attrs = kwargs.pop("cache_attrs", True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) kwargs.setdefault("compressor", Blosc()) init_array(store, **kwargs) return Array( @@ -2790,6 +2838,7 @@ def create_array(read_only=False, **kwargs): cache_metadata=cache_metadata, cache_attrs=cache_attrs, partial_decompress=True, + write_empty_chunks=write_empty_chunks ) def expected(self): @@ -2858,53 +2907,3 @@ def test_read_from_all_blocks(self): z[2:99_000] = 1 b = Array(z.store, read_only=True, partial_decompress=True) assert (b[2:99_000] == 1).all() - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNoEmptyWrites(TestArrayNoEmptyWrites): - @staticmethod - def create_array(read_only=False, **kwargs): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = kwargs.pop('key_separator', ".") - store = FSStore(path, key_separator=key_separator, auto_mkdir=True) - cache_metadata = kwargs.pop('cache_metadata', True) - cache_attrs = kwargs.pop('cache_attrs', True) - kwargs.setdefault('compressor', Blosc()) - init_array(store, **kwargs) - return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) - - def expected(self): - return [ - "ab753fc81df0878589535ca9bad2816ba88d91bc", - "c16261446f9436b1e9f962e57ce3e8f6074abe8a", - "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", - "6e52f95ac15b164a8e96843a230fcee0e610729b", - "091fa99bc60706095c9ce30b56ce2503e0223f56", - ] - - -@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") -class TestArrayWithFSStoreNestedNoEmptyWrites(TestArrayNoEmptyWrites): - @staticmethod - def create_array(read_only=False, **kwargs): - path = mkdtemp() - atexit.register(shutil.rmtree, path) - key_separator = kwargs.pop('key_separator', "/") - store = FSStore(path, key_separator=key_separator, auto_mkdir=True) - cache_metadata = kwargs.pop('cache_metadata', True) - cache_attrs = kwargs.pop('cache_attrs', True) - kwargs.setdefault('compressor', Blosc()) - init_array(store, **kwargs) - return Array(store, read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs, write_empty_chunks=False) - - def expected(self): - return [ - "94884f29b41b9beb8fc99ad7bf9c0cbf0f2ab3c9", - "077aa3bd77b8d354f8f6c15dce5ae4f545788a72", - "22be95d83c097460adb339d80b2d7fe19c513c16", - "85131cec526fa46938fd2c4a6083a58ee11037ea", - "c3167010c162c6198cb2bf3c1da2c46b047c69a1", - ] diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py index 51b7fe0e10..274ce166be 100644 --- a/zarr/tests/test_sync.py +++ b/zarr/tests/test_sync.py @@ -99,10 +99,11 @@ def create_array(self, read_only=False, **kwargs): store = dict() cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) return Array(store, synchronizer=ThreadSynchronizer(), read_only=read_only, cache_metadata=cache_metadata, - cache_attrs=cache_attrs) + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) # noinspection PyMethodMayBeStatic def create_pool(self): @@ -141,12 +142,14 @@ def create_array(self, read_only=False, **kwargs): store = DirectoryStore(path) cache_metadata = kwargs.pop('cache_metadata', False) cache_attrs = kwargs.pop('cache_attrs', False) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) init_array(store, **kwargs) sync_path = tempfile.mkdtemp() atexit.register(atexit_rmtree, sync_path) synchronizer = ProcessSynchronizer(sync_path) return Array(store, synchronizer=synchronizer, read_only=read_only, - cache_metadata=cache_metadata, cache_attrs=cache_attrs) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) # noinspection PyMethodMayBeStatic def create_pool(self): From 1c29fe89439999068845013c93c2b81c1b19f3ce Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Oct 2021 11:15:12 -0400 Subject: [PATCH 45/54] correctly handle merge from upstream master --- requirements_dev_optional.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index 5088c8edab..74d14fc2e0 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -18,9 +18,5 @@ flake8==3.9.2 pytest-cov==2.12.1 pytest-doctestplus==0.10.1 h5py==3.4.0 -<<<<<<< HEAD -fsspec[s3]==2021.08.1 -======= fsspec[s3]==2021.10.0 ->>>>>>> 4f8cb35ecd9a24f402a3a7a02d2efe177abaf5c8 moto[server]>=1.3.14 From 710b875a4a5f6eae231bd9bfdca517c64cbe3027 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Oct 2021 11:54:00 -0400 Subject: [PATCH 46/54] don't use os.path.join for constructing a chunk key; instead use _chunk_key method --- zarr/tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 0d7b233452..9c550facc6 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1004,8 +1004,8 @@ def test_nchunks_initialized(self): assert 0 == z.nchunks_initialized z[:] = 42 assert 10 == z.nchunks_initialized - # manually remove a chunk from the store - del z.chunk_store[os.path.join(z.path, '0')] + # manually remove the first chunk from the store + del z.chunk_store[z._chunk_key((0,))] assert 9 == z.nchunks_initialized if hasattr(z.store, 'close'): From 8a06884208253f334a929295ea47b34d3bb5682d Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 4 Oct 2021 12:43:36 -0400 Subject: [PATCH 47/54] complete removal of os.path.join calls --- zarr/tests/test_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9c550facc6..198553ae1d 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1021,7 +1021,7 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized # manually remove a chunk from the store - del z.chunk_store[os.path.join(z.path, '0')] + del z.chunk_store[z._chunk_key((0,))] assert 9 == z.nchunks_initialized z[:] = z.fill_value assert 0 == z.nchunks_initialized @@ -1859,7 +1859,7 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized # manually remove a chunk from the store - del z.chunk_store[os.path.join(z.path, '0')] + del z.chunk_store[z._chunk_key((0,))] assert 9 == z.nchunks_initialized if hasattr(z.store, 'close'): @@ -1875,7 +1875,7 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized # manually remove a chunk from the store - del z.chunk_store[os.path.join(z.path, '0')] + del z.chunk_store[z._chunk_key((0,))] assert 9 == z.nchunks_initialized z[:] = z.fill_value assert 0 == z.nchunks_initialized From 7f859c3f7e9917a21ee5753679558ca2baafd600 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 5 Oct 2021 21:32:12 -0400 Subject: [PATCH 48/54] add coverage exemption to type error branch in all_equal --- zarr/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/util.py b/zarr/util.py index 710f1f21b8..d092ffe0de 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -676,7 +676,7 @@ def all_equal(value: Any, array: Any): # optimized to return on the first truthy value in `array`. try: return not np.any(array) - except TypeError: + except TypeError: # pragma: no cover pass if np.issubdtype(array.dtype, np.object_): # we have to flatten the result of np.equal to handle outputs like From 0a7a3cc167f5692d1aaaeb819d6721818cdf3f03 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 Oct 2021 16:21:43 -0400 Subject: [PATCH 49/54] remove unreachable conditionals in n5 tests --- zarr/tests/test_core.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 198553ae1d..4544a6cae9 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1862,9 +1862,6 @@ def test_nchunks_initialized(self): del z.chunk_store[z._chunk_key((0,))] assert 9 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() - # second round of similar tests with write_empty_chunks set to # False z = self.create_array(shape=100, @@ -1880,9 +1877,6 @@ def test_nchunks_initialized(self): z[:] = z.fill_value assert 0 == z.nchunks_initialized - if hasattr(z.store, 'close'): - z.store.close() - def test_array_order(self): # N5 only supports 'C' at the moment From 1a0f41c3c1416ad6e5a6f801b6d24cb3d4144bb3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 Oct 2021 16:22:21 -0400 Subject: [PATCH 50/54] instantiate ReadOnlyError --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 804deea13c..92be9df0aa 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1156,7 +1156,7 @@ def __delitem__(self, key): def delitems(self, keys): if self.mode == 'r': - raise ReadOnlyError + raise ReadOnlyError() # only remove the keys that exist in the store nkeys = [self._normalize_key(key) for key in keys if key in self] # rm errors if you pass an empty collection From 94d5d0aef67895f300465aa694466cb55d1f746f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 Oct 2021 16:23:03 -0400 Subject: [PATCH 51/54] add explcit delitems and setitems calls to readonly fsstore tests --- zarr/tests/test_storage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 1412ec2099..51bc9bf782 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1012,6 +1012,12 @@ def test_read_only(self): with pytest.raises(PermissionError): del store['foo'] + with pytest.raises(PermissionError): + store.delitems(['foo']) + + with pytest.raises(PermissionError): + store.setitems({'foo': b'baz'}) + with pytest.raises(PermissionError): store.clear() From a918f1d2e8929b74d21ac2cc9e955bbd06e47811 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 7 Oct 2021 19:26:58 -0400 Subject: [PATCH 52/54] Update docstrings --- zarr/core.py | 4 ++-- zarr/creation.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 2b5115114c..7c0085c454 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -78,9 +78,9 @@ class Array: write_empty_chunks : bool, optional If True (default), all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill - value prior to storing. If the chunk a uniformly equal to the fill + value prior to storing. If a chunk is uniformly equal to the fill value, then that chunk is not be stored, and the store entry for - that chunk's key is deleted. This setting enables sparser storage + that chunk's key is deleted. This setting enables sparser storage, as only chunks with non-fill-value data are stored, at the expense of overhead associated with checking the data of each chunk. diff --git a/zarr/creation.py b/zarr/creation.py index bb59f81b81..75ff1d0212 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -74,9 +74,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', write_empty_chunks : bool, optional If True (default), all chunks will be stored regardless of their contents. If False, each chunk is compared to the array's fill - value prior to storing. If the chunk a uniformly equal to the fill + value prior to storing. If a chunk is uniformly equal to the fill value, then that chunk is not be stored, and the store entry for - that chunk's key is deleted. This setting enables sparser storage + that chunk's key is deleted. This setting enables sparser storage, as only chunks with non-fill-value data are stored, at the expense of overhead associated with checking the data of each chunk. @@ -410,7 +410,7 @@ def open_array( chunk_store=None, storage_options=None, partial_decompress=False, - write_empty_chunks=False, + write_empty_chunks=True, **kwargs ): """Open an array using file-mode-like semantics. @@ -466,11 +466,13 @@ def open_array( is Blosc, when getting data from the array chunks will be partially read and decompressed when possible. write_empty_chunks : bool, optional - Determines chunk writing behavior for chunks filled with `fill_value` ("empty" chunks). - If True (default), all chunks will be written regardless of their contents. - If False, empty chunks will not be written, and the `store` entry for - the chunk key of an empty chunk will be deleted. Note that setting this option to False - will incur additional overhead per chunk write. + If True (default), all chunks will be stored regardless of their + contents. If False, each chunk is compared to the array's fill + value prior to storing. If a chunk is uniformly equal to the fill + value, then that chunk is not be stored, and the store entry for + that chunk's key is deleted. This setting enables sparser storage, + as only chunks with non-fill-value data are stored, at the expense + of overhead associated with checking the data of each chunk. Returns ------- From 3dd1afdcc43568c2aadc3a787d5fa13605186853 Mon Sep 17 00:00:00 2001 From: Davis Bennett Date: Mon, 11 Oct 2021 10:14:36 -0400 Subject: [PATCH 53/54] Update requirements_dev_optional --- requirements_dev_optional.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index 74d14fc2e0..10f5990837 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -16,7 +16,7 @@ tox==3.24.4 coverage flake8==3.9.2 pytest-cov==2.12.1 -pytest-doctestplus==0.10.1 +pytest-doctestplus==0.11.0 h5py==3.4.0 fsspec[s3]==2021.10.0 moto[server]>=1.3.14 From 4a4adb1f9e81f09a450414c24c91fdb40521e916 Mon Sep 17 00:00:00 2001 From: jmoore Date: Tue, 19 Oct 2021 21:58:53 +0200 Subject: [PATCH 54/54] Add changelog --- docs/release.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index 0810946ee6..e9c592a860 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -12,6 +12,9 @@ Enhancements * array indexing with [] (getitem and setitem) now supports fancy indexing. By :user:`Juan Nunez-Iglesias `; :issue:`725`. +* write_empty_chunks=False deletes chunks consisting of only fill_value. + By :user:`Davis Bennett `; :issue:`738`. + .. _release_2.10.2: 2.10.2