From 7eed366397a4c8bf98f8bb4adac34fa022ac4532 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Fri, 30 Nov 2018 12:18:30 -0500
Subject: [PATCH 01/14] Bump Numcodecs requirement to 0.6.1

---
 requirements_dev.txt | 2 +-
 setup.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index 2ad18f372c..d39ba9e9b8 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,3 +1,3 @@
 asciitree==0.3.3
 fasteners==0.14.1
-numcodecs==0.5.5
+numcodecs==0.6.1
diff --git a/setup.py b/setup.py
index a5e8334e43..903af3bc04 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@
         'asciitree',
         'numpy>=1.7',
         'fasteners',
-        'numcodecs>=0.5.3',
+        'numcodecs>=0.6.1',
     ],
     package_dir={'': '.'},
     packages=['zarr', 'zarr.tests'],

From 2552f620191cafa72429566f4a8ce4f49b4db4d3 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Fri, 30 Nov 2018 13:06:29 -0500
Subject: [PATCH 02/14] Assert MsgPack round-trips bytes objects correctly

Previously MsgPack was turning bytes objects to unicode objects when
round-tripping them. However this has been fixed in the latest version of
Numcodecs. So correct this test now that MsgPack is working correctly.
---
 zarr/tests/test_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index 11891f8fe9..544ec95c41 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -982,7 +982,7 @@ def test_object_arrays(self):
         z[0] = 'foo'
         assert z[0] == 'foo'
         z[1] = b'bar'
-        assert z[1] == 'bar'  # msgpack gets this wrong
+        assert z[1] == b'bar'
         z[2] = 1
         assert z[2] == 1
         z[3] = [2, 4, 6, 'baz']

From aee5aceced5e5a3f2698f2363540f064c200f4a9 Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Sat, 1 Dec 2018 14:09:40 +0000
Subject: [PATCH 03/14] properly guard against removal of object codec

---
 zarr/core.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index b4da45cd99..bcae03cb9f 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
 
+from numcodecs.compat import ensure_contiguous_ndarray
 from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
@@ -1743,18 +1744,25 @@ def _decode_chunk(self, cdata):
             for f in self._filters[::-1]:
                 chunk = f.decode(chunk)
 
-        # view as correct dtype
+        # view as numpy array with correct dtype
         if self._dtype == object:
-            if isinstance(chunk, np.ndarray):
-                chunk = chunk.astype(self._dtype)
+            # special case object dtype, because incorrect handling can lead to
+            # segfaults and other bad things happening
+            if isinstance(chunk, np.ndarray) and chunk.dtype == object:
+                # chunk is already of correct dtype, good to carry on
+                # flatten just to be sure we can reshape later
+                chunk = chunk.reshape(-1, order='A')
             else:
+                # If we end up here, someone must have hacked around with the filters.
+                # We cannot deal with object arrays unless there is an object
+                # codec in the filter chain, i.e., a filter that converts from object
+                # array to something else during encoding, and converts back to object
+                # array during decoding.
                 raise RuntimeError('cannot read object array without object codec')
-        elif isinstance(chunk, np.ndarray):
-            chunk = chunk.view(self._dtype)
         else:
-            chunk = np.frombuffer(chunk, dtype=self._dtype)
+            chunk = ensure_contiguous_ndarray(chunk).view(self._dtype)
 
-        # reshape
+        # ensure correct chunk shape
         chunk = chunk.reshape(self._chunks, order=self._order)
 
         return chunk

From bf4eee8cc763b1917e299fcfde04a5e5d9a0938b Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Sat, 1 Dec 2018 13:21:24 -0500
Subject: [PATCH 04/14] Ensure `chunk` in `_decode_chunk` is an `ndarray`

---
 zarr/core.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index bcae03cb9f..94bd94edde 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
 
-from numcodecs.compat import ensure_contiguous_ndarray
+from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray
 from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
@@ -1745,10 +1745,11 @@ def _decode_chunk(self, cdata):
                 chunk = f.decode(chunk)
 
         # view as numpy array with correct dtype
+        chunk = ensure_ndarray(chunk)
         if self._dtype == object:
             # special case object dtype, because incorrect handling can lead to
             # segfaults and other bad things happening
-            if isinstance(chunk, np.ndarray) and chunk.dtype == object:
+            if chunk.dtype == object:
                 # chunk is already of correct dtype, good to carry on
                 # flatten just to be sure we can reshape later
                 chunk = chunk.reshape(-1, order='A')

From b741fe12a0099cdcc0697a80b3ace31c82738cce Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Sat, 1 Dec 2018 15:47:44 -0500
Subject: [PATCH 05/14] Reshape `chunk` ourselves since it is an `ndarray`

As we already ensured the `chunk` is an `ndarray` viewing the original data,
there is no need for us to do that here as well. Plus the checks performed by
`ensure_contiguous_ndarray` are not needed for our use case here. Particularly
as we have already handled the unusual type cases above. We also don't need to
constrain the buffer size. As such the only thing we really need is to flatten
the array and make it contiguous, which is what we handle here directly.
---
 zarr/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index 94bd94edde..b5d0185faf 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
 
-from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray
+from numcodecs.compat import ensure_ndarray
 from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args,
@@ -1761,7 +1761,7 @@ def _decode_chunk(self, cdata):
                 # array during decoding.
                 raise RuntimeError('cannot read object array without object codec')
         else:
-            chunk = ensure_contiguous_ndarray(chunk).view(self._dtype)
+            chunk = chunk.reshape(-1, order='A').view(self._dtype)
 
         # ensure correct chunk shape
         chunk = chunk.reshape(self._chunks, order=self._order)

From f3144ae6b4fdc929eb1390b1ed87ee5a35e6862f Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Sat, 1 Dec 2018 15:47:50 -0500
Subject: [PATCH 06/14] Refactor `reshape` from `_decode_chunk`

As both the expected `object` case and the non-`object` case perform a
`reshape` to flatten the data, go ahead and refactor that out of both cases
and handle it generally. Simplifies the code a bit.
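To illustrate where patches 4-6 leave the decode path, here is a minimal
sketch (the dtype, shape and values below are invented for the example; it
assumes `numcodecs.compat.ensure_ndarray` from Numcodecs >= 0.6, as used
above):

    import numpy as np
    from numcodecs.compat import ensure_ndarray

    # pretend this is what the last filter/compressor handed back: the raw
    # bytes of four little-endian int32 values
    decoded = np.arange(4, dtype='<i4').tobytes()

    chunk = ensure_ndarray(decoded)           # zero-copy uint8 view of the buffer
    chunk = chunk.view('<i4')                 # reinterpret as the target dtype
    chunk = chunk.reshape(-1, order='A')      # flatten without copying
    chunk = chunk.reshape((2, 2), order='C')  # final chunk shape

    assert chunk.tolist() == [[0, 1], [2, 3]]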
---
 zarr/core.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index b5d0185faf..ab5de14512 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -1749,11 +1749,7 @@ def _decode_chunk(self, cdata):
         if self._dtype == object:
             # special case object dtype, because incorrect handling can lead to
             # segfaults and other bad things happening
-            if chunk.dtype == object:
-                # chunk is already of correct dtype, good to carry on
-                # flatten just to be sure we can reshape later
-                chunk = chunk.reshape(-1, order='A')
-            else:
+            if chunk.dtype != object:
                 # If we end up here, someone must have hacked around with the filters.
                 # We cannot deal with object arrays unless there is an object
                 # codec in the filter chain, i.e., a filter that converts from object
@@ -1761,9 +1757,10 @@ def _decode_chunk(self, cdata):
                 # array during decoding.
                 raise RuntimeError('cannot read object array without object codec')
         else:
-            chunk = chunk.reshape(-1, order='A').view(self._dtype)
+            chunk = chunk.view(self._dtype)
 
         # ensure correct chunk shape
+        chunk = chunk.reshape(-1, order='A')
         chunk = chunk.reshape(self._chunks, order=self._order)
 
         return chunk

From 3e3920af230e059e84f70563c4f215d60f845aed Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Sat, 1 Dec 2018 15:47:53 -0500
Subject: [PATCH 07/14] Consolidate type checks in `_decode_chunk`

As refactoring of the `reshape` step has effectively dropped the expected
`object` type case, the checks for different types are a little more
complicated than needed. To fix this, basically invert and swap the case
ordering. This way we can handle all generally expected types first and simply
cast them. Then we can raise if an `object` type shows up and is unexpected.
---
 zarr/core.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index ab5de14512..a2a07a29ba 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -1746,18 +1746,17 @@ def _decode_chunk(self, cdata):
 
         # view as numpy array with correct dtype
         chunk = ensure_ndarray(chunk)
-        if self._dtype == object:
-            # special case object dtype, because incorrect handling can lead to
-            # segfaults and other bad things happening
-            if chunk.dtype != object:
-                # If we end up here, someone must have hacked around with the filters.
-                # We cannot deal with object arrays unless there is an object
-                # codec in the filter chain, i.e., a filter that converts from object
-                # array to something else during encoding, and converts back to object
-                # array during decoding.
-                raise RuntimeError('cannot read object array without object codec')
-        else:
+        # special case object dtype, because incorrect handling can lead to
+        # segfaults and other bad things happening
+        if self._dtype != object:
             chunk = chunk.view(self._dtype)
+        elif chunk.dtype != object:
+            # If we end up here, someone must have hacked around with the filters.
+            # We cannot deal with object arrays unless there is an object
+            # codec in the filter chain, i.e., a filter that converts from object
+            # array to something else during encoding, and converts back to object
+            # array during decoding.
+            raise RuntimeError('cannot read object array without object codec')
 
         # ensure correct chunk shape
         chunk = chunk.reshape(-1, order='A')
         chunk = chunk.reshape(self._chunks, order=self._order)

From 9badf39ff815438244f131197457f815a51d56a6 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 18:56:44 -0500
Subject: [PATCH 08/14] Drop `ensure_bytes` definition from `zarr.storage`

As Numcodecs now includes a very versatile and effective `ensure_bytes`
function, there is no need to define our own in `zarr.storage` as well. So go
ahead and drop it.
---
 zarr/storage.py | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/zarr/storage.py b/zarr/storage.py
index 6720b42d12..a490f5e8fe 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -38,8 +38,9 @@
                        normalize_storage_path, buffer_size,
                        normalize_fill_value, nolock, normalize_dtype)
 from zarr.meta import encode_array_metadata, encode_group_metadata
-from zarr.compat import PY2, binary_type, OrderedDict_move_to_end
+from zarr.compat import PY2, OrderedDict_move_to_end
 from numcodecs.registry import codec_registry
+from numcodecs.compat import ensure_bytes
 from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
                          err_fspath_exists_notdir, err_read_only, MetadataError)
 
@@ -444,23 +445,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None):
     store[key] = encode_group_metadata(meta)
 
 
-def ensure_bytes(s):
-    if isinstance(s, binary_type):
-        return s
-    if isinstance(s, np.ndarray):
-        if PY2:  # pragma: py3 no cover
-            # noinspection PyArgumentList
-            return s.tostring(order='A')
-        else:  # pragma: py2 no cover
-            # noinspection PyArgumentList
-            return s.tobytes(order='A')
-    if hasattr(s, 'tobytes'):
-        return s.tobytes()
-    if PY2 and hasattr(s, 'tostring'):  # pragma: py3 no cover
-        return s.tostring()
-    return memoryview(s).tobytes()
-
-
 def _dict_store_keys(d, prefix='', cls=dict):
     for k in d.keys():
         v = d[k]

From 7bd8a2a1a3608def7335f1ea1a1115da5b1b1cec Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 18:56:45 -0500
Subject: [PATCH 09/14] Take flattened array views to avoid some copies

Make use of Numcodecs' `ensure_contiguous_ndarray` to take `ndarray` views
onto buffers to be stored in a few cases, so that they can be reshaped without
a copy (thanks to the buffer protocol). This ensures that datetime/timedelta
values are handled by default, catches things like object arrays, and handles
flattening the array if needed. All in all, this gets as close to a `bytes`
object as possible without copying, doing its best to preserve type
information while constructing something that fits the buffer protocol.
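As a minimal sketch of the intent (the array below is invented for
illustration; it assumes `ensure_contiguous_ndarray` from Numcodecs >= 0.6,
the same helper used in the diff below):

    import numpy as np
    from numcodecs.compat import ensure_contiguous_ndarray

    # an F-contiguous chunk that previously forced a bytes copy before storing
    a = np.asfortranarray(np.arange(6, dtype='<i8').reshape(2, 3))

    v = ensure_contiguous_ndarray(a)  # flat view onto the same memory
    assert v.ndim == 1 and np.shares_memory(a, v)

    # anything that accepts the buffer protocol can now consume it copy-free
    buf = memoryview(v)
    assert buf.nbytes == a.nbytes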
---
 zarr/storage.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/zarr/storage.py b/zarr/storage.py
index a490f5e8fe..24bd4506d0 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -31,16 +31,13 @@
 import warnings
 
 
-import numpy as np
-
-
 from zarr.util import (normalize_shape, normalize_chunks, normalize_order,
                        normalize_storage_path, buffer_size,
                        normalize_fill_value, nolock, normalize_dtype)
 from zarr.meta import encode_array_metadata, encode_group_metadata
 from zarr.compat import PY2, OrderedDict_move_to_end
 from numcodecs.registry import codec_registry
-from numcodecs.compat import ensure_bytes
+from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
 from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
                          err_fspath_exists_notdir, err_read_only, MetadataError)
 
@@ -725,9 +722,8 @@ def __getitem__(self, key):
 
     def __setitem__(self, key, value):
 
-        # handle F-contiguous numpy arrays
-        if isinstance(value, np.ndarray) and value.flags.f_contiguous:
-            value = ensure_bytes(value)
+        # coerce to flat, contiguous array (ideally without copying)
+        value = ensure_contiguous_ndarray(value)
 
         # destination path for key
         file_path = os.path.join(self.path, key)
@@ -1176,7 +1172,7 @@ def __getitem__(self, key):
 
     def __setitem__(self, key, value):
         if self.mode == 'r':
             err_read_only()
-        value = ensure_bytes(value)
+        value = ensure_contiguous_ndarray(value)
         with self.mutex:
             self.zf.writestr(key, value)

From 2c6ac778d6e07364e49c712f967cd2ab325011a5 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 18:56:45 -0500
Subject: [PATCH 10/14] Simplify `buffer_size` by using `ensure_ndarray`

Rewrite `buffer_size` to just use Numcodecs' `ensure_ndarray` to get an
`ndarray` that views the data. Once the `ndarray` is obtained, all that is
needed is to access its `nbytes` attribute, which gives the number of bytes it
takes up.
---
 zarr/util.py | 16 +++------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/zarr/util.py b/zarr/util.py
index b79865bfe8..ad882c41d5 100644
--- a/zarr/util.py
+++ b/zarr/util.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, print_function, division
-import operator
 from textwrap import TextWrapper, dedent
 import numbers
 import uuid
@@ -10,10 +9,11 @@
 from asciitree import BoxStyle, LeftAligned
 from asciitree.traversal import Traversal
 import numpy as np
+from numcodecs.compat import ensure_ndarray
 from numcodecs.registry import codec_registry
 
 
-from zarr.compat import PY2, reduce, text_type, binary_type
+from zarr.compat import PY2, text_type, binary_type
 
 
 # codecs to use for object dtype convenience API
@@ -314,17 +314,7 @@ def normalize_storage_path(path):
 
 
 def buffer_size(v):
-    from array import array as _stdlib_array
-    if PY2 and isinstance(v, _stdlib_array):  # pragma: py3 no cover
-        # special case array.array because does not support buffer
-        # interface in PY2
-        return v.buffer_info()[1] * v.itemsize
-    else:  # pragma: py2 no cover
-        v = memoryview(v)
-        if v.shape:
-            return reduce(operator.mul, v.shape) * v.itemsize
-        else:
-            return v.itemsize
+    return ensure_ndarray(v).nbytes
 
 
 def info_text_report(items):

From 398820f9137c9b2e4690f8cf7653436186af6fa9 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 18:56:46 -0500
Subject: [PATCH 11/14] Simplify `ensure_str` in `zarr.meta`

If the data is already a `str` instance, turn `ensure_str` into a no-op.
For all other cases, make use of Numcodecs' `ensure_bytes` to aid `ensure_str`
in coercing data through the buffer protocol. If we are on Python 3, then
decode the `bytes` object to a `str`.
---
 zarr/meta.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/zarr/meta.py b/zarr/meta.py
index 9ce580eff2..7984efb701 100644
--- a/zarr/meta.py
+++ b/zarr/meta.py
@@ -5,9 +5,10 @@
 
 
 import numpy as np
+from numcodecs.compat import ensure_bytes
 
 
-from zarr.compat import PY2, binary_type, Mapping
+from zarr.compat import PY2, Mapping
 from zarr.errors import MetadataError
 
@@ -15,14 +16,9 @@
 
 
 def ensure_str(s):
-    if PY2:  # pragma: py3 no cover
-        # noinspection PyUnresolvedReferences
-        if isinstance(s, buffer):  # noqa
-            s = str(s)
-    else:  # pragma: py2 no cover
-        if isinstance(s, memoryview):
-            s = s.tobytes()
-        if isinstance(s, binary_type):
+    if not isinstance(s, str):
+        s = ensure_bytes(s)
+        if not PY2:  # pragma: py2 no cover
             s = s.decode('ascii')
     return s

From bc4d57907a0406cea92f3fa18c4c0daad54e44fc Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 18:59:20 -0500
Subject: [PATCH 12/14] Bump to Numcodecs 0.6.2

---
 requirements_dev.txt | 2 +-
 setup.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements_dev.txt b/requirements_dev.txt
index d39ba9e9b8..03eaa8e871 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,3 +1,3 @@
 asciitree==0.3.3
 fasteners==0.14.1
-numcodecs==0.6.1
+numcodecs==0.6.2
diff --git a/setup.py b/setup.py
index 903af3bc04..b6d237fe0a 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@
         'asciitree',
         'numpy>=1.7',
         'fasteners',
-        'numcodecs>=0.6.1',
+        'numcodecs>=0.6.2',
     ],
     package_dir={'': '.'},
     packages=['zarr', 'zarr.tests'],

From efacb5234573fae72661d50e15180db414fe6ff8 Mon Sep 17 00:00:00 2001
From: John Kirkham
Date: Mon, 3 Dec 2018 19:11:06 -0500
Subject: [PATCH 13/14] Update tutorial's info content

As Blosc got upgraded and brought an upgraded Zstd along with it, the results
for this example changed slightly. So update them accordingly. Should fix the
doctest failure.
---
 docs/tutorial.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 606b5acef5..29ce8b0935 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -178,8 +178,8 @@ print some diagnostics, e.g.::
                        : blocksize=0)
     Store type         : builtins.dict
     No. bytes          : 400000000 (381.5M)
-    No. bytes stored   : 3242241 (3.1M)
-    Storage ratio      : 123.4
+    No. bytes stored   : 3379344 (3.2M)
+    Storage ratio      : 118.4
     Chunks initialized : 100/100
 
 If you don't specify a compressor, by default Zarr uses the Blosc

From cc1d7762e64a78cd0e8941430479de4a18d70382 Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Tue, 4 Dec 2018 22:24:43 +0000
Subject: [PATCH 14/14] release notes [ci skip]

---
 docs/release.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/release.rst b/docs/release.rst
index 96ac7c8f2f..335ec58fb3 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -22,6 +22,11 @@ Enhancements
 Maintenance
 ~~~~~~~~~~~
 
+* The required version of the `numcodecs `_ package has been upgraded to
+  0.6.2, which has enabled some code simplification and fixes a failing test
+  involving msgpack encoding. By :user:`John Kirkham `, :issue:`352`,
+  :issue:`355`, :issue:`324`.
+
 * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4,
   and upgrade all pinned package requirements. :issue:`308`.