From 46a00303a581b24dc84e3de46534ba69716dfa57 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:31:09 -0400 Subject: [PATCH 01/11] Rewrite `ensure_str` to be `ensure_text_type` To simplify the branching required for Python 2/3 compatibility. Rename `ensure_str` to `ensure_text_type` and rework the code to coerce data that is `bytes` or `bytes`-like to `bytes` and then to text data. It appears JSON on Python 2 or Python 3 handles this just fine. So should make handling these two cases a bit more straightforward. --- zarr/meta.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index ad37e45133..3bb5452116 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -2,24 +2,24 @@ from __future__ import absolute_import, print_function, division import json import base64 +import codecs import numpy as np -from numcodecs.compat import ensure_bytes +from numcodecs.compat import ensure_contiguous_ndarray -from zarr.compat import PY2, Mapping +from zarr.compat import PY2, Mapping, text_type from zarr.errors import MetadataError ZARR_FORMAT = 2 -def ensure_str(s): - if not isinstance(s, str): - s = ensure_bytes(s) - if not PY2: # pragma: py2 no cover - s = s.decode('ascii') +def ensure_text_type(s): + if not isinstance(s, text_type): + s = ensure_contiguous_ndarray(s) + s = codecs.decode(s, 'ascii') return s From ca343046eb6025c94c385b99127a3c5fffc5af6a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:36:07 -0400 Subject: [PATCH 02/11] Use `ensure_text_type` elsewhere in `meta` --- zarr/meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/meta.py b/zarr/meta.py index 3bb5452116..18d65ea8dd 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -42,7 +42,7 @@ def parse_metadata(s): else: # assume metadata needs to be parsed as JSON - s = ensure_str(s) + s = ensure_text_type(s) meta = json.loads(s) return meta From 987f287e1a43256617e1a91896bbefe503ef9176 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:37:01 -0400 Subject: [PATCH 03/11] Rework `convenience` to use `ensure_text_type` --- zarr/convenience.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index c6e5432664..dedf67a107 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -15,7 +15,7 @@ from zarr.errors import err_path_not_found, CopyError from zarr.util import normalize_storage_path, TreeViewer, buffer_size from zarr.compat import PY2, text_type -from zarr.meta import ensure_str, json_dumps +from zarr.meta import ensure_text_type, json_dumps # noinspection PyShadowingBuiltins @@ -1123,7 +1123,7 @@ def is_zarr_key(key): out = { 'zarr_consolidated_format': 1, 'metadata': { - key: json.loads(ensure_str(store[key])) + key: json.loads(ensure_text_type(store[key])) for key in store if is_zarr_key(key) } } From 5b6794c41843595b036f8f1e1e5f143fb1091ed5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:37:39 -0400 Subject: [PATCH 04/11] Use `ensure_text_type` in `n5` --- zarr/n5.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/zarr/n5.py b/zarr/n5.py index 8139464dc6..d0f18dc44e 100644 --- a/zarr/n5.py +++ b/zarr/n5.py @@ -2,7 +2,7 @@ """This module contains a storage class and codec to support the N5 format. """ from __future__ import absolute_import, division -from .meta import ZARR_FORMAT, ensure_str, json_dumps +from .meta import ZARR_FORMAT, ensure_text_type, json_dumps from .storage import ( NestedDirectoryStore, group_meta_key as zarr_group_meta_key, @@ -103,7 +103,7 @@ def __setitem__(self, key, value): key = key.replace(zarr_group_meta_key, n5_attrs_key) - value = ensure_str(value) + value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) n5_attrs.update(**group_metadata_to_n5(json.loads(value))) @@ -113,7 +113,7 @@ def __setitem__(self, key, value): key = key.replace(zarr_array_meta_key, n5_attrs_key) - value = ensure_str(value) + value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) n5_attrs.update(**array_metadata_to_n5(json.loads(value))) @@ -123,7 +123,7 @@ def __setitem__(self, key, value): key = key.replace(zarr_attrs_key, n5_attrs_key) - value = ensure_str(value) + value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) zarr_attrs = json.loads(value) @@ -246,7 +246,7 @@ def listdir(self, path=None): def _load_n5_attrs(self, path): try: s = super(N5Store, self).__getitem__(path) - s = ensure_str(s) + s = ensure_text_type(s) return json.loads(s) except KeyError: return {} From cbd8d31458907a4ee4050e5fc91803e2aa38176f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:38:26 -0400 Subject: [PATCH 05/11] Use `ensure_text_type` in attribute tests --- zarr/tests/test_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 45f863472e..6c8485dd9e 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -26,7 +26,7 @@ from zarr.core import Array from zarr.errors import PermissionError from zarr.compat import PY2, text_type, binary_type, zip_longest -from zarr.meta import ensure_str +from zarr.meta import ensure_text_type from zarr.util import buffer_size from zarr.n5 import n5_keywords, N5Store from numcodecs import (Delta, FixedScaleOffset, LZ4, GZip, Zlib, Blosc, BZ2, MsgPack, Pickle, @@ -1273,10 +1273,10 @@ def test_endian(self): def test_attributes(self): a = self.create_array(shape=10, chunks=10, dtype='i8') a.attrs['foo'] = 'bar' - attrs = json.loads(ensure_str(a.store[a.attrs.key])) + attrs = json.loads(ensure_text_type(a.store[a.attrs.key])) assert 'foo' in attrs and attrs['foo'] == 'bar' a.attrs['bar'] = 'foo' - attrs = json.loads(ensure_str(a.store[a.attrs.key])) + attrs = json.loads(ensure_text_type(a.store[a.attrs.key])) assert 'foo' in attrs and attrs['foo'] == 'bar' assert 'bar' in attrs and attrs['bar'] == 'foo' From 4e0e0d690fe8b0b699feb3edb4f4623dece32caf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 14 Apr 2019 23:45:56 -0400 Subject: [PATCH 06/11] Drop Python 2 workaround for `bson.Binary` `MongoDBStore` inherited the behavior on `pymongo` with respect to returning `bson.Binary` for blob values on Python 2. As this caused some issues on Python 2 when parsing JSON content (as the parser was unable) to work with objects that were not `bytes` type (i.e. `bson.Binary`), a workaround was needed to coerce `bson.Binary` to `bytes` on Python 2. It's worth noting that this workaround is not needed for loading binary data from chunks as we use the buffer protocol there. As we have now fixed our handling of JSON data to coerce data to text on Python 2/3 and leverage the buffer protocol in the effort, we no longer need this workaround in `MongoDBStore`. Hence we go ahead and drop it. --- zarr/storage.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index c6ed891abf..befcae826d 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -2297,13 +2297,6 @@ def __getitem__(self, key): raise KeyError(key) else: value = doc[self._value] - - # Coerce `bson.Binary` to `bytes` type on Python 2. - # PyMongo handles this conversion for us on Python 3. - # ref: http://api.mongodb.com/python/current/python3.html#id3 - if PY2: # pragma: py3 no cover - value = binary_type(value) - return value def __setitem__(self, key, value): From 526789521d20365e76ef49bc8001dfa9593eb0fd Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 15 Apr 2019 00:25:42 -0400 Subject: [PATCH 07/11] Simplify `MongoDBStore`'s `__getitem__`'s `return` --- zarr/storage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index befcae826d..0d959b8899 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -2296,8 +2296,7 @@ def __getitem__(self, key): if doc is None: raise KeyError(key) else: - value = doc[self._value] - return value + return doc[self._value] def __setitem__(self, key, value): value = ensure_bytes(value) From d4734a8448f16b5cda37ddec7e524550ecfb8bb4 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 15 Apr 2019 01:01:16 -0400 Subject: [PATCH 08/11] Drop unused import of `binary_type` in `storage` --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 0d959b8899..4539d8d009 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -38,7 +38,7 @@ normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata -from zarr.compat import PY2, OrderedDict_move_to_end, binary_type +from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, From 075577368c8c06f1c878dc3ba9abcd65fddf877e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 16 Apr 2019 19:35:14 -0400 Subject: [PATCH 09/11] Add a helper function for loading JSON Much as we have a helper function for writing JSON, this adds a helper function for loading JSON. Mainly it ensure data is coerced to text before handing it off to the JSON parser. Should simplify code that is loading JSON. --- zarr/meta.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zarr/meta.py b/zarr/meta.py index 18d65ea8dd..2bae7b49a9 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -29,6 +29,11 @@ def json_dumps(o): separators=(',', ': ')) +def json_loads(s): + """Read JSON in a consistent way.""" + return json.loads(ensure_text_type(s)) + + def parse_metadata(s): # Here we allow that a store may return an already-parsed metadata object, From 55294561dc4f7a2fa3e1663bbbd28034cbaa29e3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 16 Apr 2019 19:43:31 -0400 Subject: [PATCH 10/11] Rewrite code to use `json_loads` directly Changes other library code to use `json_loads` for handling text encoding and JSON parsing. Should simplify things a bit and avoid having some errors sneak in. --- zarr/convenience.py | 6 ++---- zarr/meta.py | 3 +-- zarr/n5.py | 15 +++++---------- zarr/tests/test_core.py | 7 +++---- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index dedf67a107..efa0a99c41 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -15,7 +15,7 @@ from zarr.errors import err_path_not_found, CopyError from zarr.util import normalize_storage_path, TreeViewer, buffer_size from zarr.compat import PY2, text_type -from zarr.meta import ensure_text_type, json_dumps +from zarr.meta import json_dumps, json_loads # noinspection PyShadowingBuiltins @@ -1112,8 +1112,6 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): open_consolidated """ - import json - store = normalize_store_arg(store) def is_zarr_key(key): @@ -1123,7 +1121,7 @@ def is_zarr_key(key): out = { 'zarr_consolidated_format': 1, 'metadata': { - key: json.loads(ensure_text_type(store[key])) + key: json_loads(store[key]) for key in store if is_zarr_key(key) } } diff --git a/zarr/meta.py b/zarr/meta.py index 2bae7b49a9..29eb987584 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -47,8 +47,7 @@ def parse_metadata(s): else: # assume metadata needs to be parsed as JSON - s = ensure_text_type(s) - meta = json.loads(s) + meta = json_loads(s) return meta diff --git a/zarr/n5.py b/zarr/n5.py index d0f18dc44e..48dfe0a5e5 100644 --- a/zarr/n5.py +++ b/zarr/n5.py @@ -2,7 +2,7 @@ """This module contains a storage class and codec to support the N5 format. """ from __future__ import absolute_import, division -from .meta import ZARR_FORMAT, ensure_text_type, json_dumps +from .meta import ZARR_FORMAT, json_dumps, json_loads from .storage import ( NestedDirectoryStore, group_meta_key as zarr_group_meta_key, @@ -12,7 +12,6 @@ from numcodecs.abc import Codec from numcodecs.compat import ndarray_copy from numcodecs.registry import register_codec, get_codec -import json import numpy as np import struct import sys @@ -103,9 +102,8 @@ def __setitem__(self, key, value): key = key.replace(zarr_group_meta_key, n5_attrs_key) - value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) - n5_attrs.update(**group_metadata_to_n5(json.loads(value))) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) value = json_dumps(n5_attrs).encode('ascii') @@ -113,9 +111,8 @@ def __setitem__(self, key, value): key = key.replace(zarr_array_meta_key, n5_attrs_key) - value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) - n5_attrs.update(**array_metadata_to_n5(json.loads(value))) + n5_attrs.update(**array_metadata_to_n5(json_loads(value))) value = json_dumps(n5_attrs).encode('ascii') @@ -123,9 +120,8 @@ def __setitem__(self, key, value): key = key.replace(zarr_attrs_key, n5_attrs_key) - value = ensure_text_type(value) n5_attrs = self._load_n5_attrs(key) - zarr_attrs = json.loads(value) + zarr_attrs = json_loads(value) for k in n5_keywords: if k in zarr_attrs.keys(): @@ -246,8 +242,7 @@ def listdir(self, path=None): def _load_n5_attrs(self, path): try: s = super(N5Store, self).__getitem__(path) - s = ensure_text_type(s) - return json.loads(s) + return json_loads(s) except KeyError: return {} diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 6c8485dd9e..ee4dda90c1 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3,7 +3,6 @@ import unittest from tempfile import mkdtemp, mktemp import atexit -import json import shutil import pickle import os @@ -26,7 +25,7 @@ from zarr.core import Array from zarr.errors import PermissionError from zarr.compat import PY2, text_type, binary_type, zip_longest -from zarr.meta import ensure_text_type +from zarr.meta import json_loads from zarr.util import buffer_size from zarr.n5 import n5_keywords, N5Store from numcodecs import (Delta, FixedScaleOffset, LZ4, GZip, Zlib, Blosc, BZ2, MsgPack, Pickle, @@ -1273,10 +1272,10 @@ def test_endian(self): def test_attributes(self): a = self.create_array(shape=10, chunks=10, dtype='i8') a.attrs['foo'] = 'bar' - attrs = json.loads(ensure_text_type(a.store[a.attrs.key])) + attrs = json_loads(a.store[a.attrs.key]) assert 'foo' in attrs and attrs['foo'] == 'bar' a.attrs['bar'] = 'foo' - attrs = json.loads(ensure_text_type(a.store[a.attrs.key])) + attrs = json_loads(a.store[a.attrs.key]) assert 'foo' in attrs and attrs['foo'] == 'bar' assert 'bar' in attrs and attrs['bar'] == 'foo' From bba219a90e390e51154376e36c645753e66c4b4a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 16 Apr 2019 20:17:46 -0400 Subject: [PATCH 11/11] Note JSON changes in release notes --- docs/release.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index efe66707c1..76911bff48 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,18 @@ Release notes ============= +.. _release_2.3.2: + +2.3.2 +----- + +Bug fixes +~~~~~~~~~ + +* Coerce data to text for JSON parsing. + By :user:`John Kirkham `; :issue:`429` + + .. _release_2.3.1: 2.3.1