From 8301fa621c1739cc1afd821b2899b27464bf2197 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 26 Jun 2018 16:42:56 -0400 Subject: [PATCH 001/168] POC of making a single file out of zarr dot files --- zarr/convenience.py | 21 +++++++++++++++++++++ zarr/tests/test_convenience.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 19de7b2826..2c06dac732 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1069,3 +1069,24 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) return n_copied, n_skipped, n_bytes_copied + + +def consolidate_metadata(mapping, out_key='.zmetadata'): + """ + Read all the metadata in the files within the given dataset and join + + Parameters + ---------- + mapping : MutableMapping instance + Containing metadata and data keys of a zarr dataset + out_key : str + Key to place the consolidated data into + """ + import json + + def is_zarr_key(key): + return (key.endswith('.zarray') or key.endswith('.zgroup') or + key.endswith('.zattrs')) + + out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} + mapping[out_key] = json.dumps(out) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index c77006c4f6..cae105c23e 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -12,8 +12,8 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy -from zarr.storage import atexit_rmtree +from zarr.convenience import open, save, save_group, load, copy_store, copy, consolidate_metadata +from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group from zarr.errors import CopyError @@ -91,6 +91,34 @@ def test_lazy_loader(): assert_array_equal(bar, loader['bar']) +def test_consolidate_metadata(): + import json + store = DictStore() + z = group(store) + z.create_group('g1') + g2 = z.create_group('g2') + g2.attrs['hello'] = 'world' + arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') + arr.attrs['data'] = 1 + arr[:] = 1.0 + consolidate_metadata(store) + assert '.zmetadata' in store + for key in ['.zgroup', + 'g1/.zgroup', + 'g2/.zgroup', + 'g2/.zattrs', + 'g2/arr/.zarray', + 'g2/arr/.zattrs']: + del store[key] + meta = json.loads(store['.zmetadata']) + meta = {k: v.encode() for k, v in meta.items()} + z2 = group(meta, chunk_store=store) + assert list(z2) == ['g1', 'g2'] + assert z2.g2.attrs['hello'] == 'world' + assert z2.g2.arr.attrs['data'] == 1 + assert (z2.g2.arr[:] == 1.0).all() + + class TestCopyStore(unittest.TestCase): def setUp(self): From be6d70650d86b250674d926693e64dd6792b16a6 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 2 Jul 2018 14:46:58 -0400 Subject: [PATCH 002/168] (WIP) include simple code that would load metadata Again, this is for example only, not intended final structure --- zarr/convenience.py | 2 +- zarr/hierarchy.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 2c06dac732..7aac2e385c 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1089,4 +1089,4 @@ def is_zarr_key(key): key.endswith('.zattrs')) out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} - mapping[out_key] = json.dumps(out) + mapping[out_key] = json.dumps(out).encode() diff --git a/zarr/hierarchy.py 
b/zarr/hierarchy.py index e9565caa13..9e401eed69 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -92,6 +92,14 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): + try: + import json + metadata = json.loads(store['.zmetadata']) + meta_store = {k: v.encode() for k, v in metadata.items()} + chunk_store, store = store, meta_store + except (KeyError, ValueError, json.JSONDecodeError): + pass + self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) From f1128ff92fd780f6a5d2fcc1351b8f6ae794c609 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:26:14 -0400 Subject: [PATCH 003/168] Implement ConsolidatedMetadataStore --- zarr/convenience.py | 17 ++++++++- zarr/hierarchy.py | 9 ----- zarr/storage.py | 64 ++++++++++++++++++++++++++++++++++ zarr/tests/test_convenience.py | 11 +++--- 4 files changed, 86 insertions(+), 15 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 7aac2e385c..db3fa2f85a 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1073,7 +1073,16 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, def consolidate_metadata(mapping, out_key='.zmetadata'): """ - Read all the metadata in the files within the given dataset and join + Store all the metadata in the files within the given dataset in one key + + This produces a single file in the backend store, containing all the + metadata read from all the zarr-related keys that can be found. This + should be used in conjunction with ``storage.ConsolidatedMetadataStore`` + to reduce the number of operations on the backend store at read time. + + Note, however, that if the dataset is changed after this consolidation, + then the metadata read by ``storage.ConsolidatedMetadataStore`` would + be out of sync with reality unless this function is called again. Parameters ---------- @@ -1081,8 +1090,13 @@ def consolidate_metadata(mapping, out_key='.zmetadata'): Containing metadata and data keys of a zarr dataset out_key : str Key to place the consolidated data into + + Returns + ------- + ConsolidatedMetadataStore instance, based on the same base store. 
""" import json + from .storage import ConsolidatedMetadataStore def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or @@ -1090,3 +1104,4 @@ def is_zarr_key(key): out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} mapping[out_key] = json.dumps(out).encode() + return ConsolidatedMetadataStore(mapping, out_key) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 9e401eed69..f20b899b2b 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -91,15 +91,6 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): - - try: - import json - metadata = json.loads(store['.zmetadata']) - meta_store = {k: v.encode() for k, v in metadata.items()} - chunk_store, store = store, meta_store - except (KeyError, ValueError, json.JSONDecodeError): - pass - self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) diff --git a/zarr/storage.py b/zarr/storage.py index 39a497d08b..a8ed34773f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -24,6 +24,7 @@ import atexit import re import sys +import json import multiprocessing from threading import Lock, RLock import glob @@ -1883,3 +1884,66 @@ def __delitem__(self, key): with self._mutex: self._invalidate_keys() self._invalidate_value(key) + + +class ConsolidatedMetadataStore(MutableMapping): + """A layer over other storage, with the metadata within a single key + + The purpose of this class, is to be able to get all of the metadata for + a given dataset in a single read operation from the underlying storage. + See ``convenience.consolidate_metadata()`` for how to create this single + metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the dataset metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + ``conslidate_metadata`` should be called again and the class re-invoked. + The use case is for write once, read many times. + + """ + def __init__(self, store, metadata_key='.zmetadata'): + """ + + Parameters + ---------- + store: MutableMapping + Containing the zarr dataset + metadata_key: str + The target in the store where all of the metadata are stores. We + assume JSON encoding. 
+ """ + self.store = store + metadata = json.loads(store[metadata_key]) + self.meta_store = {k: v.encode() for k, v in metadata.items()} + + def __getitem__(self, key): + """Try local dict before falling back to real storage""" + try: + return self.meta_store[key] + except KeyError: + return self.store[key] + + def __iter__(self): + """Only list local keys - data must be got via getitem""" + return iter(self.meta_store) + + def __len__(self): + """Only len of local keys""" + return len(self.meta_store) + + def __delitem__(self, key): + """Data can be deleted from storage""" + if key not in self: + del self.store[key] + else: + raise NotImplementedError + + def __setitem__(self, key, value): + """Data can be written to storage""" + if key not in self: + self.store[key] = value + else: + raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index cae105c23e..92984f95c1 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -92,7 +92,7 @@ def test_lazy_loader(): def test_consolidate_metadata(): - import json + from zarr.storage import ConsolidatedMetadataStore store = DictStore() z = group(store) z.create_group('g1') @@ -101,7 +101,8 @@ def test_consolidate_metadata(): arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') arr.attrs['data'] = 1 arr[:] = 1.0 - consolidate_metadata(store) + out = consolidate_metadata(store) + assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store for key in ['.zgroup', 'g1/.zgroup', @@ -110,13 +111,13 @@ def test_consolidate_metadata(): 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] - meta = json.loads(store['.zmetadata']) - meta = {k: v.encode() for k, v in meta.items()} - z2 = group(meta, chunk_store=store) + cstore = ConsolidatedMetadataStore(store) + z2 = open(cstore, mode='r') assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() + assert list(out) class TestCopyStore(unittest.TestCase): From 66663912edde64d6ec47b9627b84f7084b0c147e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:41:07 -0400 Subject: [PATCH 004/168] fix for py34 py35 --- zarr/storage.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index a8ed34773f..8494427ea7 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1912,11 +1912,15 @@ def __init__(self, store, metadata_key='.zmetadata'): store: MutableMapping Containing the zarr dataset metadata_key: str - The target in the store where all of the metadata are stores. We + The target in the store where all of the metadata are stored. We assume JSON encoding. 
""" self.store = store - metadata = json.loads(store[metadata_key]) + if sys.version_info.major == 3 and sys.version_info.minor < 6: + d = store[metadata_key].decode() + else: + d = store[metadata_key] + metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): From a369073d57436683aba4d24200c35ca67bb21f85 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 13:34:24 -0400 Subject: [PATCH 005/168] improve coverage; data write in consolidated store --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8494427ea7..ce3b0c8b4d 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1940,14 +1940,14 @@ def __len__(self): def __delitem__(self, key): """Data can be deleted from storage""" - if key not in self: + if key not in self.meta_store: del self.store[key] else: raise NotImplementedError def __setitem__(self, key, value): """Data can be written to storage""" - if key not in self: + if key not in self.meta_store: self.store[key] = value else: raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 92984f95c1..379a039e01 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -112,12 +112,18 @@ def test_consolidate_metadata(): 'g2/arr/.zattrs']: del store[key] cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore, mode='r') + z2 = open(cstore) assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() - assert list(out) + assert list(out) == list(cstore) + + # tests del/write on the store + del cstore['g2/arr/0.0'] + assert (z2.g2.arr[:] == 0).all() + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() class TestCopyStore(unittest.TestCase): From 96e1fb0b963b0f6a3164c6977a1a2dc30f1792b9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 15:37:59 -0400 Subject: [PATCH 006/168] coverage --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index ce3b0c8b4d..ece3c11677 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1917,8 +1917,8 @@ def __init__(self, store, metadata_key='.zmetadata'): """ self.store = store if sys.version_info.major == 3 and sys.version_info.minor < 6: - d = store[metadata_key].decode() - else: + d = store[metadata_key].decode() # pragma: no cover + else: # pragma: no cover d = store[metadata_key] metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 379a039e01..62da7d4b77 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -120,6 +120,10 @@ def test_consolidate_metadata(): assert list(out) == list(cstore) # tests del/write on the store + with pytest.raises(NotImplementedError): + del cstore['.zgroup'] + with pytest.raises(NotImplementedError): + cstore['.zgroup'] = None del cstore['g2/arr/0.0'] assert (z2.g2.arr[:] == 0).all() z2.g2.arr[:] = 2 From 36139cba87a6535d71fab9f3c8cb3c47da52d57e Mon Sep 17 00:00:00 2001 From: shikharsg Date: Mon, 13 Aug 2018 00:57:05 +0530 Subject: [PATCH 007/168] implemented the rest of the mutable mapping functions. 
tests pass with python 3.5 --- zarr/storage.py | 57 +++++++++++++++++++++++--------------- zarr/tests/test_storage.py | 11 +++++++- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 7964e3dd01..9096414121 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1933,27 +1933,38 @@ def __enter__(self): def __exit__(self, *args): pass + @staticmethod def _append_path_to_prefix(path, prefix): return '/'.join([normalize_storage_path(prefix), normalize_storage_path(path)]) def full_path(self, path=None): - return _append_path_to_prefix(path, self.prefix) + return self._append_path_to_prefix(path, self.prefix) def __getitem__(self, key): + from azure.common import AzureMissingResourceHttpError blob_name = '/'.join([self.prefix, key]) - blob = self.client.get_blob_to_bytes(self.container_name, blob_name) - if blob: + try: + blob = self.client.get_blob_to_bytes(self.container_name, blob_name) return blob.content - else: + except AzureMissingResourceHttpError: raise KeyError('Blob %s not found' % blob_name) def __setitem__(self, key, value): + import io blob_name = '/'.join([self.prefix, key]) - self.client.create_blob_from_text(self.container_name, blob_name, value) + buffer = io.BytesIO() + buffer.write(value) + buffer.seek(0) + self.client.create_blob_from_bytes(self.container_name, blob_name, buffer.read()) def __delitem__(self, key): - raise NotImplementedError + if self.client.exists(self.container_name, '/'.join([self.prefix, key])): + self.client.delete_blob(self.container_name, '/'.join([self.prefix, key])) + elif self.__contains__(key): + self.rmdir(key) + else: + raise KeyError def __eq__(self, other): return ( @@ -1963,13 +1974,14 @@ def __eq__(self, other): ) def keys(self): - raise NotImplementedError + return list(self.__iter__()) def __iter__(self): - raise NotImplementedError + for blob in self.client.list_blobs(self.container_name, self.prefix + '/'): + yield self._strip_prefix_from_path(blob.name, self.prefix) def __len__(self): - raise NotImplementedError + return len(self.keys()) def __contains__(self, key): blob_name = '/'.join([self.prefix, key]) @@ -1986,6 +1998,7 @@ def list_abs_subdirectories(self, prefix): """Return list of all "subdirectories" from an abs prefix.""" return list(set([blob.name.rsplit('/', 1)[0] for blob in self.client.list_blobs(self.container_name) if '/' in blob.name])) + @staticmethod def _strip_prefix_from_path(path, prefix): # normalized things will not have any leading or trailing slashes path_norm = normalize_storage_path(path) @@ -2015,24 +2028,24 @@ def dir_path(self, path=None): dir_path += '/' return dir_path - def listdir(self, path=None): - dir_path = self.dir_path(path) - return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) - - def rename(self, src_path, dst_path): - raise NotImplementedErrror + # def listdir(self, path=None): + # dir_path = self.dir_path(path) + # return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) + # + # def rename(self, src_path, dst_path): + # raise NotImplementedErrror def rmdir(self, path=None): dir_path = normalize_storage_path(self.full_path(path)) + '/' for blob in self.client.list_blobs(self.container_name, prefix=dir_path): self.client.delete_blob(self.container_name, blob.name) - def getsize(self, path=None): - dir_path = self.dir_path(path) - size = 0 - for blob in self.client.list_blobs(self.container_name, prefix=dir_path): - size += blob.properties.content_length - return size + # def getsize(self, path=None): + # dir_path = 
self.dir_path(path) + # size = 0 + # for blob in self.client.list_blobs(self.container_name, prefix=dir_path): + # size += blob.properties.content_length + # return size def clear(self): - raise NotImplementedError + self.rmdir() diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index f68f8a6ed6..515875b297 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -19,7 +19,7 @@ DirectoryStore, ZipStore, init_group, group_meta_key, getsize, migrate_1to2, TempStore, atexit_rmtree, NestedDirectoryStore, default_compressor, DBMStore, - LMDBStore, atexit_rmglob, LRUStoreCache) + LMDBStore, atexit_rmglob, LRUStoreCache, ABSStore) from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT, decode_group_metadata, encode_group_metadata) from zarr.compat import PY2 @@ -1235,3 +1235,12 @@ def test_format_compatibility(): else: assert compressor.codec_id == z.compressor.codec_id assert compressor.get_config() == z.compressor.get_config() + + +class TestABSStore(StoreTests, unittest.TestCase): + + def create_store(self): + from zarr.azureblob import BLOB_ACCOUNT_NAME, BLOB_ACCOUNT_KEY + store = ABSStore('test', 'zarrtesting/', BLOB_ACCOUNT_NAME, BLOB_ACCOUNT_KEY) + store.rmdir() + return store From bda0c3ffe826651cd6e9cff349e4f411da55ec8d Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 14 Aug 2018 13:20:16 +0530 Subject: [PATCH 008/168] using local blob emulator for storage.ABSStore testing --- zarr/storage.py | 11 ++--------- zarr/tests/test_storage.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 9096414121..80c1ffb5a3 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1907,16 +1907,10 @@ class ABSStore(MutableMapping): `Python Client Library `_ version >= 1.3.0. 
""" - def __init__(self, container_name, prefix, account_name, account_key): - self.account_name = account_name - self.account_key = account_key + def __init__(self, container_name, prefix, blob_client): + self.client = blob_client self.container_name = container_name self.prefix = normalize_storage_path(prefix) - self.initialize_container() - - def initialize_container(self): - from azure.storage.blob import BlockBlobService - self.client = BlockBlobService(self.account_name, self.account_key) # needed for pickling def __getstate__(self): @@ -1925,7 +1919,6 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - self.initialize_container() def __enter__(self): return self diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 515875b297..d6f39ea0bf 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1240,7 +1240,13 @@ def test_format_compatibility(): class TestABSStore(StoreTests, unittest.TestCase): def create_store(self): - from zarr.azureblob import BLOB_ACCOUNT_NAME, BLOB_ACCOUNT_KEY - store = ABSStore('test', 'zarrtesting/', BLOB_ACCOUNT_NAME, BLOB_ACCOUNT_KEY) + from azure.storage.blob import BlockBlobService + blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;'+\ + 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;'+\ + 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;'+\ + 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'+\ + 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' + blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + store = ABSStore('test', 'zarrtesting/', blob_client) store.rmdir() return store From 447c473d4f12fffb32e441807ebe6113862745ca Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 14 Aug 2018 13:30:01 +0530 Subject: [PATCH 009/168] fixed PY2 array.array error in storage.ABSStore --- zarr/storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/zarr/storage.py b/zarr/storage.py index 80c1ffb5a3..3c62c28fa4 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1945,6 +1945,9 @@ def __getitem__(self, key): def __setitem__(self, key, value): import io + import array + if PY2 and isinstance(value, array.array): + value = value.tostring() blob_name = '/'.join([self.prefix, key]) buffer = io.BytesIO() buffer.write(value) From c6858ed75fafc8e2278717a5d1bdb2f0c128b554 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 14 Aug 2018 18:57:07 +0530 Subject: [PATCH 010/168] create test container if not exists in ABSStore test --- zarr/tests/test_storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index d6f39ea0bf..dc0b015b5b 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1247,6 +1247,8 @@ def create_store(self): 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'+\ 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + if not blob_client.exists('test'): + blob_client.create_container('test') store = ABSStore('test', 'zarrtesting/', blob_client) store.rmdir() return store From 8e51b3bbe7c9194eb1765059c4b9e2715d8e97cb Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 00:17:25 +0530 Subject: [PATCH 011/168] added more tests for ABSStore --- zarr/tests/test_core.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 
28 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 390f888287..a42fa64064 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -16,7 +16,7 @@ from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob, - LRUStoreCache) + LRUStoreCache, ABSStore) from zarr.core import Array from zarr.errors import PermissionError from zarr.compat import PY2, text_type, binary_type @@ -1211,6 +1211,33 @@ def test_nbytes_stored(self): assert expect_nbytes_stored == z.nbytes_stored +class TestArrayWithABSStore(TestArray): + + @staticmethod + def absstore(): + from azure.storage.blob import BlockBlobService + blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;'+\ + 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;'+\ + 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;'+\ + 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'+\ + 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' + blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + if not blob_client.exists('test'): + blob_client.create_container('test') + store = ABSStore('test', 'zarrtesting/', blob_client) + store.rmdir() + return store + + def create_array(self, read_only=False, **kwargs): + store = self.absstore() + kwargs.setdefault('compressor', Zlib(1)) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): @staticmethod From ec4e3f1dd7d478cdd164bc3a2ef8c0ff0104625e Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 09:22:24 +0530 Subject: [PATCH 012/168] reverted blob client creation to inside of ABSStore --- zarr/storage.py | 10 ++++++++-- zarr/tests/test_core.py | 3 ++- zarr/tests/test_storage.py | 3 ++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 3c62c28fa4..52e43b55b3 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1907,11 +1907,16 @@ class ABSStore(MutableMapping): `Python Client Library `_ version >= 1.3.0. 
""" - def __init__(self, container_name, prefix, blob_client): - self.client = blob_client + def __init__(self, container_name, prefix, account_name, account_key): + self.account_name = account_name + self.account_key = account_key self.container_name = container_name self.prefix = normalize_storage_path(prefix) + def initialize_container(self): + from azure.storage.blob import BlockBlobService + self.client = BlockBlobService(self.account_name, self.account_key) + # needed for pickling def __getstate__(self): state = self.__dict__.copy() @@ -1919,6 +1924,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + self.initialize_container() def __enter__(self): return self diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index a42fa64064..988db09667 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1224,7 +1224,8 @@ def absstore(): blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore('test', 'zarrtesting/', blob_client) + store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') + store.client = blob_client store.rmdir() return store diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index dc0b015b5b..7d6c9ff3a7 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1249,6 +1249,7 @@ def create_store(self): blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore('test', 'zarrtesting/', blob_client) + store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') + store.client = blob_client store.rmdir() return store From bde7b5ece967d44543e2b20e5ffe8759b6ef0f7c Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 09:50:19 +0530 Subject: [PATCH 013/168] added group test for ABSStore --- zarr/tests/test_hierarchy.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index f47012cf88..e9c60b5027 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -18,7 +18,7 @@ from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array, array_meta_key, group_meta_key, atexit_rmtree, NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob, - LRUStoreCache) + LRUStoreCache, ABSStore) from zarr.core import Array from zarr.compat import PY2, text_type from zarr.hierarchy import Group, group, open_group @@ -856,6 +856,25 @@ def create_store(): return store, None +class TestGroupWithABSStore(TestGroup): + + @staticmethod + def create_store(): + from azure.storage.blob import BlockBlobService + blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;' + \ + 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;' + \ + 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;' + \ + 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;' + \ + 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' + blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + if not blob_client.exists('test'): + blob_client.create_container('test') + store = ABSStore(container_name='test', 
prefix='zarrtesting/', account_name='foo', account_key='bar') + store.client = blob_client + store.rmdir() + return store, None + + class TestGroupWithNestedDirectoryStore(TestGroup): @staticmethod From b86cf5305e9e37a2adb13358122e564e664b1a1c Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 12:35:27 +0530 Subject: [PATCH 014/168] emulator connection string not needed --- zarr/tests/test_core.py | 7 +------ zarr/tests/test_hierarchy.py | 7 +------ zarr/tests/test_storage.py | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 988db09667..9ac33da36a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1216,12 +1216,7 @@ class TestArrayWithABSStore(TestArray): @staticmethod def absstore(): from azure.storage.blob import BlockBlobService - blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;'+\ - 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;'+\ - 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;'+\ - 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'+\ - 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index e9c60b5027..029075d889 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -861,12 +861,7 @@ class TestGroupWithABSStore(TestGroup): @staticmethod def create_store(): from azure.storage.blob import BlockBlobService - blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;' + \ - 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;' + \ - 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;' + \ - 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;' + \ - 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 7d6c9ff3a7..191da27bd3 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1241,12 +1241,7 @@ class TestABSStore(StoreTests, unittest.TestCase): def create_store(self): from azure.storage.blob import BlockBlobService - blob_emulator_connection_string = 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;'+\ - 'AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;'+\ - 'BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;'+\ - 'TableEndpoint=http://127.0.0.1:10002/devstoreaccount1;'+\ - 'QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - blob_client = BlockBlobService(is_emulated=True, connection_string=blob_emulator_connection_string) + blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') store = 
ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') From f66dadd7421586c5992313c93ef16074497857c1 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 12:43:28 +0530 Subject: [PATCH 015/168] fixed import statement location and put azure-storage-blob in requirements --- requirements.txt | 1 + zarr/storage.py | 8 ++++---- zarr/tests/test_core.py | 2 +- zarr/tests/test_hierarchy.py | 2 +- zarr/tests/test_storage.py | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8720210cf5..1b6d78b7a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ pytest numpy fasteners numcodecs +azure-storage-blob diff --git a/zarr/storage.py b/zarr/storage.py index 52e43b55b3..8b5f084082 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -28,9 +28,13 @@ from threading import Lock, RLock import glob import warnings +import io +import array import numpy as np +from azure.storage.blob import BlockBlobService +from azure.common import AzureMissingResourceHttpError from zarr.util import (normalize_shape, normalize_chunks, normalize_order, @@ -1914,7 +1918,6 @@ def __init__(self, container_name, prefix, account_name, account_key): self.prefix = normalize_storage_path(prefix) def initialize_container(self): - from azure.storage.blob import BlockBlobService self.client = BlockBlobService(self.account_name, self.account_key) # needed for pickling @@ -1941,7 +1944,6 @@ def full_path(self, path=None): return self._append_path_to_prefix(path, self.prefix) def __getitem__(self, key): - from azure.common import AzureMissingResourceHttpError blob_name = '/'.join([self.prefix, key]) try: blob = self.client.get_blob_to_bytes(self.container_name, blob_name) @@ -1950,8 +1952,6 @@ def __getitem__(self, key): raise KeyError('Blob %s not found' % blob_name) def __setitem__(self, key, value): - import io - import array if PY2 and isinstance(value, array.array): value = value.tostring() blob_name = '/'.join([self.prefix, key]) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9ac33da36a..f936dfe1b1 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -12,6 +12,7 @@ import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal import pytest +from azure.storage.blob import BlockBlobService from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore, @@ -1215,7 +1216,6 @@ class TestArrayWithABSStore(TestArray): @staticmethod def absstore(): - from azure.storage.blob import BlockBlobService blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 029075d889..9f7056ef9d 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -13,6 +13,7 @@ import numpy as np from numpy.testing import assert_array_equal import pytest +from azure.storage.blob import BlockBlobService from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array, @@ -860,7 +861,6 @@ class TestGroupWithABSStore(TestGroup): @staticmethod def create_store(): - from azure.storage.blob import BlockBlobService blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 191da27bd3..ca1bfadaae 100644 --- a/zarr/tests/test_storage.py +++ 
b/zarr/tests/test_storage.py @@ -13,6 +13,7 @@ import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal import pytest +from azure.storage.blob import BlockBlobService from zarr.storage import (init_array, array_meta_key, attrs_key, DictStore, @@ -1240,7 +1241,6 @@ def test_format_compatibility(): class TestABSStore(StoreTests, unittest.TestCase): def create_store(self): - from azure.storage.blob import BlockBlobService blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') From b8f60fed587f75a97bf240b61623ab0af9fb4ed7 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 16:31:10 +0530 Subject: [PATCH 016/168] fixed pickle tests --- zarr/storage.py | 16 +++++++++------- zarr/tests/test_core.py | 4 ++-- zarr/tests/test_hierarchy.py | 4 ++-- zarr/tests/test_storage.py | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8b5f084082..721c6ac8fb 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1911,14 +1911,16 @@ class ABSStore(MutableMapping): `Python Client Library `_ version >= 1.3.0. """ - def __init__(self, container_name, prefix, account_name, account_key): + def __init__(self, container, prefix, account_name=None, account_key=None, blob_service_kwargs=None): + self.container_name = container + self.prefix = normalize_storage_path(prefix) self.account_name = account_name self.account_key = account_key - self.container_name = container_name - self.prefix = normalize_storage_path(prefix) - - def initialize_container(self): - self.client = BlockBlobService(self.account_name, self.account_key) + if blob_service_kwargs is not None: + self.blob_service_kwargs = blob_service_kwargs + else: + self.blob_service_kwargs = dict() + self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) # needed for pickling def __getstate__(self): @@ -1927,7 +1929,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - self.initialize_container() + self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) def __enter__(self): return self diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index f936dfe1b1..89bb9702b7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1219,8 +1219,8 @@ def absstore(): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') - store.client = blob_client + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', + blob_service_kwargs={'is_emulated':True}) store.rmdir() return store diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 9f7056ef9d..2cbac95bcd 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -864,8 +864,8 @@ def create_store(): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') - store.client = blob_client + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', + blob_service_kwargs={'is_emulated': True}) store.rmdir() return store, None diff --git a/zarr/tests/test_storage.py 
b/zarr/tests/test_storage.py index ca1bfadaae..7a9aa8d972 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1244,7 +1244,7 @@ def create_store(self): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container_name='test', prefix='zarrtesting/', account_name='foo', account_key='bar') - store.client = blob_client + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', + blob_service_kwargs={'is_emulated':True}) store.rmdir() return store From edd5a71022c62bd2723da38feabc216aaa25e614 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 15 Aug 2018 21:54:16 +0530 Subject: [PATCH 017/168] fixed listdir in ABSStore --- zarr/storage.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 721c6ac8fb..99fb4eff6e 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1996,11 +1996,15 @@ def __contains__(self, key): def list_abs_directory_blobs(self, prefix): """Return list of all blobs from an abs prefix.""" - return [blob.name for blob in self.client.list_blobs(self.container_name)] + return [blob.name for blob in self.client.list_blobs(self.container_name, prefix=prefix) if '/' not in blob.name[len(prefix):]] def list_abs_subdirectories(self, prefix): """Return list of all "subdirectories" from an abs prefix.""" - return list(set([blob.name.rsplit('/', 1)[0] for blob in self.client.list_blobs(self.container_name) if '/' in blob.name])) + dirs = [] + for blob in self.client.list_blobs(self.container_name, prefix=prefix): + if '/' in blob.name[len(prefix):]: + dirs.append(blob.name[:blob.name.find('/', len(prefix))]) + return dirs @staticmethod def _strip_prefix_from_path(path, prefix): @@ -2019,7 +2023,7 @@ def list_abs_directory(self, prefix, strip_prefix=True): items.update(self.list_abs_subdirectories(prefix)) items = list(items) if strip_prefix: - items = [_strip_prefix_from_path(path, prefix) for path in items] + items = [self._strip_prefix_from_path(path, prefix) for path in items] return items def dir_path(self, path=None): @@ -2028,14 +2032,13 @@ def dir_path(self, path=None): dir_path = self.prefix if store_path: dir_path = os.path.join(dir_path, store_path) - else: - dir_path += '/' + dir_path += '/' return dir_path - # def listdir(self, path=None): - # dir_path = self.dir_path(path) - # return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) - # + def listdir(self, path=None): + dir_path = self.dir_path(path) + return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) + # def rename(self, src_path, dst_path): # raise NotImplementedErrror From 3fbe5896f5b65e6aace4db4c18a85fbec31df508 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Thu, 16 Aug 2018 23:28:59 +0530 Subject: [PATCH 018/168] fixed getsize --- zarr/storage.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 99fb4eff6e..51831700e8 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -2047,12 +2047,18 @@ def rmdir(self, path=None): for blob in self.client.list_blobs(self.container_name, prefix=dir_path): self.client.delete_blob(self.container_name, blob.name) - # def getsize(self, path=None): - # dir_path = self.dir_path(path) - # size = 0 - # for blob in self.client.list_blobs(self.container_name, prefix=dir_path): - # size += blob.properties.content_length - # return size + def getsize(self, 
path=None): + store_path = normalize_storage_path(path) + fs_path = self.prefix + if store_path: + fs_path = self._append_path_to_prefix(store_path, self.prefix) + if self.client.exists(self.container_name, fs_path): + return self.client.get_blob_properties(self.container_name, fs_path).properties.content_length + else: + size = 0 + for blob_name in self.list_abs_directory_blobs(fs_path + '/'): + size += self.client.get_blob_properties(self.container_name, blob_name).properties.content_length + return size def clear(self): self.rmdir() From 4b8560e7790e4c1a7fdfeb1716a9b3f23d3cc563 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 17 Aug 2018 00:23:14 +0530 Subject: [PATCH 019/168] Fixed PY2 pickle test. python 2 pickle can't pickle instance methods --- zarr/storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/zarr/storage.py b/zarr/storage.py index 51831700e8..8168c77dfa 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1925,6 +1925,7 @@ def __init__(self, container, prefix, account_name=None, account_key=None, blob_ # needed for pickling def __getstate__(self): state = self.__dict__.copy() + del state['client'] return state def __setstate__(self, state): From 631051c080c175a33bb282088ed7802bb8b0662d Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 4 Sep 2018 17:06:12 +0530 Subject: [PATCH 020/168] implemented the suggestion from here: https://github.com/zarr-developers/zarr/pull/293#discussion_r214753603 --- zarr/storage.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8168c77dfa..0131d8a490 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1958,10 +1958,8 @@ def __setitem__(self, key, value): if PY2 and isinstance(value, array.array): value = value.tostring() blob_name = '/'.join([self.prefix, key]) - buffer = io.BytesIO() - buffer.write(value) - buffer.seek(0) - self.client.create_blob_from_bytes(self.container_name, blob_name, buffer.read()) + buffer = io.BytesIO(value) + self.client.create_blob_from_stream(self.container_name, blob_name, buffer) def __delitem__(self, key): if self.client.exists(self.container_name, '/'.join([self.prefix, key])): From ea933524d728e51470c5652cad318f4c1b4be89a Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 5 Sep 2018 14:28:33 +0530 Subject: [PATCH 021/168] flake-8 fixes --- zarr/storage.py | 27 ++++++++++++++++++--------- zarr/tests/test_core.py | 4 ++-- zarr/tests/test_hierarchy.py | 4 ++-- zarr/tests/test_storage.py | 4 ++-- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 0131d8a490..a449e54520 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1908,19 +1908,21 @@ class ABSStore(MutableMapping): Notes ----- In order to use this store, you must install the Azure Blob Storage - `Python Client Library `_ version >= 1.3.0. + https://github.com/Azure/azure-storage-python/tree/master/azure-storage-blob_ version >= 1.3.0. 
""" - def __init__(self, container, prefix, account_name=None, account_key=None, blob_service_kwargs=None): + def __init__(self, container, prefix, account_name=None, account_key=None, + blob_service_kwargs=None): self.container_name = container self.prefix = normalize_storage_path(prefix) self.account_name = account_name self.account_key = account_key if blob_service_kwargs is not None: - self.blob_service_kwargs = blob_service_kwargs + self.blob_service_kwargs = blob_service_kwargs else: - self.blob_service_kwargs = dict() - self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) + self.blob_service_kwargs = dict() + self.client = BlockBlobService(self.account_name, self.account_key, + **self.blob_service_kwargs) # needed for pickling def __getstate__(self): @@ -1930,7 +1932,8 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) + self.client = BlockBlobService(self.account_name, self.account_key, + **self.blob_service_kwargs) def __enter__(self): return self @@ -1995,7 +1998,11 @@ def __contains__(self, key): def list_abs_directory_blobs(self, prefix): """Return list of all blobs from an abs prefix.""" - return [blob.name for blob in self.client.list_blobs(self.container_name, prefix=prefix) if '/' not in blob.name[len(prefix):]] + blobs = list() + for blob in self.client.list_blobs(self.container_name, prefix=prefix): + if '/' not in blob.name[len(prefix):]: + blobs.append(blob.name) + return blobs def list_abs_subdirectories(self, prefix): """Return list of all "subdirectories" from an abs prefix.""" @@ -2052,11 +2059,13 @@ def getsize(self, path=None): if store_path: fs_path = self._append_path_to_prefix(store_path, self.prefix) if self.client.exists(self.container_name, fs_path): - return self.client.get_blob_properties(self.container_name, fs_path).properties.content_length + return self.client.get_blob_properties(self.container_name, + fs_path).properties.content_length else: size = 0 for blob_name in self.list_abs_directory_blobs(fs_path + '/'): - size += self.client.get_blob_properties(self.container_name, blob_name).properties.content_length + size += self.client.get_blob_properties(self.container_name, + blob_name).properties.content_length return size def clear(self): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 89bb9702b7..e4f0a68bef 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1219,8 +1219,8 @@ def absstore(): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', - blob_service_kwargs={'is_emulated':True}) + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', + account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() return store diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 2cbac95bcd..145124f90c 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -864,8 +864,8 @@ def create_store(): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', - blob_service_kwargs={'is_emulated': True}) + store = ABSStore(container='test', 
prefix='zarrtesting/', account_name='foo', + account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() return store, None diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 7a9aa8d972..c76ba39b9a 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1244,7 +1244,7 @@ def create_store(self): blob_client = BlockBlobService(is_emulated=True) if not blob_client.exists('test'): blob_client.create_container('test') - store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', - blob_service_kwargs={'is_emulated':True}) + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', + account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() return store From 9d1f9330ebe84e5f714914d8b340d22f61103416 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 5 Oct 2018 00:19:24 -0700 Subject: [PATCH 022/168] Create CODE_OF_CONDUCT.md --- CODE_OF_CONDUCT.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..9bc2cf9f61 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at TODO@something.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ From 260b27a64d23383b8d26d6f0d49f58ee90bd7269 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 11 Oct 2018 15:06:03 -0700 Subject: [PATCH 023/168] add email --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 9bc2cf9f61..93175dd661 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -34,7 +34,7 @@ This Code of Conduct applies both within project spaces and in public spaces whe ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at TODO@something.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at zarr.conduct@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
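Patches 007 through 021 above build ABSStore out into a complete MutableMapping over Azure Blob Storage. What follows is a minimal usage sketch, not itself one of the patches, assuming the constructor signature as it stands after patch 016 and a local Azure storage emulator; the container name 'test', the prefix 'zarrtesting/', and the dummy credentials mirror the test diffs:

    from azure.storage.blob import BlockBlobService

    import zarr
    from zarr.storage import ABSStore

    # make sure the target container exists in the emulator, as the tests do
    blob_client = BlockBlobService(is_emulated=True)
    if not blob_client.exists('test'):
        blob_client.create_container('test')

    # dummy account credentials; blob_service_kwargs routes the store's own
    # BlockBlobService client to the emulator rather than a real account
    store = ABSStore(container='test', prefix='zarrtesting/',
                     account_name='foo', account_key='bar',
                     blob_service_kwargs={'is_emulated': True})
    root = zarr.group(store=store)
    arr = root.create_dataset('arr', shape=(20, 20), chunks=(10, 10), dtype='f8')
    arr[:] = 1.0

    # chunk and metadata blobs all live under the store's prefix
    assert store.listdir('arr')

Each zarr key round-trips through a single blob, so the usual zarr semantics apply unchanged; only the latency characteristics differ from a local store.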
From 2df74ed9c6db4d3f71491b2c942ddc1b8316782c Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 14:39:17 +0100 Subject: [PATCH 024/168] fix failing pickle tests --- zarr/storage.py | 6 +++++- zarr/tests/test_core.py | 40 +++++++++++++++++++++++++++--------- zarr/tests/test_hierarchy.py | 36 +++++++++++++++++++------------- zarr/tests/test_storage.py | 20 +++++++++++++++--- 4 files changed, 74 insertions(+), 28 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 39a497d08b..a78fec28be 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1437,7 +1437,11 @@ def __init__(self, path, flag='c', mode=0o666, open=None, write_lock=True, self.open_kwargs = open_kwargs def __getstate__(self): - self.flush() # needed for py2 and ndbm + try: + self.flush() # needed for py2 and ndbm + except: + # flush may fail if db has already been closed + pass return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 390f888287..bc957031d5 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -656,19 +656,39 @@ def test_read_only(self): def test_pickle(self): + # setup array z = self.create_array(shape=1000, chunks=100, dtype=int, cache_metadata=False, cache_attrs=False) - z[:] = np.random.randint(0, 1000, 1000) - z2 = pickle.loads(pickle.dumps(z)) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype + shape = z.shape + chunks = z.chunks + dtype = z.dtype + compressor_config = None if z.compressor: - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z._cache_metadata == z2._cache_metadata - assert z.attrs.cache == z2.attrs.cache - assert_array_equal(z[:], z2[:]) + compressor_config = z.compressor.get_config() + fill_value = z.fill_value + cache_metadata = z._cache_metadata + attrs_cache = z.attrs.cache + a = np.random.randint(0, 1000, 1000) + z[:] = a + + # round trip through pickle + dump = pickle.dumps(z) + # some stores cannot be opened twice at the same time, need to close first + # store before can round-trip through pickle + if hasattr(z.store, 'close'): + z.store.close() + z2 = pickle.loads(dump) + + # verify + assert shape == z2.shape + assert chunks == z2.chunks + assert dtype == z2.dtype + if z2.compressor: + assert compressor_config == z2.compressor.get_config() + assert fill_value == z2.fill_value + assert cache_metadata == z2._cache_metadata + assert attrs_cache == z2.attrs.cache + assert_array_equal(a, z2[:]) def test_np_ufuncs(self): z = self.create_array(shape=(100, 100), chunks=(10, 10)) diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index f47012cf88..7820441c81 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -820,23 +820,31 @@ def test_paths(self): g1['foo/../bar'] def test_pickle(self): - # setup + + # setup group g = self.create_group() d = g.create_dataset('foo/bar', shape=100, chunks=10) d[:] = np.arange(100) - - # needed for zip store - if hasattr(g.store, 'flush'): - g.store.flush() - - # pickle round trip - g2 = pickle.loads(pickle.dumps(g)) - assert g.path == g2.path - assert g.name == g2.name - assert len(g) == len(g2) - assert list(g) == list(g2) - assert g['foo'] == g2['foo'] - assert g['foo/bar'] == g2['foo/bar'] + path = g.path + name = g.name + n = len(g) + keys = list(g) + + # round-trip through pickle + dump = pickle.dumps(g) + # some stores cannot be opened twice at the same time, 
need to close first
+        # store before can round-trip through pickle
+        if hasattr(g.store, 'close'):
+            g.store.close()
+        g2 = pickle.loads(dump)
+
+        # verify
+        assert path == g2.path
+        assert name == g2.name
+        assert n == len(g2)
+        assert keys == list(g2)
+        assert isinstance(g2['foo'], Group)
+        assert isinstance(g2['foo/bar'], Array)


 class TestGroupWithDictStore(TestGroup):

diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index f68f8a6ed6..10e615b56c 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -128,12 +128,25 @@ def test_iterators(self):
                          set(store.items()))

     def test_pickle(self):
+
+        # setup store
         store = self.create_store()
         store['foo'] = b'bar'
         store['baz'] = b'quux'
-        store2 = pickle.loads(pickle.dumps(store))
-        assert len(store) == len(store2)
-        assert sorted(store.keys()) == sorted(store2.keys())
+        n = len(store)
+        keys = sorted(store.keys())
+
+        # round-trip through pickle
+        dump = pickle.dumps(store)
+        # some stores cannot be opened twice at the same time, need to close first
+        # store before can round-trip through pickle
+        if hasattr(store, 'close'):
+            store.close()
+        store2 = pickle.loads(dump)
+
+        # verify
+        assert n == len(store2)
+        assert keys == sorted(store2.keys())
         assert b'bar' == store2['foo']
         assert b'quux' == store2['baz']

@@ -745,6 +758,7 @@ class TestDBMStore(StoreTests, unittest.TestCase):
     def create_store(self):
         path = tempfile.mktemp(suffix='.anydbm')
         atexit.register(atexit_rmglob, path + '*')
+        # create store using default dbm implementation
         store = DBMStore(path, flag='n')
         return store

From d28bfd3ac9e889a6446a8f8371b52fc0d01e0920 Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Thu, 18 Oct 2018 14:52:34 +0100
Subject: [PATCH 025/168] flake8

---
 zarr/storage.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zarr/storage.py b/zarr/storage.py
index a78fec28be..8b551d1254 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -1439,7 +1439,7 @@ def __getstate__(self):
         try:
             self.flush()  # needed for py2 and ndbm
-        except:
+        except Exception:
             # flush may fail if db has already been closed
             pass
         return (self.path, self.flag, self.mode, self.open, self.write_lock,
""" self.store = store - metadata = json.loads(store[metadata_key]) + if sys.version_info.major == 3 and sys.version_info.minor < 6: + d = store[metadata_key].decode() + else: + d = store[metadata_key] + metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): From 0757a72daccac7924c20bf4ee539b027cefa8d9e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 13:34:24 -0400 Subject: [PATCH 030/168] improve coverage; data write in consolidated store --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 5c9aa0d76e..3e211c608a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1944,14 +1944,14 @@ def __len__(self): def __delitem__(self, key): """Data can be deleted from storage""" - if key not in self: + if key not in self.meta_store: del self.store[key] else: raise NotImplementedError def __setitem__(self, key, value): """Data can be written to storage""" - if key not in self: + if key not in self.meta_store: self.store[key] = value else: raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 92984f95c1..379a039e01 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -112,12 +112,18 @@ def test_consolidate_metadata(): 'g2/arr/.zattrs']: del store[key] cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore, mode='r') + z2 = open(cstore) assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() - assert list(out) + assert list(out) == list(cstore) + + # tests del/write on the store + del cstore['g2/arr/0.0'] + assert (z2.g2.arr[:] == 0).all() + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() class TestCopyStore(unittest.TestCase): From da3f6d7509ebb1a82f7e2fbc60c7ef7e53df5fac Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 15:37:59 -0400 Subject: [PATCH 031/168] coverage --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 3e211c608a..e3f2506f22 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1921,8 +1921,8 @@ def __init__(self, store, metadata_key='.zmetadata'): """ self.store = store if sys.version_info.major == 3 and sys.version_info.minor < 6: - d = store[metadata_key].decode() - else: + d = store[metadata_key].decode() # pragma: no cover + else: # pragma: no cover d = store[metadata_key] metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 379a039e01..62da7d4b77 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -120,6 +120,10 @@ def test_consolidate_metadata(): assert list(out) == list(cstore) # tests del/write on the store + with pytest.raises(NotImplementedError): + del cstore['.zgroup'] + with pytest.raises(NotImplementedError): + cstore['.zgroup'] = None del cstore['g2/arr/0.0'] assert (z2.g2.arr[:] == 0).all() z2.g2.arr[:] = 2 From f921ed414584300c32e8f6ec9e1c6a34a3910b3d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 16:50:53 +0100 Subject: [PATCH 032/168] add py37; drop py34; upgrade requirements --- .travis.yml | 5 +-- appveyor.yml | 33 ++++++--------- requirements_dev.txt | 80 
++++++++++++++++++++++------------- requirements_dev_optional.txt | 4 +- tox.ini | 30 ++++++++----- 5 files changed, 86 insertions(+), 66 deletions(-) diff --git a/.travis.yml b/.travis.yml index d126fb755a..ed992b39f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,13 +13,12 @@ addons: python: - 2.7 - - 3.4 - 3.5 - 3.6 + - 3.7 install: - - pip install -U pip setuptools wheel - - pip install -U tox-travis coveralls + - pip install -U pip setuptools wheel tox-travis coveralls script: - tox diff --git a/appveyor.yml b/appveyor.yml index ef94c37a54..f0d0ca733d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -14,45 +14,36 @@ environment: - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python27-x64" PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.13.3" - DISTUTILS_USE_SDK: "1" - - - PYTHON: "C:\\Python34" - NUMPY_VERSION: "1.13.3" - PYTHON_VERSION: "3.4" - - - PYTHON: "C:\\Python34-x64" - PYTHON_VERSION: "3.4" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" DISTUTILS_USE_SDK: "1" - PYTHON: "C:\\Python35" PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python35-x64" PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python36" PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python36-x64" PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.13.3" + NUMPY_VERSION: "1.15.2" - - PYTHON: "C:\\Python36" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.14.0" + - PYTHON: "C:\\Python37" + PYTHON_VERSION: "3.7" + NUMPY_VERSION: "1.15.2" - - PYTHON: "C:\\Python36-x64" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.14.0" + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7" + NUMPY_VERSION: "1.15.2" install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" diff --git a/requirements_dev.txt b/requirements_dev.txt index 95e4a556b4..d495e04bfd 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,38 +1,60 @@ -appdirs==1.4.3 -args==0.1.0 asciitree==0.3.3 -certifi==2017.7.27.1 +asn1crypto==0.24.0 +atomicwrites==1.2.1 +attrs==18.2.0 +bleach==3.0.2 +boto3==1.9.26 +botocore==1.12.26 +certifi==2018.10.15 +cffi==1.11.5 chardet==3.0.4 -clint==0.5.1 -coverage==4.4.1 -coveralls==1.2.0 -Cython==0.27.2 +cmarkgfm==0.4.2 +configparser==3.5.0 +coverage==4.5.1 +coveralls==1.5.1 +cryptography==2.3.1 +Cython==0.29 docopt==0.6.2 +docutils==0.14 +enum34==1.1.6 fasteners==0.14.1 +filelock==3.0.9 flake8==3.5.0 -h5py==2.7.1 -idna==2.6 +funcsigs==1.0.2 +future==0.16.0 +h5py==2.8.0 +idna==2.7 +ipaddress==1.0.22 +jmespath==0.9.3 mccabe==0.6.1 -monotonic==1.3 -msgpack-python==0.4.8 -numcodecs==0.5.4 -packaging==16.8 -pkginfo==1.4.1 -pluggy==0.5.2 -py==1.4.34 -py-cpuinfo==3.3.0 +monotonic==1.5 +more-itertools==4.3.0 +msgpack-python==0.5.6 +numcodecs==0.5.5 +pathlib2==2.3.2 +pkginfo==1.4.2 +pluggy==0.8.0 +py==1.7.0 pycodestyle==2.3.1 +pycparser==2.19 pyflakes==1.6.0 -pyparsing==2.2.0 -pytest==3.2.3 -pytest-cov==2.5.1 -requests==2.18.4 +Pygments==2.2.0 +pyOpenSSL==18.0.0 +pytest==3.9.1 +pytest-cov==2.6.0 +python-dateutil==2.7.3 +readme-renderer==22.0 +requests==2.19.1 requests-toolbelt==0.8.0 -setuptools-scm==1.15.6 -s3fs==0.1.2 -tox==2.9.1 -tox-travis==0.8 -tqdm==4.19.4 -twine==1.9.1 -urllib3==1.22 -virtualenv==15.1.0 +s3fs==0.1.6 +s3transfer==0.1.13 +scandir==1.9.0 +six==1.11.0 +toml==0.10.0 +tox==3.5.2 +tox-travis==0.11 +tqdm==4.27.0 +twine==1.12.1 +urllib3==1.23 +virtualenv==16.0.0 +webencodings==0.5.1 diff --git a/requirements_dev_optional.txt 
b/requirements_dev_optional.txt
index ad6f7064c6..a4e7c2a6bd 100644
--- a/requirements_dev_optional.txt
+++ b/requirements_dev_optional.txt
@@ -1,2 +1,2 @@
-bsddb3==6.2.5
-lmdb==0.93
+bsddb3==6.2.6
+lmdb==0.94

diff --git a/tox.ini b/tox.ini
index 21e869df54..15e72d5e69 100644
--- a/tox.ini
+++ b/tox.ini
@@ -4,27 +4,35 @@
 # and then run "tox" from this directory.

 [tox]
-envlist = py27, py34, py35, py36-npy{113,114}, docs
+envlist = py27, py35, py36, py37-npy{113,114,115}, docs

 [testenv]
+install_command = pip install -v --no-binary=numcodecs {opts} {packages}
 setenv =
     PYTHONHASHSEED = 42
     # hooks for coverage exclusions based on Python major version
-    py34,py35,py36: PY_MAJOR_VERSION = py3
+    py35,py36,py37: PY_MAJOR_VERSION = py3
     py27: PY_MAJOR_VERSION = py2
 commands =
+    # clear out any data files generated during tests
     python -c 'import glob; import shutil; import os; [(shutil.rmtree(d) if os.path.isdir(d) else os.remove(d) if os.path.isfile(d) else None) for d in glob.glob("./example*")]'
-    py27,py34,py35: pytest -v --cov=zarr zarr
-    # don't run py36-npy114 with coverage because it is run together with py35-npy113 on travis
-    py36-npy114: pytest -v zarr
-    py36-npy113: pytest -v --cov=zarr --doctest-modules zarr
-    py27,py34,py35,py36-npy113: coverage report -m
-    py36-npy113: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst
-    py36-npy113: flake8 --max-line-length=100 zarr
+    # main unit test runner
+    # N.B., don't run npy113 or npy114 with coverage because it is run together with npy115 on travis
+    py27,py35,py36: pytest -v --cov=zarr --cov-config=.coveragerc zarr
+    py37-npy113: pytest -v zarr
+    py37-npy114: pytest -v zarr
+    py37-npy115: pytest -v --cov=zarr --cov-config=.coveragerc --doctest-modules zarr
+    # generate a coverage report
+    py27,py35,py36,py37-npy115: coverage report -m
+    # run doctests in the tutorial and spec
+    py37-npy115: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst
+    # pep8 checks
+    py37-npy115: flake8 --max-line-length=100 zarr
 deps =
     py27: backports.lzma
-    py27,py34,py35,py36-npy113: numpy==1.13.3
-    py36-npy114: numpy==1.14.0
+    py37-npy113: numpy==1.13.3
+    py37-npy114: numpy==1.14.6
+    py27,py35,py36,py37-npy115: numpy==1.15.2
     -rrequirements_dev.txt
     # linux only
     -rrequirements_dev_optional.txt

From f87bfa9a64f76571562216c69188a64ea559875b Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Thu, 18 Oct 2018 16:54:17 +0100
Subject: [PATCH 033/168] fix docstrings for npy115

---
 zarr/core.py     |  2 +-
 zarr/creation.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/zarr/core.py b/zarr/core.py
index 03d9bdc667..262ae5d4ef 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -2157,7 +2157,7 @@ def view(self, shape=None, chunks=None, dtype=None,
         array([0, 0, 1, ..., 1, 0, 0], dtype=uint8)
         >>> v = a.view(dtype=bool)
         >>> v[:]
-        array([False, False, True, ..., True, False, False], dtype=bool)
+        array([False, False, True, ..., True, False, False])
         >>> np.all(a[:].view(dtype=bool) == v[:])
         True

diff --git a/zarr/creation.py b/zarr/creation.py
index 004b0e4ad1..49b4a9d2ea 100644
--- a/zarr/creation.py
+++ b/zarr/creation.py
@@ -224,8 +224,8 @@ def zeros(shape, **kwargs):
     >>> z
     >>> z[:2, :2]
-    array([[ 0., 0.],
-           [ 0., 0.]])
+    array([[0., 0.],
+           [0., 0.]])

    """

@@ -245,8 +245,8 @@ def ones(shape, **kwargs):
     >>> z
     >>> z[:2, :2]
-    array([[ 1., 1.],
-           [ 1., 1.]])
+    array([[1., 1.],
+           [1., 1.]])

    """

@@ -266,8 +266,8 @@ def full(shape, fill_value, **kwargs):
     >>> z
     >>> z[:2, :2]
-    array([[
42., 42.], - [ 42., 42.]]) + array([[42., 42.], + [42., 42.]]) """ From e62ae58e2692809b79963a86e8179303e781325d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 16:55:02 +0100 Subject: [PATCH 034/168] cannot get warnings tests to work on py2, skip them --- zarr/tests/test_creation.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index eb437706f0..304714991e 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -3,7 +3,6 @@ import tempfile import shutil import atexit -import warnings import numpy as np @@ -22,12 +21,6 @@ from zarr.compat import PY2 -# needed for PY2/PY3 consistent behaviour -if PY2: # pragma: py3 no cover - warnings.resetwarnings() - warnings.simplefilter('always') - - # something bcolz-like class MockBcolzArray(object): @@ -457,12 +450,15 @@ def test_compression_args(): assert 'zlib' == z.compressor.codec_id assert 9 == z.compressor.level - with pytest.warns(UserWarning): - # 'compressor' overrides 'compression' - create(100, compressor=Zlib(9), compression='bz2', compression_opts=1) - with pytest.warns(UserWarning): - # 'compressor' ignores 'compression_opts' - create(100, compressor=Zlib(9), compression_opts=1) + # cannot get warning tests to work on PY2 + if not PY2: # pragma: py2 no cover + + with pytest.warns(UserWarning): + # 'compressor' overrides 'compression' + create(100, compressor=Zlib(9), compression='bz2', compression_opts=1) + with pytest.warns(UserWarning): + # 'compressor' ignores 'compression_opts' + create(100, compressor=Zlib(9), compression_opts=1) def test_create_read_only(): From 8fb6c1e88af7155751ff202d566e0c7802a467d2 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 16:55:29 +0100 Subject: [PATCH 035/168] tweaks to storage pickle and tests --- zarr/storage.py | 6 +++++- zarr/tests/test_storage.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8b551d1254..173325e23a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1635,7 +1635,11 @@ def __init__(self, path, buffers=True, **kwargs): self.kwargs = kwargs def __getstate__(self): - self.flush() # just in case + try: + self.flush() # just in case + except Exception: + # flush may fail if db has already been closed + pass return self.path, self.buffers, self.kwargs def __setstate__(self, state): diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 10e615b56c..92b76ca06e 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -142,6 +142,8 @@ def test_pickle(self): # store before can round-trip through pickle if hasattr(store, 'close'): store.close() + # check can still pickle after close + assert dump == pickle.dumps(store) store2 = pickle.loads(dump) # verify From 90e1d497b1436ea8b2f45a4e1959bab78c31adca Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:00:57 +0100 Subject: [PATCH 036/168] simplify PR template --- .github/PULL_REQUEST_TEMPLATE.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e31c477477..88234e69e8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,15 +2,9 @@ TODO: * [ ] Add unit tests and/or doctests in docstrings -* [ ] Unit tests and doctests pass locally under Python 3.6 (e.g., run ``tox -e py36`` or - ``pytest -v --doctest-modules zarr``) -* [ ] Unit 
tests pass locally under Python 2.7 (e.g., run ``tox -e py27`` or - ``pytest -v zarr``) -* [ ] PEP8 checks pass (e.g., run ``tox -e py36`` or ``flake8 --max-line-length=100 zarr``) * [ ] Add docstrings and API docs for any new/modified user-facing classes and functions * [ ] New/modified features documented in docs/tutorial.rst -* [ ] Doctests in tutorial pass (e.g., run ``tox -e py36`` or ``python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst``) * [ ] Changes documented in docs/release.rst * [ ] Docs build locally (e.g., run ``tox -e docs``) * [ ] AppVeyor and Travis CI passes -* [ ] Test coverage to 100% (Coveralls passes) +* [ ] Test coverage is 100% (Coveralls passes) From d1415a13469770ed9d2618833a4101661d97c0e3 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:06:58 +0100 Subject: [PATCH 037/168] get py37 working on travis --- .travis.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index ed992b39f2..8a5e1fe521 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,11 +11,14 @@ addons: packages: - libdb-dev -python: - - 2.7 - - 3.5 - - 3.6 - - 3.7 +matrix: + include: + - python: 2.7 + - python: 3.5 + - python: 3.6 + - python: 3.7 + dist: xenial + sudo: true install: - pip install -U pip setuptools wheel tox-travis coveralls From 1e51c864220fa9de513b2d45221bb301a14260d0 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:25:57 +0100 Subject: [PATCH 038/168] reduce verbosity --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 15e72d5e69..c14da54eb2 100644 --- a/tox.ini +++ b/tox.ini @@ -7,7 +7,7 @@ envlist = py27, py35, py36, py37-npy{113,114,115}, docs [testenv] -install_command = pip install -v --no-binary=numcodecs {opts} {packages} +install_command = pip install --no-binary=numcodecs {opts} {packages} setenv = PYTHONHASHSEED = 42 # hooks for coverage exclusions based on Python major version From b8856fd1afb6f6040b5bf53aee8d4eae4ca7cf3f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:26:07 +0100 Subject: [PATCH 039/168] release notes --- docs/release.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index fdcd3cb0e2..9acab25fde 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,18 @@ Release notes ============= +.. _release_2.3.0: + +2.3.0 (Work in Progress) +------------------------ + +Maintenance +~~~~~~~~~~~ + +* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and + upgrade all package requirements. :issue:`308`. + + .. _release_2.2.0: 2.2.0 From 703ed6d633d48a8725fdc94de4d51204ae97ea35 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:26:20 +0100 Subject: [PATCH 040/168] skip a doctest --- docs/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index c174a57ae5..5c090669ce 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1331,7 +1331,7 @@ internal threads. 
The number of Blosc threads can be changed to increase or decrease this number, e.g.:: >>> from zarr import blosc - >>> blosc.set_nthreads(2) + >>> blosc.set_nthreads(2) # doctest: +SKIP 8 When a Zarr array is being used within a multi-threaded program, Zarr From 887846fe72b77950fd71d8b291a294034d261f02 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:38:51 +0100 Subject: [PATCH 041/168] try to get appveyor working for py37 --- appveyor.yml | 2 +- setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index f0d0ca733d..8161f8b296 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ build: off test_script: - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install numpy==%NUMPY_VERSION%" + - "%CMD_IN_ENV% python -m pip install -v --no-binary=numcodecs numcodecs==0.5.5" - "%CMD_IN_ENV% python -m pip install -rrequirements_dev.txt" - "%CMD_IN_ENV% python setup.py install" - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" - diff --git a/setup.py b/setup.py index ae89a66ceb..a5e8334e43 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], maintainer='Alistair Miles', maintainer_email='alimanfoo@googlemail.com', From f87bfa9a64f76571562216c69188a64ea559875b Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 17:53:04 +0100 Subject: [PATCH 042/168] test different npy under py36 not py37 --- tox.ini | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tox.ini b/tox.ini index c14da54eb2..7435b01f45 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,9 @@ # and then run "tox" from this directory. 
[tox]
-envlist = py27, py35, py36, py37-npy{113,114,115}, docs
+# N.B., test different versions of numpy under py36 rather than py37
+# because wheels for npy113 not available for py37
+envlist = py27, py35, py36-npy{113,114,115}, py37, docs

 [testenv]
 install_command = pip install --no-binary=numcodecs {opts} {packages}
@@ -18,21 +20,21 @@ commands =
     python -c 'import glob; import shutil; import os; [(shutil.rmtree(d) if os.path.isdir(d) else os.remove(d) if os.path.isfile(d) else None) for d in glob.glob("./example*")]'
     # main unit test runner
     # N.B., don't run npy113 or npy114 with coverage because it is run together with npy115 on travis
-    py27,py35,py36: pytest -v --cov=zarr --cov-config=.coveragerc zarr
-    py37-npy113: pytest -v zarr
-    py37-npy114: pytest -v zarr
-    py37-npy115: pytest -v --cov=zarr --cov-config=.coveragerc --doctest-modules zarr
+    py27,py35,py36-npy115: pytest -v --cov=zarr --cov-config=.coveragerc zarr
+    py36-npy113: pytest -v zarr
+    py36-npy114: pytest -v zarr
+    py37: pytest -v --cov=zarr --cov-config=.coveragerc --doctest-modules zarr
     # generate a coverage report
-    py27,py35,py36,py37-npy115: coverage report -m
+    py27,py35,py36-npy115,py37: coverage report -m
     # run doctests in the tutorial and spec
-    py37-npy115: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst
+    py37: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst
     # pep8 checks
-    py37-npy115: flake8 --max-line-length=100 zarr
+    py37: flake8 --max-line-length=100 zarr
 deps =
     py27: backports.lzma
-    py37-npy113: numpy==1.13.3
-    py37-npy114: numpy==1.14.6
-    py27,py35,py36,py37-npy115: numpy==1.15.2
+    py36-npy113: numpy==1.13.3
+    py36-npy114: numpy==1.14.6
+    py27,py35,py36-npy115,py37: numpy==1.15.2
     -rrequirements_dev.txt
     # linux only
     -rrequirements_dev_optional.txt

From 56e2384a596a9eb47e1ea117eb33b422b6ede2f6 Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Thu, 18 Oct 2018 21:27:41 +0100
Subject: [PATCH 043/168] ensure cythonize numcodecs for py37 on appveyor

---
 appveyor.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/appveyor.yml b/appveyor.yml
index 8161f8b296..987b51c1c4 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -53,6 +53,7 @@ build: off
 test_script:
   - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel"
   - "%CMD_IN_ENV% python -m pip install numpy==%NUMPY_VERSION%"
+  - "%CMD_IN_ENV% python -m pip install cython==0.29"
   - "%CMD_IN_ENV% python -m pip install -v --no-binary=numcodecs numcodecs==0.5.5"
   - "%CMD_IN_ENV% python -m pip install -rrequirements_dev.txt"
   - "%CMD_IN_ENV% python setup.py install"
   - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr"

From b0da1f5c30da13ebd89f24fddad729f323f791d2 Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Fri, 19 Oct 2018 01:01:37 +0100
Subject: [PATCH 044/168] tweak comment [ci skip]

---
 zarr/tests/test_core.py      | 2 +-
 zarr/tests/test_hierarchy.py | 2 +-
 zarr/tests/test_storage.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index bc957031d5..374b298c22 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -673,7 +673,7 @@ def test_pickle(self):
         # round trip through pickle
         dump = pickle.dumps(z)
-        # some stores cannot be opened twice at the same time, need to close first
+        # some stores cannot be opened twice at the same time, need to close
         # store before can round-trip through pickle
         if hasattr(z.store, 'close'):
             z.store.close()

diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py
index 7820441c81..7758976c8c 100644
--- a/zarr/tests/test_hierarchy.py
+++ b/zarr/tests/test_hierarchy.py
@@ -832,7 +832,7 @@ def test_pickle(self):
         # round-trip through pickle
         dump = pickle.dumps(g)
-        # some stores cannot be opened twice at the same time, need to close first
+        # some stores cannot be opened twice at the same time, need to close
         # store before can round-trip through pickle
         if hasattr(g.store, 'close'):
             g.store.close()

diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index 92b76ca06e..79e6adaeac 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -138,7 +138,7 @@ def test_pickle(self):
         # round-trip through pickle
         dump = pickle.dumps(store)
-        # some stores cannot be opened twice at the same time, need to close first
+        # some stores cannot be opened twice at the same time, need to close
         # store before can round-trip through pickle
         if hasattr(store, 'close'):
             store.close()
From c283487acdbdde06a71745dce1793438ee2c56af Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Thu, 18 Oct 2018 23:06:57 +0100
Subject: [PATCH 051/168] doc and param style

---
 zarr/convenience.py | 30 +++++++++++++++++++-----------
 zarr/storage.py     | 22 ++++++++++------------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/zarr/convenience.py b/zarr/convenience.py
index db3fa2f85a..f651f67260 100644
--- a/zarr/convenience.py
+++ b/zarr/convenience.py
@@ -1071,37 +1071,41 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None,
     return n_copied, n_skipped, n_bytes_copied


-def consolidate_metadata(mapping, out_key='.zmetadata'):
+def consolidate_metadata(store, metadata_key='.zmetadata'):
     """
-    Store all the metadata in the files within the given dataset in one key
+    Consolidate all metadata for groups and arrays within the given store
+    into a single resource and put it under the given key.

-    This produces a single file in the backend store, containing all the
+    This produces a single object in the backend store, containing all the
     metadata read from all the zarr-related keys that can be found. This
     should be used in conjunction with ``storage.ConsolidatedMetadataStore``
     to reduce the number of operations on the backend store at read time.

-    Note, however, that if the dataset is changed after this consolidation,
-    then the metadata read by ``storage.ConsolidatedMetadataStore`` would
-    be out of sync with reality unless this function is called again.
+    Note, however, that if any metadata in the store is changed after this
+    consolidation, then the metadata read by ``storage.ConsolidatedMetadataStore``
+    would be out of sync with reality unless this function is called again.

     Parameters
     ----------
-    mapping : MutableMapping instance
-        Containing metadata and data keys of a zarr dataset
-    out_key : str
-        Key to place the consolidated data into
+    store : MutableMapping or string
+        Store or path to directory in file system or name of zip file.
+    metadata_key : str
+        Key to put the consolidated metadata under.

     Returns
     -------
     ConsolidatedMetadataStore instance, based on the same base store.
+
     """
     import json
     from .storage import ConsolidatedMetadataStore

+    store = normalize_store_arg(store)
+
     def is_zarr_key(key):
         return (key.endswith('.zarray') or key.endswith('.zgroup') or
                 key.endswith('.zattrs'))

-    out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)}
-    mapping[out_key] = json.dumps(out).encode()
-    return ConsolidatedMetadataStore(mapping, out_key)
+    out = {key: store[key].decode() for key in store if is_zarr_key(key)}
+    store[metadata_key] = json.dumps(out).encode()
+    return ConsolidatedMetadataStore(store, metadata_key)

diff --git a/zarr/storage.py b/zarr/storage.py
index 290bb8f0c9..91ebd8e382 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -1895,7 +1895,7 @@ def __delitem__(self, key):


 class ConsolidatedMetadataStore(MutableMapping):
-    """A layer over other storage, with the metadata within a single key
+    """A layer over other storage, with the metadata within a single key.

     The purpose of this class, is to be able to get all of the metadata for
     a given dataset in a single read operation from the underlying storage.
@@ -1908,21 +1908,19 @@ class ConsolidatedMetadataStore(MutableMapping):
     This class is read-only, and attempts to change the dataset metadata will
     fail, but changing the data is possible. If the backend storage is changed
     directly, then the metadata stored here could become obsolete, and
-    ``conslidate_metadata`` should be called again and the class re-invoked.
+    ``consolidate_metadata`` should be called again and the class re-invoked.
     The use case is for write once, read many times.

+    Parameters
+    ----------
+    store: MutableMapping
+        Containing the zarr dataset.
+    metadata_key: str
+        The target in the store where all of the metadata are stored. We
+        assume JSON encoding.
+
+    """
     def __init__(self, store, metadata_key='.zmetadata'):
-        """
-
-        Parameters
-        ----------
-        store: MutableMapping
-            Containing the zarr dataset
-        metadata_key: str
-            The target in the store where all of the metadata are stored. We
-            assume JSON encoding.
-        """
         self.store = store
         if sys.version_info.major == 3 and sys.version_info.minor < 6:
             d = store[metadata_key].decode()  # pragma: no cover

From 4e555488e01eff63786631bb63f8c90583c8826f Mon Sep 17 00:00:00 2001
From: Alistair Miles
Date: Thu, 18 Oct 2018 23:27:44 +0100
Subject: [PATCH 052/168] add test for nchunks_initialized

---
 zarr/tests/test_convenience.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py
index 62da7d4b77..7eb9626405 100644
--- a/zarr/tests/test_convenience.py
+++ b/zarr/tests/test_convenience.py
@@ -98,9 +98,12 @@ def test_consolidate_metadata():
     z.create_group('g1')
     g2 = z.create_group('g2')
     g2.attrs['hello'] = 'world'
-    arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8')
+    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
+    assert 16 == arr.nchunks
+    assert 0 == arr.nchunks_initialized
     arr.attrs['data'] = 1
     arr[:] = 1.0
+    assert 16 == arr.nchunks_initialized
     out = consolidate_metadata(store)
     assert isinstance(out, ConsolidatedMetadataStore)
     assert '.zmetadata' in store
@@ -113,10 +116,12 @@ def test_consolidate_metadata():
         del store[key]
     cstore = ConsolidatedMetadataStore(store)
     z2 = open(cstore)
-    assert list(z2) == ['g1', 'g2']
-    assert z2.g2.attrs['hello'] == 'world'
-    assert z2.g2.arr.attrs['data'] == 1
+    assert ['g1', 'g2'] == list(z2)
+    assert 'world' == z2.g2.attrs['hello']
+    assert 1 == z2.g2.arr.attrs['data']
     assert (z2.g2.arr[:] == 1.0).all()
+    assert 16 == z2.g2.arr.nchunks
+    assert 16 == z2.g2.arr.nchunks_initialized
     assert list(out) == list(cstore)

     # tests del/write on the store
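Putting the pieces from the last two patches together, the intended consolidated-metadata workflow looks roughly as follows (a sketch mirroring the test above, using the in-memory DictStore; with a remote store the payoff is that all metadata arrives in a single read):

import zarr
from zarr.convenience import consolidate_metadata
from zarr.storage import DictStore, ConsolidatedMetadataStore

store = DictStore()
root = zarr.group(store)
arr = root.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
arr[:] = 1.0
consolidate_metadata(store)                # writes the '.zmetadata' key

cstore = ConsolidatedMetadataStore(store)  # one read recovers all metadata
root2 = zarr.open(cstore)
assert (root2.arr[:] == 1.0).all()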
Returns ------- @@ -426,7 +428,10 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= # a : read/write if exists, create otherwise (default) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, clobber=clobber) path = normalize_storage_path(path) # API compatibility with h5py @@ -448,7 +453,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, overwrite=True, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode == 'a': if contains_group(store, path=path): @@ -457,7 +462,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_group(store, path=path): @@ -468,14 +473,15 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' # instantiate array z = Array(store, read_only=read_only, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path, + chunk_store=chunk_store) return z diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index f20b899b2b..b7359dafa7 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -1058,7 +1058,8 @@ def group(store=None, overwrite=False, chunk_store=None, cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) -def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): +def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None, + chunk_store=None): """Open a group using file-mode-like semantics. Parameters @@ -1078,6 +1079,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): Array synchronizer. path : string, optional Group path within store. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
Returns ------- @@ -1101,6 +1104,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): # handle polymorphic store arg store = _normalize_store_arg(store) + if chunk_store is not None: + chunk_store = _normalize_store_arg(chunk_store) path = normalize_storage_path(path) # ensure store is initialized @@ -1112,13 +1117,13 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): err_group_not_found(path) elif mode == 'w': - init_group(store, overwrite=True, path=path) + init_group(store, overwrite=True, path=path, chunk_store=chunk_store) elif mode == 'a': if contains_array(store, path=path): err_contains_array(path) if not contains_group(store, path=path): - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_array(store, path=path): @@ -1126,10 +1131,10 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): err_contains_group(path) elif contains_group(store, path=path): err_contains_group(path) else: - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' return Group(store, read_only=read_only, cache_attrs=cache_attrs, - synchronizer=synchronizer, path=path) + synchronizer=synchronizer, path=path, chunk_store=chunk_store) From cc9d7c774ab7c2c8aab8cd00591b47d46743a08d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:11:36 +0100 Subject: [PATCH 054/168] implement open_consolidated --- zarr/convenience.py | 23 ++++++++++++++++++-- zarr/storage.py | 29 +++++++++++-------------- zarr/tests/test_convenience.py | 39 +++++++++++++++++++++++++--------- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index f651f67260..9bc8be4438 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -68,7 +68,8 @@ def open(store, mode='a', **kwargs): path = kwargs.get('path', None) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) path = normalize_storage_path(path) if mode in {'w', 'w-', 'x'}: @@ -1108,4 +1109,22 @@ def is_zarr_key(key): out = {key: store[key].decode() for key in store if is_zarr_key(key)} store[metadata_key] = json.dumps(out).encode() - return ConsolidatedMetadataStore(store, metadata_key) + return ConsolidatedMetadataStore(store, metadata_key=metadata_key) + + +def open_consolidated(store, metadata_key='.zmetadata', mode='r'): + """TODO doc me""" + + from .storage import ConsolidatedMetadataStore + + # normalize parameters + store = normalize_store_arg(store) + if mode not in 'ra': + raise ValueError("invalid mode, expected either 'r' or 'a'; found {!r}" + .format(mode)) + + # setup metadata store + meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key) + + # pass through + return open(store=meta_store, chunk_store=store, mode=mode) diff --git a/zarr/storage.py b/zarr/storage.py index 91ebd8e382..06d2232d9f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1930,30 +1930,25 @@ def __init__(self, store, metadata_key='.zmetadata'): self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): - """Try local dict before falling back to real storage""" - try: - return self.meta_store[key] - except KeyError: - return self.store[key] + return self.meta_store[key] + + def __contains__(self, item): + return item in self.meta_store def
__iter__(self): - """Only list local keys - data must be got via getitem""" return iter(self.meta_store) def __len__(self): - """Only len of local keys""" return len(self.meta_store) def __delitem__(self, key): - """Data can be deleted from storage""" - if key not in self.meta_store: - del self.store[key] - else: - raise NotImplementedError + raise PermissionError def __setitem__(self, key, value): - """Data can be written to storage""" - if key not in self.meta_store: - self.store[key] = value - else: - raise NotImplementedError + raise PermissionError + + def getsize(self, path): + return getsize(self.meta_store, path) + + def listdir(self, path): + return listdir(self.meta_store, path) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 7eb9626405..4eebc97aea 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -12,7 +12,8 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy, consolidate_metadata +from zarr.convenience import (open, save, save_group, load, copy_store, copy, + consolidate_metadata, open_consolidated) from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group @@ -93,6 +94,8 @@ def test_lazy_loader(): def test_consolidate_metadata(): from zarr.storage import ConsolidatedMetadataStore + + # setup initial data store = DictStore() z = group(store) z.create_group('g1') @@ -104,6 +107,8 @@ def test_consolidate_metadata(): arr.attrs['data'] = 1 arr[:] = 1.0 assert 16 == arr.nchunks_initialized + + # perform consolidation out = consolidate_metadata(store) assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store @@ -114,23 +119,37 @@ def test_consolidate_metadata(): 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] - cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore) + + # open consolidated + z2 = open_consolidated(store, mode='a') assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] assert (z2.g2.arr[:] == 1.0).all() assert 16 == z2.g2.arr.nchunks assert 16 == z2.g2.arr.nchunks_initialized - assert list(out) == list(cstore) # tests del/write on the store - with pytest.raises(NotImplementedError): - del cstore['.zgroup'] - with pytest.raises(NotImplementedError): - cstore['.zgroup'] = None - del cstore['g2/arr/0.0'] - assert (z2.g2.arr[:] == 0).all() + with pytest.raises(PermissionError): + del out['.zgroup'] + with pytest.raises(PermissionError): + out['.zgroup'] = None + + # test new metadata are not writeable + with pytest.raises(PermissionError): + z2.create_group('g3') + with pytest.raises(PermissionError): + z2.create_dataset('spam', shape=42, chunks=7, dtype='i4') + with pytest.raises(PermissionError): + del z2['g2'] + + # test consolidated metadata are not writeable + with pytest.raises(PermissionError): + z2.g2.attrs['hello'] = 'universe' + with pytest.raises(PermissionError): + z2.g2.arr.attrs['foo'] = 'bar' + + # test the data are writeable z2.g2.arr[:] = 2 assert (z2.g2.arr[:] == 2).all() From a14b045237f046a5f55d09645edf96dc81dad2bd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:34:04 +0100 Subject: [PATCH 055/168] tweaks to consolidated behaviour --- zarr/__init__.py | 3 ++- zarr/convenience.py | 6 +++--- zarr/creation.py | 10 +++++----- zarr/hierarchy.py | 4 ++-- zarr/storage.py | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/zarr/__init__.py b/zarr/__init__.py index 
56d060fdac..cf34d3d427 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -12,6 +12,7 @@ from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.codecs import * from zarr.convenience import (open, save, save_array, save_group, load, copy_store, - copy, copy_all, tree) + copy, copy_all, tree, consolidate_metadata, + open_consolidated) from zarr.errors import CopyError, MetadataError, PermissionError from zarr.version import version as __version__ diff --git a/zarr/convenience.py b/zarr/convenience.py index 9bc8be4438..7bc66f98d8 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -18,12 +18,12 @@ # noinspection PyShadowingBuiltins -def open(store, mode='a', **kwargs): +def open(store=None, mode='a', **kwargs): """Convenience function to open a group or array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means @@ -1112,7 +1112,7 @@ def is_zarr_key(key): return ConsolidatedMetadataStore(store, metadata_key=metadata_key) -def open_consolidated(store, metadata_key='.zmetadata', mode='r'): +def open_consolidated(store, metadata_key='.zmetadata', mode='a'): """TODO doc me""" from .storage import ConsolidatedMetadataStore diff --git a/zarr/creation.py b/zarr/creation.py index 35cb0cf8c0..0184a4a5da 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -346,15 +346,15 @@ def array(data, **kwargs): return z -def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default', - fill_value=0, order='C', synchronizer=None, filters=None, - cache_metadata=True, cache_attrs=True, path=None, object_codec=None, - chunk_store=None, **kwargs): +def open_array(store=None, mode='a', shape=None, chunks=True, dtype=None, + compressor='default', fill_value=0, order='C', synchronizer=None, + filters=None, cache_metadata=True, cache_attrs=True, path=None, + object_codec=None, chunk_store=None, **kwargs): """Open an array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index b7359dafa7..17821130eb 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -1058,13 +1058,13 @@ def group(store=None, overwrite=False, chunk_store=None, cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) -def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None, +def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None, chunk_store=None): """Open a group using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. 
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means diff --git a/zarr/storage.py b/zarr/storage.py index 06d2232d9f..a86c7dfc05 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1942,10 +1942,10 @@ def __len__(self): return len(self.meta_store) def __delitem__(self, key): - raise PermissionError + err_read_only() def __setitem__(self, key, value): - raise PermissionError + err_read_only() def getsize(self, path): return getsize(self.meta_store, path) From 0cbda1538bf3f8fc50a174f46e55a1ffb99aa4d7 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:54:46 +0100 Subject: [PATCH 056/168] py2 fix --- zarr/tests/test_convenience.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 4eebc97aea..b7da890522 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -17,7 +17,7 @@ from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group -from zarr.errors import CopyError +from zarr.errors import CopyError, PermissionError def test_open_array(): From 21a1c610b29f9440937fb3eaf4bfb2780ddd30e0 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 30 Aug 2018 20:24:10 -0700 Subject: [PATCH 057/168] Add support for shapes in dtype definitions --- zarr/meta.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 3f48f3f3e1..27df3fdabb 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -83,10 +83,11 @@ def _decode_dtype_descr(d): # recurse to handle nested structures if PY2: # pragma: py3 no cover # under PY2 numpy rejects unicode field names - d = [(f.encode('ascii'), _decode_dtype_descr(v)) - for f, v in d] + unpack = lambda f, v, *s: (f.encode('ascii'), _decode_dtype_descr(v), *s) + d = [unpack(*k) for k in d] else: # pragma: py2 no cover - d = [(f, _decode_dtype_descr(v)) for f, v in d] + unpack = lambda f, v, *s: (f, _decode_dtype_descr(v), *s) + d = [unpack(*k) for k in d] return d From 16d59aa3d6b3ea209cc16e379dd0d8794b2bfc1d Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 30 Aug 2018 22:54:26 -0700 Subject: [PATCH 058/168] Add shape encoding and metadata test --- zarr/meta.py | 9 ++++++--- zarr/tests/test_meta.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 27df3fdabb..755a80e3a8 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -71,10 +71,13 @@ def encode_array_metadata(meta): def encode_dtype(d): - if d.fields is None: - return d.str - else: + if d.fields is not None: return d.descr + elif d.subdtype is not None: + sdname, sdshape = d.subdtype + return "%s%s" % (str(sdshape), sdname.str) + else: + return d.str def _decode_dtype_descr(d): diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index a8c781421f..4fc55c579f 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,6 +116,45 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] +def test_encode_decode_array_shape(): + + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype('(10, 10)f8'), + compressor=Zlib(1).get_config(), + fill_value=None, + filters=None, + order='C' + ) + + meta_json = '''{ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "dtype": "(10, 10) Date: Thu, 30 Aug 2018 23:04:21 -0700 Subject: [PATCH 059/168] Resolve flake complaint --- zarr/meta.py 
| 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 755a80e3a8..3f42d7464d 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -86,11 +86,9 @@ def _decode_dtype_descr(d): # recurse to handle nested structures if PY2: # pragma: py3 no cover # under PY2 numpy rejects unicode field names - unpack = lambda f, v, *s: (f.encode('ascii'), _decode_dtype_descr(v), *s) - d = [unpack(*k) for k in d] + d = [(lambda f, v, *s: (f.encode("ascii"), _decode_dtype_descr(v), *s))(*k) for k in d] else: # pragma: py2 no cover - unpack = lambda f, v, *s: (f, _decode_dtype_descr(v), *s) - d = [unpack(*k) for k in d] + d = [(lambda f, v, *s: (f, _decode_dtype_descr(v), *s))(*k) for k in d] return d From 45daca94c4e3f67283ab4e57ca17794b9deb31ce Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 30 Aug 2018 23:28:26 -0700 Subject: [PATCH 060/168] Fix Python2 syntax error and separate unpacker --- zarr/meta.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 3f42d7464d..e3de81391b 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -80,15 +80,25 @@ def encode_dtype(d): return d.str +def _unpack_fields(field, vtype, *shape): + if PY2: # pragma: py3 no cover + if len(shape): + return (field.encode("ascii"), _decode_dtype_descr(vtype), shape[0]) + else: + return (field.encode("ascii"), _decode_dtype_descr(vtype)) + else: # pragma: py2 no cover + return (field, _decode_dtype_descr(vtype), *shape) + + def _decode_dtype_descr(d): # need to convert list of lists to list of tuples if isinstance(d, list): # recurse to handle nested structures if PY2: # pragma: py3 no cover # under PY2 numpy rejects unicode field names - d = [(lambda f, v, *s: (f.encode("ascii"), _decode_dtype_descr(v), *s))(*k) for k in d] + d = [_unpack_fields(*k) for k in d] else: # pragma: py2 no cover - d = [(lambda f, v, *s: (f, _decode_dtype_descr(v), *s))(*k) for k in d] + d = [_unpack_fields(*k) for k in d] return d From c325ca4584d6b28379f5b8daabe76ea51da4d549 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 30 Aug 2018 23:46:14 -0700 Subject: [PATCH 061/168] Refactor field unpacking --- zarr/meta.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index e3de81391b..9c5fc645f3 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -80,25 +80,15 @@ def encode_dtype(d): return d.str -def _unpack_fields(field, vtype, *shape): - if PY2: # pragma: py3 no cover - if len(shape): - return (field.encode("ascii"), _decode_dtype_descr(vtype), shape[0]) - else: - return (field.encode("ascii"), _decode_dtype_descr(vtype)) - else: # pragma: py2 no cover - return (field, _decode_dtype_descr(vtype), *shape) - - def _decode_dtype_descr(d): # need to convert list of lists to list of tuples if isinstance(d, list): # recurse to handle nested structures if PY2: # pragma: py3 no cover # under PY2 numpy rejects unicode field names - d = [_unpack_fields(*k) for k in d] + d = [(k[0].encode("ascii"), _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] else: # pragma: py2 no cover - d = [_unpack_fields(*k) for k in d] + d = [(k[0], _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] return d From 32208da0b046c806478daf4963edab606917e252 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Fri, 31 Aug 2018 04:09:56 -0700 Subject: [PATCH 062/168] Expand unstructured array dimensions into shape As a result, disable dtype and shape checks in test_encode_decode_array_shape. 
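For illustration, the behaviour this change works towards (a sketch, not part of the commit; it uses the public creation API and assumes the follow-up store-initialisation fix later in this series):

    import numpy as np
    import zarr

    # each element of the requested array is itself a (10, 10) block
    z = zarr.empty(100, dtype='(10, 10)f8')

    # the subarray dimensions are folded into the array shape, and only
    # the base scalar type is kept in the stored metadata
    assert z.shape == (100, 10, 10)
    assert z.dtype == np.dtype('f8')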
One drawback is that zarr.zeros and similar functions setting fill_value != None throw exceptions. --- zarr/meta.py | 8 ++++---- zarr/tests/test_meta.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 9c5fc645f3..20fda33098 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -54,9 +54,12 @@ def decode_array_metadata(s): def encode_array_metadata(meta): dtype = meta['dtype'] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype meta = dict( zarr_format=ZARR_FORMAT, - shape=meta['shape'], + shape=meta['shape'] + sdshape, chunks=meta['chunks'], dtype=encode_dtype(dtype), compressor=meta['compressor'], @@ -73,9 +76,6 @@ def encode_array_metadata(meta): def encode_dtype(d): if d.fields is not None: return d.descr - elif d.subdtype is not None: - sdname, sdshape = d.subdtype - return "%s%s" % (str(sdshape), sdname.str) else: return d.str diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index 4fc55c579f..f84e244518 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -131,11 +131,11 @@ def test_encode_decode_array_shape(): meta_json = '''{ "chunks": [10], "compressor": {"id": "zlib", "level": 1}, - "dtype": "(10, 10) Date: Fri, 31 Aug 2018 04:48:17 -0700 Subject: [PATCH 063/168] Undo unnecessary change --- zarr/meta.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 20fda33098..291e5c6643 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -74,10 +74,10 @@ def encode_array_metadata(meta): def encode_dtype(d): - if d.fields is not None: - return d.descr - else: + if d.fields is None: return d.str + else: + return d.descr def _decode_dtype_descr(d): From 4ab7fc73bc584c6e0d7e2fb7a19945a38a2e2856 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Wed, 5 Sep 2018 03:11:55 -0700 Subject: [PATCH 064/168] Write subshape tests and remove old meta test --- zarr/tests/test_meta.py | 43 --------- zarr/tests/test_subshapes.py | 181 +++++++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+), 43 deletions(-) create mode 100644 zarr/tests/test_subshapes.py diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index f84e244518..a8c781421f 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,49 +116,6 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] -def test_encode_decode_array_shape(): - - meta = dict( - shape=(100,), - chunks=(10,), - dtype=np.dtype('(10, 10)f8'), - compressor=Zlib(1).get_config(), - fill_value=None, - filters=None, - order='C' - ) - - meta_json = '''{ - "chunks": [10], - "compressor": {"id": "zlib", "level": 1}, - "dtype": " Date: Wed, 5 Sep 2018 10:05:34 -0700 Subject: [PATCH 065/168] Revert "Write subshape tests and remove old ..." This reverts commit 55e725d3322fe1250ecf420e3e6b5222462455f5. 
--- zarr/tests/test_meta.py | 43 +++++++++ zarr/tests/test_subshapes.py | 181 ----------------------------------- 2 files changed, 43 insertions(+), 181 deletions(-) delete mode 100644 zarr/tests/test_subshapes.py diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index a8c781421f..f84e244518 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,6 +116,49 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] +def test_encode_decode_array_shape(): + + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype('(10, 10)f8'), + compressor=Zlib(1).get_config(), + fill_value=None, + filters=None, + order='C' + ) + + meta_json = '''{ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "dtype": " Date: Wed, 5 Sep 2018 10:24:16 -0700 Subject: [PATCH 066/168] Refine unstructured/structured metadata tests --- zarr/tests/test_meta.py | 49 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index f84e244518..9c4ceb44eb 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,7 +116,7 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] -def test_encode_decode_array_shape(): +def test_encode_decode_array_unstructured(): meta = dict( shape=(100,), @@ -148,11 +148,54 @@ def test_encode_decode_array_shape(): assert ZARR_FORMAT == meta_dec['zarr_format'] # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 # To maintain consistency with numpy unstructured arrays, unpack dimensions into shape. - # assert meta['shape'] == meta_dec['shape'] + assert meta['shape'] + meta['dtype'].shape == meta_dec['shape'] assert meta['chunks'] == meta_dec['chunks'] # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 + # To maintain consistency with numpy unstructured arrays, unpack dtypes. + assert meta['dtype'].base == meta_dec['dtype'] + assert meta['compressor'] == meta_dec['compressor'] + assert meta['order'] == meta_dec['order'] + assert meta_dec['fill_value'] is None + assert meta_dec['filters'] is None + + +def test_encode_decode_array_structured(): + + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype('i8, (10, 10)f8, (5, 10, 15)u1'), + compressor=Zlib(1).get_config(), + fill_value=None, + filters=None, + order='C' + ) + + meta_json = '''{ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "dtype": [["f0", " Date: Wed, 5 Sep 2018 11:31:33 -0700 Subject: [PATCH 067/168] Write crude write-read test Cases for unstructured, structured, and nested-structured dtypes. Probably not ready for production as-is. 
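One caveat with the draft below: np.frombuffer only ever produces a 1-D array, and ndarray.reshape returns a new array rather than reshaping in place, so the result needs to be assigned back for the expansion to take effect, roughly (mirroring the variables used in the test):

    shape = (100, 100)
    sbytes = np.random.bytes(np.product(shape) * dtype.itemsize)
    s = np.frombuffer(sbytes, dtype=dtype)
    s = s.reshape(shape + s.shape[1:])  # reshape returns a new array; keep the result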
--- zarr/tests/test_crude.py | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 zarr/tests/test_crude.py diff --git a/zarr/tests/test_crude.py b/zarr/tests/test_crude.py new file mode 100644 index 0000000000..08fe510ed2 --- /dev/null +++ b/zarr/tests/test_crude.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, print_function, division +from tempfile import mkdtemp +import atexit +import shutil + + +import numpy as np +import pytest + + +from zarr.creation import array, open_array +from zarr.storage import DirectoryStore + + +@pytest.mark.parametrize( + "dtype", + map(np.dtype, [ + # NOTE(onalant): unstructured dtypes + "f8", "10f8", "(10, 10)f8", + # NOTE(onalant): structured dtypes + "i8, f8", "i8, 10f8", "i8, (10, 10)f8", "i8, (10, 10)f8, (5, 10, 15)u1", + # NOTE(onalant): nested dtypes + [("f0", "i8"), ("f1", [("f0", "f8"), ("f1", "10f8"), ("f2", "(10, 10)f8")])] + ]) +) +def test_write_read(dtype): + + path = mkdtemp() + atexit.register(shutil.rmtree, path) + shape = (100, 100) + sbytes = np.random.bytes(np.product(shape) * dtype.itemsize) + s = np.frombuffer(sbytes, dtype=dtype) + # NOTE(onalant): np.frombuffer only creates 1D arrays; expand to shape + s.reshape(shape + s.shape[1:]) + + store = DirectoryStore(path) + z = array(s, store=store) + + assert(s.dtype == z.dtype) + assert(s.shape == z.shape) + assert(s.tobytes() == z[:].tobytes()) + + del store + del z + + store = DirectoryStore(path) + z = open_array(store) + + assert(s.dtype == z.dtype) + assert(s.shape == z.shape) + assert(s.tobytes() == z[:].tobytes()) + From 7bd2a7e7d63f5b16cf8c12953f088498df282845 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 13 Sep 2018 14:11:36 -0700 Subject: [PATCH 068/168] Fix dtype expansion during store initialization Otherwise, array data would not match shape with unstructured dtype. 
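The essence of the fix, as a standalone sketch (it mirrors the normalisation in the diff below):

    import numpy as np

    dtype = np.dtype('(10, 10)f8')  # subarray dtype
    shape = (100,)

    # fold the subarray dimensions into the array shape and keep only the
    # base scalar type, consistent with how numpy handles subarray dtypes
    shape = shape + dtype.shape     # -> (100, 10, 10)
    dtype = dtype.base              # -> dtype('float64')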
--- zarr/storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 173325e23a..a945b1a932 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -329,8 +329,9 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa err_contains_group(path) # normalize metadata - shape = normalize_shape(shape) dtype, object_codec = normalize_dtype(dtype, object_codec) + shape = normalize_shape(shape) + dtype.shape + dtype = dtype.base chunks = normalize_chunks(chunks, shape, dtype.itemsize) order = normalize_order(order) fill_value = normalize_fill_value(fill_value, dtype) From aad7e94c6b270382f995209eb8531e584b21132c Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 13 Sep 2018 14:12:34 -0700 Subject: [PATCH 069/168] Move crude tests to core tests --- zarr/tests/test_core.py | 98 ++++++++++++++++++++++++++++++++++++++++ zarr/tests/test_crude.py | 53 ---------------------- 2 files changed, 98 insertions(+), 53 deletions(-) delete mode 100644 zarr/tests/test_crude.py diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 374b298c22..fc4168a4f3 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -846,6 +846,28 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized + def test_unstructured_array(self): + + subshape = (2, 2) + dt = np.dtype("%sf4" % (subshape,)) + # setup some data + d = np.array([((0, 1), + (1, 2)), + ((1, 2), + (2, 3)), + ((2, 3), + (3, 4))], + dtype=dt) + + for a in (d, d[:0]): + for fill_value in None, 0: + z = self.create_array(shape=a.shape[:-len(subshape)], chunks=2, dtype=dt, fill_value=fill_value) + assert len(a) == len(z) + if fill_value is not None: + assert fill_value == z.fill_value + z[...] = a + assert_array_equal(a, z[...]) + def test_structured_array(self): # setup some data @@ -875,6 +897,70 @@ def test_structured_array(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) + def test_structured_array_subshapes(self): + + # setup some data + d = np.array([(0, ((0, 1, 2), (1, 2, 3)), b'aaa'), + (1, ((1, 2, 3), (2, 3, 4)), b'bbb'), + (2, ((2, 3, 4), (3, 4, 5)), b'ccc')], + dtype=[('foo', 'i8'), ('bar', '(2, 3)f4'), ('baz', 'S3')]) + for a in (d, d[:0]): + for fill_value in None, b'', (0, ((0, 0, 0), (1, 1, 1)), b'zzz'): + z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) + assert len(a) == len(z) + if fill_value is not None: + if fill_value == b'': + # numpy 1.14 compatibility + np_fill_value = np.array(fill_value, dtype=a.dtype.str).view(a.dtype)[()] + else: + np_fill_value = np.array(fill_value, dtype=a.dtype)[()] + assert np_fill_value == z.fill_value + if len(z): + assert np_fill_value == z[0] + assert np_fill_value == z[-1] + z[...] 
= a + if len(a): + assert a[0] == z[0] + assert_array_equal(a, z[...]) + assert_array_equal(a['foo'], z['foo']) + assert_array_equal(a['bar'], z['bar']) + assert_array_equal(a['baz'], z['baz']) + else: + # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with shapes + assert a.tobytes() == z[...].tobytes() + + def test_structured_array_nested(self): + + # setup some data + d = np.array([(0, (0, ((0, 1), (1, 2), (2, 3)), 0), b'aaa'), + (1, (1, ((1, 2), (2, 3), (3, 4)), 1), b'bbb'), + (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b'ccc')], + dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'), ('baz', 'u1')]), ('baz', 'S3')]) + for a in (d, d[:0]): + for fill_value in None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz'): + z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) + assert len(a) == len(z) + if fill_value is not None: + if fill_value == b'': + # numpy 1.14 compatibility + np_fill_value = np.array(fill_value, dtype=a.dtype.str).view(a.dtype)[()] + else: + np_fill_value = np.array(fill_value, dtype=a.dtype)[()] + assert np_fill_value == z.fill_value + if len(z): + assert np_fill_value == z[0] + assert np_fill_value == z[-1] + z[...] = a + if len(a): + assert a[0] == z[0] + assert_array_equal(a, z[...]) + assert_array_equal(a['foo'], z['foo']) + assert_array_equal(a['bar'], z['bar']) + assert_array_equal(a['baz'], z['baz']) + else: + # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with shapes + assert a.tobytes() == z[...].tobytes() + def test_dtypes(self): # integers @@ -1489,6 +1575,10 @@ class TestArrayWithFilters(TestArray): def create_array(read_only=False, **kwargs): store = dict() dtype = kwargs.get('dtype', None) + # WARN(onalant): this is to compensate for unstructured dtypes + # should this be in upstream numcodecs? 
+ if isinstance(dtype, np.dtype): + dtype = dtype.base filters = [ Delta(dtype=dtype), FixedScaleOffset(dtype=dtype, scale=1, offset=0), @@ -1563,6 +1653,14 @@ def test_structured_array(self): # skip this one, cannot do delta on structured array pass + def test_structured_array_subshapes(self): + # skip this one, cannot do delta on structured array + pass + + def test_structured_array_nested(self): + # skip this one, cannot do delta on structured array + pass + def test_dtypes(self): # skip this one, delta messes up floats pass diff --git a/zarr/tests/test_crude.py b/zarr/tests/test_crude.py deleted file mode 100644 index 08fe510ed2..0000000000 --- a/zarr/tests/test_crude.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, print_function, division -from tempfile import mkdtemp -import atexit -import shutil - - -import numpy as np -import pytest - - -from zarr.creation import array, open_array -from zarr.storage import DirectoryStore - - -@pytest.mark.parametrize( - "dtype", - map(np.dtype, [ - # NOTE(onalant): unstructured dtypes - "f8", "10f8", "(10, 10)f8", - # NOTE(onalant): structured dtypes - "i8, f8", "i8, 10f8", "i8, (10, 10)f8", "i8, (10, 10)f8, (5, 10, 15)u1", - # NOTE(onalant): nested dtypes - [("f0", "i8"), ("f1", [("f0", "f8"), ("f1", "10f8"), ("f2", "(10, 10)f8")])] - ]) -) -def test_write_read(dtype): - - path = mkdtemp() - atexit.register(shutil.rmtree, path) - shape = (100, 100) - sbytes = np.random.bytes(np.product(shape) * dtype.itemsize) - s = np.frombuffer(sbytes, dtype=dtype) - # NOTE(onalant): np.frombuffer only creates 1D arrays; expand to shape - s.reshape(shape + s.shape[1:]) - - store = DirectoryStore(path) - z = array(s, store=store) - - assert(s.dtype == z.dtype) - assert(s.shape == z.shape) - assert(s.tobytes() == z[:].tobytes()) - - del store - del z - - store = DirectoryStore(path) - z = open_array(store) - - assert(s.dtype == z.dtype) - assert(s.shape == z.shape) - assert(s.tobytes() == z[:].tobytes()) - From 1c8e45bf73f9cb52a5163bdeb9c0af23b9ddfba7 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 13 Sep 2018 14:19:12 -0700 Subject: [PATCH 070/168] Fix flake8 complaints --- zarr/tests/test_core.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index fc4168a4f3..db6b92bb8a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -848,8 +848,7 @@ def test_nchunks_initialized(self): def test_unstructured_array(self): - subshape = (2, 2) - dt = np.dtype("%sf4" % (subshape,)) + dt = "(2, 2)f4" # setup some data d = np.array([((0, 1), (1, 2)), @@ -861,7 +860,7 @@ def test_unstructured_array(self): for a in (d, d[:0]): for fill_value in None, 0: - z = self.create_array(shape=a.shape[:-len(subshape)], chunks=2, dtype=dt, fill_value=fill_value) + z = self.create_array(shape=a.shape[:-2], chunks=2, dtype=dt, fill_value=fill_value) assert len(a) == len(z) if fill_value is not None: assert fill_value == z.fill_value @@ -926,7 +925,8 @@ def test_structured_array_subshapes(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with shapes + # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with + # subshapes assert a.tobytes() == z[...].tobytes() def test_structured_array_nested(self): @@ -935,7 +935,8 @@ def test_structured_array_nested(self): d = np.array([(0, (0, ((0, 1), (1, 2), (2, 3)), 0), 
b'aaa'), (1, (1, ((1, 2), (2, 3), (3, 4)), 1), b'bbb'), (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b'ccc')], - dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'), ('baz', 'u1')]), ('baz', 'S3')]) + dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'), + ('baz', 'u1')]), ('baz', 'S3')]) for a in (d, d[:0]): for fill_value in None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz'): z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) @@ -958,7 +959,8 @@ def test_structured_array_nested(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with shapes + # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with + # subshapes assert a.tobytes() == z[...].tobytes() def test_dtypes(self): From c0eacea716da376f64cde5df5de264c3bb41af63 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 13 Sep 2018 14:45:50 -0700 Subject: [PATCH 071/168] Fix accidental test break --- zarr/tests/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index db6b92bb8a..914ebfc4d7 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1577,10 +1577,6 @@ class TestArrayWithFilters(TestArray): def create_array(read_only=False, **kwargs): store = dict() dtype = kwargs.get('dtype', None) - # WARN(onalant): this is to compensate for unstructured dtypes - # should this be in upstream numcodecs? - if isinstance(dtype, np.dtype): - dtype = dtype.base filters = [ Delta(dtype=dtype), FixedScaleOffset(dtype=dtype, scale=1, offset=0), @@ -1651,6 +1647,10 @@ def test_astype(self): expected = data.astype(astype) assert_array_equal(expected, z2) + def test_unstructured_array(self): + # skip this one, cannot do delta on unstructured array + pass + def test_structured_array(self): # skip this one, cannot do delta on structured array pass From 7356b8b674854634fed2cb4576f330be09fecb72 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 13 Sep 2018 15:10:51 -0700 Subject: [PATCH 072/168] Reference upstream numpy bug in test suite --- zarr/tests/test_core.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 914ebfc4d7..3a18feb904 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -925,8 +925,7 @@ def test_structured_array_subshapes(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with - # subshapes + # BUG(onalant): https://www.github.com/numpy/numpy/issues/11946 assert a.tobytes() == z[...].tobytes() def test_structured_array_nested(self): @@ -959,8 +958,7 @@ def test_structured_array_nested(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): numpy cannot compare empty arrays of structured dtypes with - # subshapes + # BUG(onalant): https://www.github.com/numpy/numpy/issues/11946 assert a.tobytes() == z[...].tobytes() def test_dtypes(self): From 9311f4f90fe2a391de6143bd00f755bd647ee200 Mon Sep 17 00:00:00 2001 From: Tarik Onalan Date: Thu, 11 Oct 2018 21:30:26 -0700 Subject: [PATCH 073/168] Style revisions --- zarr/tests/test_core.py | 8 ++++---- zarr/tests/test_meta.py | 14 +++++--------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/zarr/tests/test_core.py 
b/zarr/tests/test_core.py index 3a18feb904..c67339a734 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -846,7 +846,7 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized - def test_unstructured_array(self): + def test_array_dtype_shape(self): dt = "(2, 2)f4" # setup some data @@ -925,7 +925,7 @@ def test_structured_array_subshapes(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): https://www.github.com/numpy/numpy/issues/11946 + # workaround for numpy bug https://www.github.com/numpy/numpy/issues/11946 assert a.tobytes() == z[...].tobytes() def test_structured_array_nested(self): @@ -958,7 +958,7 @@ def test_structured_array_nested(self): assert_array_equal(a['bar'], z['bar']) assert_array_equal(a['baz'], z['baz']) else: - # BUG(onalant): https://www.github.com/numpy/numpy/issues/11946 + # workaround for numpy bug https://www.github.com/numpy/numpy/issues/11946 assert a.tobytes() == z[...].tobytes() def test_dtypes(self): @@ -1645,7 +1645,7 @@ def test_astype(self): expected = data.astype(astype) assert_array_equal(expected, z2) - def test_unstructured_array(self): + def test_array_dtype_shape(self): # skip this one, cannot do delta on unstructured array pass diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index 9c4ceb44eb..904c2146a7 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,7 +116,7 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] -def test_encode_decode_array_unstructured(): +def test_encode_decode_array_dtype_shape(): meta = dict( shape=(100,), @@ -146,12 +146,10 @@ def test_encode_decode_array_unstructured(): # test decoding meta_dec = decode_array_metadata(meta_enc) assert ZARR_FORMAT == meta_dec['zarr_format'] - # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 - # To maintain consistency with numpy unstructured arrays, unpack dimensions into shape. + # to maintain consistency with numpy unstructured arrays, unpack dimensions into shape assert meta['shape'] + meta['dtype'].shape == meta_dec['shape'] assert meta['chunks'] == meta_dec['chunks'] - # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 - # To maintain consistency with numpy unstructured arrays, unpack dtypes. + # to maintain consistency with numpy unstructured arrays, unpack dtypes assert meta['dtype'].base == meta_dec['dtype'] assert meta['compressor'] == meta_dec['compressor'] assert meta['order'] == meta_dec['order'] @@ -189,12 +187,10 @@ def test_encode_decode_array_structured(): # test decoding meta_dec = decode_array_metadata(meta_enc) assert ZARR_FORMAT == meta_dec['zarr_format'] - # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 - # To maintain consistency with numpy unstructured arrays, unpack dimensions into shape. + # to maintain consistency with numpy unstructured arrays, unpack dimensions into shape assert meta['shape'] + meta['dtype'].shape == meta_dec['shape'] assert meta['chunks'] == meta_dec['chunks'] - # NOTE(onalant): https://github.com/zarr-developers/zarr/pull/296#issuecomment-417608487 - # To maintain consistency with numpy unstructured arrays, unpack dtypes. 
+ # to maintain consistency with numpy unstructured arrays, unpack dimensions into shape assert meta['dtype'].base == meta_dec['dtype'] assert meta['compressor'] == meta_dec['compressor'] assert meta['order'] == meta_dec['order'] From 4daf0ddbc70a338a8fe6f5b19ff1f99626aaba05 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 12:07:40 +0100 Subject: [PATCH 074/168] clarify how to encode structured data types --- docs/spec/v2.rst | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 2a3bbd9a54..59d94c87f9 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -140,13 +140,31 @@ the `NumPy documentation on Datetimes and Timedeltas `_. For example, ``"`_. For -example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a -data type composed of three single-byte unsigned integers labelled "r", "g" and -"b". - +Structured data types (i.e., with multiple named fields) are encoded +as a list of lists, following `NumPy array protocol type descriptions +(descr) +`_. Each +sub-list has the form ``[fieldname, datatype, shape]`` where ``shape`` +is optional. ``fieldname`` is a string, ``datatype`` is a string +specifying a simple data type (see above), and ``shape`` is a list of +integers specifying subarray shape. For example, the JSON list below +defines a data type composed of three single-byte unsigned integer +fields named "r", "g" and "b":: + + [["r", "|u1"], ["g", "|u1"], ["b", "|u1"]] + +For example, the JSON list below defines a data type composed of three +fields named "x", "y" and "z", where "x" and "y" each contain 32-bit +floats, and each item in "z" is a 2 by 2 array of floats:: + + [["x", " Date: Fri, 19 Oct 2018 12:12:39 +0100 Subject: [PATCH 075/168] minor edit --- docs/spec/v2.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 59d94c87f9..67941b7902 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -530,10 +530,9 @@ initially published to clarify ambiguities and add some missing information. either arrays or groups, and if absent then custom attributes should be treated as empty. -* The specification now clarifies that structured datatypes with - subarray shapes and/or with nested structured data types are - supported, and describes the JSON syntax for encoding them in array - metadata (:issue:`111`, :issue:`296`). +* The specification now describes how structured datatypes with + subarray shapes and/or with nested structured data types are encoded + in array metadata (:issue:`111`, :issue:`296`). 
Changes from version 1 to version 2 From 6d4179ea4cff4ae9646036cb3705c009afc84717 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 12:44:26 +0100 Subject: [PATCH 076/168] test and fix for uninitialised structured array --- zarr/core.py | 6 ++- zarr/tests/test_core.py | 100 ++++++++++++++-------------------------- 2 files changed, 40 insertions(+), 66 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 262ae5d4ef..00ad269557 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1568,7 +1568,11 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, except KeyError: # chunk not initialized if self._fill_value is not None: - out[out_selection] = self._fill_value + if fields: + fill_value = self._fill_value[fields] + else: + fill_value = self._fill_value + out[out_selection] = fill_value else: diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c67339a734..11891f8fe9 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -867,17 +867,15 @@ def test_array_dtype_shape(self): z[...] = a assert_array_equal(a, z[...]) - def test_structured_array(self): - - # setup some data - d = np.array([(b'aaa', 1, 4.2), - (b'bbb', 2, 8.4), - (b'ccc', 3, 12.6)], - dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + def check_structured_array(self, d, fill_values): for a in (d, d[:0]): - for fill_value in None, b'', (b'zzz', 42, 16.8): + for fill_value in fill_values: z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) assert len(a) == len(z) + assert a.shape == z.shape + assert a.dtype == z.dtype + + # check use of fill value before array is initialised with data if fill_value is not None: if fill_value == b'': # numpy 1.14 compatibility @@ -885,81 +883,53 @@ def test_structured_array(self): else: np_fill_value = np.array(fill_value, dtype=a.dtype)[()] assert np_fill_value == z.fill_value - if len(z): + if len(a): assert np_fill_value == z[0] assert np_fill_value == z[-1] + empty = np.empty_like(a) + empty[:] = np_fill_value + assert empty[0] == z[0] + assert_array_equal(empty[0:2], z[0:2]) + assert_array_equal(empty, z[...]) + for f in a.dtype.names: + assert_array_equal(empty[f], z[f]) + + # store data in array z[...] 
= a + + # check stored data if len(a): assert a[0] == z[0] - assert_array_equal(a, z[...]) - assert_array_equal(a['foo'], z['foo']) - assert_array_equal(a['bar'], z['bar']) - assert_array_equal(a['baz'], z['baz']) + assert a[-1] == z[-1] + assert_array_equal(a[0:2], z[0:2]) + assert_array_equal(a, z[...]) + for f in a.dtype.names: + assert_array_equal(a[f], z[f]) - def test_structured_array_subshapes(self): + def test_structured_array(self): + d = np.array([(b'aaa', 1, 4.2), + (b'bbb', 2, 8.4), + (b'ccc', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + fill_values = None, b'', (b'zzz', 42, 16.8) + self.check_structured_array(d, fill_values) - # setup some data + def test_structured_array_subshapes(self): d = np.array([(0, ((0, 1, 2), (1, 2, 3)), b'aaa'), (1, ((1, 2, 3), (2, 3, 4)), b'bbb'), (2, ((2, 3, 4), (3, 4, 5)), b'ccc')], dtype=[('foo', 'i8'), ('bar', '(2, 3)f4'), ('baz', 'S3')]) - for a in (d, d[:0]): - for fill_value in None, b'', (0, ((0, 0, 0), (1, 1, 1)), b'zzz'): - z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) - assert len(a) == len(z) - if fill_value is not None: - if fill_value == b'': - # numpy 1.14 compatibility - np_fill_value = np.array(fill_value, dtype=a.dtype.str).view(a.dtype)[()] - else: - np_fill_value = np.array(fill_value, dtype=a.dtype)[()] - assert np_fill_value == z.fill_value - if len(z): - assert np_fill_value == z[0] - assert np_fill_value == z[-1] - z[...] = a - if len(a): - assert a[0] == z[0] - assert_array_equal(a, z[...]) - assert_array_equal(a['foo'], z['foo']) - assert_array_equal(a['bar'], z['bar']) - assert_array_equal(a['baz'], z['baz']) - else: - # workaround for numpy bug https://www.github.com/numpy/numpy/issues/11946 - assert a.tobytes() == z[...].tobytes() + fill_values = None, b'', (0, ((0, 0, 0), (1, 1, 1)), b'zzz') + self.check_structured_array(d, fill_values) def test_structured_array_nested(self): - - # setup some data d = np.array([(0, (0, ((0, 1), (1, 2), (2, 3)), 0), b'aaa'), (1, (1, ((1, 2), (2, 3), (3, 4)), 1), b'bbb'), (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b'ccc')], dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'), ('baz', 'u1')]), ('baz', 'S3')]) - for a in (d, d[:0]): - for fill_value in None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz'): - z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) - assert len(a) == len(z) - if fill_value is not None: - if fill_value == b'': - # numpy 1.14 compatibility - np_fill_value = np.array(fill_value, dtype=a.dtype.str).view(a.dtype)[()] - else: - np_fill_value = np.array(fill_value, dtype=a.dtype)[()] - assert np_fill_value == z.fill_value - if len(z): - assert np_fill_value == z[0] - assert np_fill_value == z[-1] - z[...] 
= a - if len(a): - assert a[0] == z[0] - assert_array_equal(a, z[...]) - assert_array_equal(a['foo'], z['foo']) - assert_array_equal(a['bar'], z['bar']) - assert_array_equal(a['baz'], z['baz']) - else: - # workaround for numpy bug https://www.github.com/numpy/numpy/issues/11946 - assert a.tobytes() == z[...].tobytes() + fill_values = None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz') + self.check_structured_array(d, fill_values) def test_dtypes(self): From 5c95aaae854603a2533abf7f3da1aa0b4030804f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 13:49:22 +0100 Subject: [PATCH 077/168] release notes [ci skip] --- docs/release.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index 9acab25fde..7968840cb0 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -6,12 +6,23 @@ Release notes 2.3.0 (Work in Progress) ------------------------ +Enhancements +~~~~~~~~~~~~ + +* Support has been added for structured arrays with sub-array shape and/or nested fields. By + :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. + Maintenance ~~~~~~~~~~~ * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and - upgrade all package requirements. :issue:`308`. + upgrade all pinned package requirements. :issue:`308`. +* Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, + :issue:`273`, :issue:`308`. + +Acknowledgments +~~~~~~~~~~~~~~~ .. _release_2.2.0: From 0cb9f7766edc112e513db8216f473693f59f270e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 19 Oct 2018 17:29:02 -0400 Subject: [PATCH 078/168] Fix typo in comment --- zarr/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/indexing.py b/zarr/indexing.py index ecbc403509..52e11fbf28 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -678,7 +678,7 @@ def __init__(self, selection, array): else: sel_sort = None - # store atrributes + # store attributes self.selection = selection self.sel_sort = sel_sort self.shape = selection[0].shape if selection[0].shape else (1,) From b4b60aa591f8ce9205e76a9382c5e94aaa16a363 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 23 Oct 2018 10:03:04 -0400 Subject: [PATCH 079/168] Update docstrings --- zarr/convenience.py | 41 +++++++++++++++++++++++++++------- zarr/tests/test_convenience.py | 7 +++--- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 7bc66f98d8..d45dadc715 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1080,11 +1080,13 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): This produces a single object in the backend store, containing all the metadata read from all the zarr-related keys that can be found. This should be used in conjunction with ``storage.ConsolidatedMetadataStore`` - to reduce the number of operations on the backend store at read time. + to reduce the number of operations on the backend store at read time; + normally, users will call ``open_consolidated()`` to open in optimised, + read-only mode. - Note, however, that if any metadata in the store is changed after this - consolidation, then the metadata read by ``storage.ConsolidatedMetadataStore`` - would be out of sync with reality unless this function is called again. 
+ Note, that if the metadata in the store is changed after this + consolidation, then the metadata read by ``open_consolidated()`` + would be incorrect unless this function is called again. Parameters ---------- @@ -1095,11 +1097,10 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): Returns ------- - ConsolidatedMetadataStore instance, based on the same base store. + Group instance, opened with the new consolidated metadata """ import json - from .storage import ConsolidatedMetadataStore store = normalize_store_arg(store) @@ -1109,11 +1110,35 @@ def is_zarr_key(key): out = {key: store[key].decode() for key in store if is_zarr_key(key)} store[metadata_key] = json.dumps(out).encode() - return ConsolidatedMetadataStore(store, metadata_key=metadata_key) + return open_consolidated(store, metadata_key=metadata_key) def open_consolidated(store, metadata_key='.zmetadata', mode='a'): - """TODO doc me""" + """Open group using metadata consolidated into a single key + + This is an optimised method for opening a Zarr group, where instead of + traversing the group/array hierarchy by accessing the metadata keys at + each level, a single key contains all of the metadata for everything. + For remote data sources where the overhead of accessing a key is large + compared to the time to read data. + + The group accessed must have already had its metadata consolidated into a + single key using the function ``consolidate_metadata()``. + + This optimised method only works in modes which do not change the + metadata, although the data may still be written/updated. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to read the consolidated metadata from. The default (.zmetadata) + corresponds to the default used by ``consolidate_metadata()``. + mode : {'r', 'a'}, optional + Persistence mode. Only modes which cannot change the metadata are + allowed. + """ from .storage import ConsolidatedMetadataStore diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index b7da890522..07e4451f45 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -110,7 +110,7 @@ def test_consolidate_metadata(): # perform consolidation out = consolidate_metadata(store) - assert isinstance(out, ConsolidatedMetadataStore) + assert isinstance(out, Group) assert '.zmetadata' in store for key in ['.zgroup', 'g1/.zgroup', @@ -130,10 +130,11 @@ def test_consolidate_metadata(): assert 16 == z2.g2.arr.nchunks_initialized # tests del/write on the store + cmd = ConsolidatedMetadataStore(store) with pytest.raises(PermissionError): - del out['.zgroup'] + del cmd['.zgroup'] with pytest.raises(PermissionError): - out['.zgroup'] = None + cmd['.zgroup'] = None # test new metadata are not writeable with pytest.raises(PermissionError): From cae30daa6d735e36aecb5da72a9d082b938ab105 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 31 Oct 2018 17:00:07 -0500 Subject: [PATCH 080/168] added api docs; consistify references in docstrings --- docs/api/convenience.rst | 2 ++ docs/api/storage.rst | 2 ++ zarr/convenience.py | 61 +++++++++++++++++++++++++++------------- zarr/storage.py | 19 +++++++++---- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/docs/api/convenience.rst b/docs/api/convenience.rst index 51997a4dc2..a70a90ce7c 100644 --- a/docs/api/convenience.rst +++ b/docs/api/convenience.rst @@ -10,3 +10,5 @@ Convenience functions (``zarr.convenience``) .. 
autofunction:: copy_all .. autofunction:: copy_store .. autofunction:: tree +.. autofunction:: consolidate_metadata +.. autofunction:: open_consolidated diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 2365359fa9..74801d3115 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -27,6 +27,8 @@ Storage (``zarr.storage``) .. automethod:: invalidate_values .. automethod:: invalidate_keys +.. autoclass:: ConsolidatedMetadataStore + .. autofunction:: init_array .. autofunction:: init_group .. autofunction:: contains_array diff --git a/zarr/convenience.py b/zarr/convenience.py index d45dadc715..0f4dfa9094 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -31,12 +31,17 @@ def open(store=None, mode='a', **kwargs): exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). **kwargs - Additional parameters are passed through to :func:`zarr.open_array` or - :func:`zarr.open_group`. + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. + + Returns + ------- + z : :class:`zarr.core.Array` or :class:`zarr.hierarchy.Group` + Array or group, depending on what exists in the given store. See Also -------- - zarr.open_array, zarr.open_group + zarr.creation.open_array, zarr.hierarchy.open_group Examples -------- @@ -1078,16 +1083,17 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): into a single resource and put it under the given key. This produces a single object in the backend store, containing all the - metadata read from all the zarr-related keys that can be found. This - should be used in conjunction with ``storage.ConsolidatedMetadataStore`` - to reduce the number of operations on the backend store at read time; - normally, users will call ``open_consolidated()`` to open in optimised, - read-only mode. + metadata read from all the zarr-related keys that can be found. After + metadata have been consolidated, use :func:`open_consolidated` to open + the root group in optimised, read-only mode, using the consolidated + metadata to reduce the number of read operations on the backend store. Note that if the metadata in the store is changed after this - consolidation, then the metadata read by ``open_consolidated()`` + consolidation, then the metadata read by :func:`open_consolidated` would be incorrect unless this function is called again. + .. note:: This is an experimental feature. + Parameters ---------- store : MutableMapping or string @@ -1097,7 +1103,12 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): Returns ------- - Group instance, opened with the new consolidated metadata + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the new consolidated metadata. + + See Also + -------- + open_consolidated """ import json @@ -1113,8 +1124,8 @@ def is_zarr_key(key): return open_consolidated(store, metadata_key=metadata_key) -def open_consolidated(store, metadata_key='.zmetadata', mode='a'): - """Open group using metadata consolidated into a single key +def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): + """Open group using metadata previously consolidated into a single key. This is an optimised method for opening a Zarr group, where instead of traversing the group/array hierarchy by accessing the metadata keys at each level, a single key contains all of the metadata for everything. This is most useful for remote data sources, where the overhead of accessing a key is large compared to the time to read data.
The group accessed must have already had its metadata consolidated into a - single key using the function ``consolidate_metadata()``. + single key using the function :func:`consolidate_metadata`. This optimised method only works in modes which do not change the metadata, although the data may still be written/updated. @@ -1134,18 +1145,30 @@ def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): Store or path to directory in file system or name of zip file. metadata_key : str Key to read the consolidated metadata from. The default (.zmetadata) - corresponds to the default used by ``consolidate_metadata()``. - mode : {'r', 'a'}, optional - Persistence mode. Only modes which cannot change the metadata are - allowed. + corresponds to the default used by :func:`consolidate_metadata`. + mode : {'r', 'r+'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist), although only writes to data are allowed; + changes to metadata, including creation of new arrays or groups, + are not allowed. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the consolidated metadata. + + See Also + -------- + consolidate_metadata + """ from .storage import ConsolidatedMetadataStore # normalize parameters store = normalize_store_arg(store) - if mode not in 'ra': - raise ValueError("invalid mode, expected either 'r' or 'a'; found {!r}" + if mode not in {'r', 'r+'}: + raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}" .format(mode)) # setup metadata store diff --git a/zarr/storage.py b/zarr/storage.py index a86c7dfc05..f79c313cdf 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1895,12 +1895,13 @@ def __delitem__(self, key): class ConsolidatedMetadataStore(MutableMapping): - """A layer over other storage, with the metadata within a single key. + """A layer over other storage, where the metadata has been consolidated into + a single key. The purpose of this class is to be able to get all of the metadata for a given dataset in a single read operation from the underlying storage. - See ``convenience.consolidate_metadata()`` for how to create this single - metadata key. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. This class loads from the one key, and stores the data in a dict, so that accessing the keys no longer requires operations on the backend store. This class is read-only, and attempts to change the dataset metadata will fail, but changing the data is possible. If the backend storage is changed directly, then the metadata stored here could become obsolete, and - ``consolidate_metadata`` should be called again and the class re-invoked. - The use case is for write once, read many times. + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. versionadded:: 2.3 + + .. note:: This is an experimental feature. Parameters ---------- store : MutableMapping Containing the zarr array. metadata_key : str The target in the store where all of the metadata are stored. We assume JSON encoding.
+ See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + """ def __init__(self, store, metadata_key='.zmetadata'): self.store = store From ba99cfaea09d41f714f991fe097eed13d301a3ec Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 09:00:40 -0500 Subject: [PATCH 081/168] add tests --- zarr/tests/test_convenience.py | 14 +++++++++++--- zarr/tests/test_creation.py | 9 +++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 07e4451f45..91a3418a96 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -14,7 +14,7 @@ from zarr.convenience import (open, save, save_group, load, copy_store, copy, consolidate_metadata, open_consolidated) -from zarr.storage import atexit_rmtree, DictStore +from zarr.storage import atexit_rmtree, DictStore, getsize, ConsolidatedMetadataStore from zarr.core import Array from zarr.hierarchy import Group, group from zarr.errors import CopyError, PermissionError @@ -93,7 +93,6 @@ def test_lazy_loader(): def test_consolidate_metadata(): - from zarr.storage import ConsolidatedMetadataStore # setup initial data store = DictStore() @@ -121,7 +120,7 @@ def test_consolidate_metadata(): del store[key] # open consolidated - z2 = open_consolidated(store, mode='a') + z2 = open_consolidated(store, mode='r+') assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] @@ -136,6 +135,9 @@ def test_consolidate_metadata(): with pytest.raises(PermissionError): cmd['.zgroup'] = None + # test getsize on the store + assert getsize(cmd) == getsize(store) + # test new metadata are not writeable with pytest.raises(PermissionError): z2.create_group('g3') @@ -154,6 +156,12 @@ def test_consolidate_metadata(): z2.g2.arr[:] = 2 assert (z2.g2.arr[:] == 2).all() + # test invalid modes + with pytest.raises(ValueError): + open_consolidated(store, mode='a') + with pytest.raises(ValueError): + open_consolidated(store, mode='w') + class TestCopyStore(unittest.TestCase): diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 304714991e..ef2232c234 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -3,6 +3,7 @@ import tempfile import shutil import atexit +import os.path import numpy as np @@ -240,6 +241,14 @@ def test_open_array(): assert isinstance(z, Array) assert 'foo/bar' == z.path + # with chunk store + meta_store = 'data/meta.zarr' + chunk_store = 'data/chunks.zarr' + z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode='w') + z[:] = 42 + assert os.path.abspath(meta_store) == z.store.path + assert os.path.abspath(chunk_store) == z.chunk_store.path + def test_empty_like(): From 6f01dece292e8bb0fd6e0a01b5178dcd1887518d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 1 Nov 2018 13:25:27 -0400 Subject: [PATCH 082/168] Add section to tutorial, add to release notes --- docs/release.rst | 9 +++++++++ docs/tutorial.rst | 27 +++++++++++++++++++++++++++ zarr/tests/test_convenience.py | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index 9acab25fde..a2428e990e 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -6,6 +6,15 @@ Release notes 2.3.0 (Work in Progress) ------------------------ +Enhancements +~~~~~~~~~~~~ + +* Add "consolidated" metadata as an experimental option: use :func:`zarr.consolidate_metadata` to copy + all metadata from the various keys 
within a data-set under a single key, and + :func:`zarr.open_consolidated` to use this single key. This can greatly cut down the + number of calls to the storage backend, and so remove a lot of of over head for + remote data. :issue:`268`. + Maintenance ~~~~~~~~~~~ diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 5c090669ce..7a0d32ea43 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -804,6 +804,33 @@ interface to the storage. .. _tutorial_copy: +Consolidating metadata +~~~~~~~~~~~~~~~~~~~~~~ + +(This is an experimental feature.) + +Since there is a significant overhead for every connection to s3, the pattern described in +the previous section may incur significant latency while scanning the metadata of the data-set +hierarchy, even though each individual file is small. For cases such as these, once the file +is static and can be regarded as read-only, at least for the metadata/structure of the +data-set, the many metadata files can be consolidated into a single one. +Doing this can greatly increase the speed of reading the data-set hierarchy:: + + >>> zarr.consolidate_metadata(store) + +Creates a special key with a copy of all of the metadata from the many files. +Later:: + + >>> root = zarr.open_consolidated(store) + +Uses this special key to read all of the metadata in a single call to the backend storage. + +Note that, the data-set could still be opened in the normal way and altered, causing the +consolidated metadata to become out of sync with the real state of the data-set. In this +case, :func:`zarr.consolidate_metadata` would need to be called again. The data-set +returned by :func:`zarr.open_consolidated` is read-only for the metadata, but the data +values can still be updated. + Copying/migrating data ---------------------- diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 07e4451f45..c6439c7c8e 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -121,7 +121,7 @@ def test_consolidate_metadata(): del store[key] # open consolidated - z2 = open_consolidated(store, mode='a') + z2 = open_consolidated(store) assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] From f5130ac890dc6f0430f3d77da95de3add6a5f48e Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 12:46:44 -0500 Subject: [PATCH 083/168] fix getsize test --- zarr/tests/test_convenience.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 91a3418a96..0166df6320 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -136,7 +136,7 @@ def test_consolidate_metadata(): cmd['.zgroup'] = None # test getsize on the store - assert getsize(cmd) == getsize(store) + assert isinstance(getsize(cmd), int) # test new metadata are not writeable with pytest.raises(PermissionError): From 3d3cb2f73c2992faeb6eaaed939ec280d0c68a6a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:11:48 -0400 Subject: [PATCH 084/168] add setuptools-scm to dev env so can go fully offline --- requirements_dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_dev.txt b/requirements_dev.txt index d495e04bfd..671fc789f6 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -46,6 +46,7 @@ python-dateutil==2.7.3 readme-renderer==22.0 requests==2.19.1 requests-toolbelt==0.8.0 +setuptools-scm=3.1.0 s3fs==0.1.6 s3transfer==0.1.13 scandir==1.9.0 From 
8acf83a4b9c70915589df3fa5724209264d81911 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:15:43 -0400 Subject: [PATCH 085/168] fix requirements --- requirements_dev.txt | 2 +- zarr/tests/test_convenience.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 671fc789f6..23de426def 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -46,7 +46,7 @@ python-dateutil==2.7.3 readme-renderer==22.0 requests==2.19.1 requests-toolbelt==0.8.0 -setuptools-scm=3.1.0 +setuptools-scm==3.1.0 s3fs==0.1.6 s3transfer==0.1.13 scandir==1.9.0 diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 773a135411..12bfab4a5a 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -4,6 +4,7 @@ import atexit import os import unittest +from numbers import Integral import numpy as np @@ -136,7 +137,7 @@ def test_consolidate_metadata(): cmd['.zgroup'] = None # test getsize on the store - assert isinstance(getsize(cmd), int) + assert isinstance(getsize(cmd), Integral) # test new metadata are not writeable with pytest.raises(PermissionError): From 2f8953543636f559750de0c643c221298495409a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:34:25 -0400 Subject: [PATCH 086/168] skip consolidate doctests; minor edits --- docs/tutorial.rst | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7a0d32ea43..606b5acef5 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -778,9 +778,11 @@ chunk size, which will reduce the number of chunks and thus reduce the number of round-trips required to retrieve data for an array (and thus reduce the impact of network latency). Another option is to try to increase the compression ratio by changing compression options or trying a different compressor (which will reduce the impact of -limited network bandwidth). As of version 2.2, Zarr also provides the -:class:`zarr.storage.LRUStoreCache` which can be used to implement a local in-memory cache -layer over a remote store. E.g.:: +limited network bandwidth). + +As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache` +which can be used to implement a local in-memory cache layer over a remote +store. E.g.:: >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) @@ -797,10 +799,10 @@ layer over a remote store. E.g.:: b'Hello from the cloud!' 0.0009490990014455747 -If you are still experiencing poor performance with distributed/cloud storage, please -raise an issue on the GitHub issue tracker with any profiling data you can provide, as -there may be opportunities to optimise further either within Zarr or within the mapping -interface to the storage. +If you are still experiencing poor performance with distributed/cloud storage, +please raise an issue on the GitHub issue tracker with any profiling data you +can provide, as there may be opportunities to optimise further either within +Zarr or within the mapping interface to the storage. .. _tutorial_copy: @@ -809,27 +811,38 @@ Consolidating metadata (This is an experimental feature.) 
-Since there is a significant overhead for every connection to s3, the pattern described in -the previous section may incur significant latency while scanning the metadata of the data-set -hierarchy, even though each individual file is small. For cases such as these, once the file -is static and can be regarded as read-only, at least for the metadata/structure of the -data-set, the many metadata files can be consolidated into a single one. -Doing this can greatly increase the speed of reading the data-set hierarchy:: +Since there is a significant overhead for every connection to a cloud object +store such as S3, the pattern described in the previous section may incur +significant latency while scanning the metadata of the dataset hierarchy, even +though each individual metadata object is small. For cases such as these, once +the data are static and can be regarded as read-only, at least for the +metadata/structure of the dataset hierarchy, the many metadata objects can be +consolidated into a single one via +:func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase +the speed of reading the dataset metadata, e.g.:: + + >>> zarr.consolidate_metadata(store) # doctest: +SKIP + +This creates a special key with a copy of all of the metadata from all of the +metadata objects in the store. - >>> zarr.consolidate_metadata(store) +Later, to open a Zarr store with consolidated metadata, use +:func:`zarr.convenience.open_consolidated`, e.g.:: -Creates a special key with a copy of all of the metadata from the many files. -Later:: + >>> root = zarr.open_consolidated(store) # doctest: +SKIP - >>> root = zarr.open_consolidated(store) +This uses the special key to read all of the metadata in a single call to the +backend storage. -Uses this special key to read all of the metadata in a single call to the backend storage. +Note that the hierarchy could still be opened in the normal way and altered, +causing the consolidated metadata to become out of sync with the real state of +the dataset hierarchy. In this case, +:func:`zarr.convenience.consolidate_metadata` would need to be called again. -Note that, the data-set could still be opened in the normal way and altered, causing the -consolidated metadata to become out of sync with the real state of the data-set. In this -case, :func:`zarr.consolidate_metadata` would need to be called again. The data-set -returned by :func:`zarr.open_consolidated` is read-only for the metadata, but the data -values can still be updated. +To protect against consolidated metadata accidentally getting out of sync, the +root group returned by :func:`zarr.convenience.open_consolidated` is read-only +for the metadata, meaning that no new groups or arrays can be created, and +arrays cannot be resized. However, data values within arrays can still be updated.
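+
+As a rough end-to-end sketch of the workflow (the directory path here is
+hypothetical, and a writeable local store is assumed)::
+
+    >>> store = zarr.DirectoryStore('data/example.zarr')  # doctest: +SKIP
+    >>> root = zarr.group(store)  # doctest: +SKIP
+    >>> z = root.zeros('foo/bar', shape=(100, 100), chunks=(10, 10))  # doctest: +SKIP
+    >>> zarr.consolidate_metadata(store)  # doctest: +SKIP
+    >>> root = zarr.open_consolidated(store)  # doctest: +SKIP
+    >>> root['foo/bar'].shape  # doctest: +SKIP
+    (100, 100)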
Copying/migrating data ---------------------- From c8ed0f60838cbd79ec684ba5be44633829543299 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 18:16:02 -0400 Subject: [PATCH 087/168] fix refs [ci skip] --- docs/release.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 3421057fe4..96ac7c8f2f 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -9,11 +9,12 @@ Release notes Enhancements ~~~~~~~~~~~~ -* Add "consolidated" metadata as an experimental option: use :func:`zarr.consolidate_metadata` to copy - all metadata from the various keys within a data-set under a single key, and - :func:`zarr.open_consolidated` to use this single key. This can greatly cut down the - number of calls to the storage backend, and so remove a lot of of over head for - remote data. By :user:`Martin Durant `, :issue:`268`. +* Add "consolidated" metadata as an experimental feature: use + :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various + metadata keys within a dataset hierarchy under a single key, and + :func:`zarr.convenience.open_consolidated` to use this single key. This can greatly + cut down the number of calls to the storage backend, and so remove a lot of overhead + for reading remote data. By :user:`Martin Durant `, :issue:`268`. * Support has been added for structured arrays with sub-array shape and/or nested fields. By :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. From 9c0c621194d4a061e609a4ed5b48f64b7934586e Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 3 Nov 2018 17:09:21 -0400 Subject: [PATCH 088/168] make consolidated metadata human-readable --- zarr/attrs.py | 4 ++-- zarr/compat.py | 4 ++++ zarr/convenience.py | 12 ++++++++++-- zarr/core.py | 3 +++ zarr/meta.py | 43 +++++++++++++++++++++++++++++++------------ zarr/storage.py | 16 +++++++++++++--- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/zarr/attrs.py b/zarr/attrs.py index 6d74d6479a..21cb77bc10 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -4,8 +4,8 @@ from collections import MutableMapping -from zarr.compat import text_type from zarr.errors import PermissionError +from zarr.meta import parse_metadata class Attributes(MutableMapping): @@ -43,7 +43,7 @@ def _get_nosync(self): except KeyError: d = dict() else: - d = json.loads(text_type(data, 'ascii')) + d = parse_metadata(data) return d def asdict(self): diff --git a/zarr/compat.py b/zarr/compat.py index 9be3384123..117a8edf59 100644 --- a/zarr/compat.py +++ b/zarr/compat.py @@ -19,6 +19,8 @@ class PermissionError(Exception): def OrderedDict_move_to_end(od, key): od[key] = od.pop(key) + from collections import Mapping + else: # pragma: py2 no cover @@ -29,3 +31,5 @@ def OrderedDict_move_to_end(od, key): def OrderedDict_move_to_end(od, key): od.move_to_end(key) + + from collections.abc import Mapping diff --git a/zarr/convenience.py b/zarr/convenience.py index 0f4dfa9094..27b0655baa 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -15,6 +15,7 @@ from zarr.errors import err_path_not_found, CopyError from zarr.util import normalize_storage_path, TreeViewer, buffer_size from zarr.compat import PY2, text_type +from zarr.meta import ensure_str, json_dumps # noinspection PyShadowingBuiltins @@ -1119,8 +1120,15 @@ def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or key.endswith('.zattrs')) - out = {key: store[key].decode() for key in store if is_zarr_key(key)} - store[metadata_key] = 
json.dumps(out).encode() +# out = {key: store[key].decode() for key in store if is_zarr_key(key)} + out = { + 'zarr_consolidated_format': 1, + 'metadata': { + key: json.loads(ensure_str(store[key])) + for key in store if is_zarr_key(key) + } + } + store[metadata_key] = json_dumps(out).encode() return open_consolidated(store, metadata_key=metadata_key) diff --git a/zarr/core.py b/zarr/core.py index 00ad269557..b4da45cd99 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -165,6 +165,9 @@ def _load_metadata_nosync(self): if config is None: self._compressor = None else: + # temporary workaround for + # https://github.com/zarr-developers/numcodecs/issues/78 + config = dict(config) self._compressor = get_codec(config) # setup filters diff --git a/zarr/meta.py b/zarr/meta.py index 291e5c6643..bef53c2917 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -7,14 +7,14 @@ import numpy as np -from zarr.compat import PY2, binary_type +from zarr.compat import PY2, binary_type, Mapping from zarr.errors import MetadataError ZARR_FORMAT = 2 -def _ensure_str(s): +def ensure_str(s): if PY2: # pragma: py3 no cover # noinspection PyUnresolvedReferences if isinstance(s, buffer): # noqa @@ -27,12 +27,32 @@ def _ensure_str(s): return s +def json_dumps(o): + """Write JSON in a consistent, human-readable way.""" + return json.dumps(o, indent=4, sort_keys=True, ensure_ascii=True, + separators=(',', ': ')) + + +def parse_metadata(s): + if isinstance(s, Mapping): + # assume metadata has already been parsed into a mapping object + meta = s + else: + # assume metadata needs to be parsed as JSON + s = ensure_str(s) + meta = json.loads(s) + return meta + + def decode_array_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) + + # extract array metadata fields try: dtype = decode_dtype(meta['dtype']) fill_value = decode_fill_value(meta['fill_value'], dtype) @@ -67,8 +87,7 @@ def encode_array_metadata(meta): order=meta['order'], filters=meta['filters'], ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True, - separators=(',', ': ')) + s = json_dumps(meta) b = s.encode('ascii') return b @@ -98,14 +117,14 @@ def decode_dtype(d): def decode_group_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format version zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) - meta = dict( - zarr_format=ZARR_FORMAT, - ) + + meta = dict(zarr_format=zarr_format) return meta @@ -115,7 +134,7 @@ def encode_group_metadata(meta=None): meta = dict( zarr_format=ZARR_FORMAT, ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + s = json_dumps(meta) b = s.encode('ascii') return b diff --git a/zarr/storage.py b/zarr/storage.py index 5c8e1f611c..6720b42d12 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -41,7 +41,7 @@ from zarr.compat import PY2, binary_type, OrderedDict_move_to_end from numcodecs.registry import codec_registry from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, - err_fspath_exists_notdir, err_read_only) + err_fspath_exists_notdir, err_read_only, MetadataError) array_meta_key = '.zarray' @@ -1932,12 +1932,22 @@ class ConsolidatedMetadataStore(MutableMapping): """ def __init__(self, store, metadata_key='.zmetadata'): self.store = 
store + + # retrieve consolidated metadata if sys.version_info.major == 3 and sys.version_info.minor < 6: d = store[metadata_key].decode() # pragma: no cover else: # pragma: no cover d = store[metadata_key] - metadata = json.loads(d) - self.meta_store = {k: v.encode() for k, v in metadata.items()} + meta = json.loads(d) + + # check format of consolidated metadata + consolidated_format = meta.get('zarr_consolidated_format', None) + if consolidated_format != 1: + raise MetadataError('unsupported zarr consolidated metadata format: %s' % + consolidated_format) + + # decode metadata + self.meta_store = meta['metadata'] def __getitem__(self, key): return self.meta_store[key] From ccef26c3e86dfc45e310c5df15197e9f02a92819 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 6 Nov 2018 10:10:51 -0500 Subject: [PATCH 089/168] comments [ci skip] --- zarr/convenience.py | 1 - zarr/meta.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 27b0655baa..1bb99c92e4 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1120,7 +1120,6 @@ def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or key.endswith('.zattrs')) -# out = {key: store[key].decode() for key in store if is_zarr_key(key)} out = { 'zarr_consolidated_format': 1, 'metadata': { diff --git a/zarr/meta.py b/zarr/meta.py index bef53c2917..9ce580eff2 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -34,13 +34,21 @@ def json_dumps(o): def parse_metadata(s): + + # Here we allow that a store may return an already-parsed metadata object, + # or a string of JSON that we will parse here. We allow for an already-parsed + # object to accommodate a consolidated metadata store, where all the metadata for + # all groups and arrays will already have been parsed from JSON. 
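+    #
+    # As a rough illustration only (not part of this change), both of the
+    # following forms would be accepted here:
+    #
+    #     parse_metadata(b'{"zarr_format": 2}')  # JSON bytes, parsed below
+    #     parse_metadata({'zarr_format': 2})     # mapping, returned as-is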
+ if isinstance(s, Mapping): # assume metadata has already been parsed into a mapping object meta = s + else: # assume metadata needs to be parsed as JSON s = ensure_str(s) meta = json.loads(s) + return meta From a9cfa56b86414abcc378d5cc93110fe004872bf4 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 8 Nov 2018 11:10:43 +0000 Subject: [PATCH 090/168] rework requirements for pyup.io --- .pyup.yml | 24 +++++++++++++++ appveyor.yml | 32 ++++---------------- requirements.txt | 4 +-- requirements_dev.txt | 57 ----------------------------------- requirements_dev_npy.txt | 4 +++ requirements_dev_optional.txt | 1 + requirements_test.txt | 10 ++++++ tox.ini | 5 ++- 8 files changed, 51 insertions(+), 86 deletions(-) create mode 100644 .pyup.yml create mode 100644 requirements_dev_npy.txt create mode 100644 requirements_test.txt diff --git a/.pyup.yml b/.pyup.yml new file mode 100644 index 0000000000..0c85ee8e03 --- /dev/null +++ b/.pyup.yml @@ -0,0 +1,24 @@ +# pyup.io config file +# see https://pyup.io/docs/configuration/ for all available options + +schedule: every month + +requirements: + - requirements.txt: + pin: False + update: False + - requirements_test.txt: + pin: False + update: False + - requirements_rtfd.txt: + pin: False + update: False + - requirements_dev.txt: + pin: True + update: all + - requirements_dev_npy.txt: + pin: True + update: all + - requirements_dev_optional.txt: + pin: True + update: all diff --git a/appveyor.yml b/appveyor.yml index 987b51c1c4..67058550dc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -12,49 +12,29 @@ environment: matrix: - - PYTHON: "C:\\Python27" - PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.15.2" - - PYTHON: "C:\\Python27-x64" PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.15.2" DISTUTILS_USE_SDK: "1" - - PYTHON: "C:\\Python35" - PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.15.2" - - PYTHON: "C:\\Python35-x64" PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.15.2" - - - PYTHON: "C:\\Python36" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python36-x64" PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.15.2" - - - PYTHON: "C:\\Python37" - PYTHON_VERSION: "3.7" - NUMPY_VERSION: "1.15.2" - PYTHON: "C:\\Python37-x64" PYTHON_VERSION: "3.7" - NUMPY_VERSION: "1.15.2" install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" + - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" + - "%CMD_IN_ENV% python -m pip install -rrequirements_dev_npy.txt" + - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -rrequirements_dev.txt" + - "%CMD_IN_ENV% python setup.py install" + - "%CMD_IN_ENV% python -m pip freeze" build: off test_script: - - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - - "%CMD_IN_ENV% python -m pip install numpy==%NUMPY_VERSION%" - - "%CMD_IN_ENV% python -m pip install cython==0.29" - - "%CMD_IN_ENV% python -m pip install -v --no-binary=numcodecs numcodecs==0.5.5" - - "%CMD_IN_ENV% python -m pip install -rrequirements_dev.txt" - - "%CMD_IN_ENV% python setup.py install" - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" diff --git a/requirements.txt b/requirements.txt index 8720210cf5..ab79134cb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ asciitree -pytest -numpy fasteners numcodecs +numpy +pytest diff --git a/requirements_dev.txt b/requirements_dev.txt index d495e04bfd..2ad18f372c 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,60 +1,3 @@ asciitree==0.3.3 -asn1crypto==0.24.0 -atomicwrites==1.2.1 -attrs==18.2.0 
-bleach==3.0.2 -boto3==1.9.26 -botocore==1.12.26 -certifi==2018.10.15 -cffi==1.11.5 -chardet==3.0.4 -cmarkgfm==0.4.2 -configparser==3.5.0 -coverage==4.5.1 -coveralls==1.5.1 -cryptography==2.3.1 -Cython==0.29 -docopt==0.6.2 -docutils==0.14 -enum34==1.1.6 fasteners==0.14.1 -filelock==3.0.9 -flake8==3.5.0 -funcsigs==1.0.2 -future==0.16.0 -h5py==2.8.0 -idna==2.7 -ipaddress==1.0.22 -jmespath==0.9.3 -mccabe==0.6.1 -monotonic==1.5 -more-itertools==4.3.0 -msgpack-python==0.5.6 numcodecs==0.5.5 -pathlib2==2.3.2 -pkginfo==1.4.2 -pluggy==0.8.0 -py==1.7.0 -pycodestyle==2.3.1 -pycparser==2.19 -pyflakes==1.6.0 -Pygments==2.2.0 -pyOpenSSL==18.0.0 -pytest==3.9.1 -pytest-cov==2.6.0 -python-dateutil==2.7.3 -readme-renderer==22.0 -requests==2.19.1 -requests-toolbelt==0.8.0 -s3fs==0.1.6 -s3transfer==0.1.13 -scandir==1.9.0 -six==1.11.0 -toml==0.10.0 -tox==3.5.2 -tox-travis==0.11 -tqdm==4.27.0 -twine==1.12.1 -urllib3==1.23 -virtualenv==16.0.0 -webencodings==0.5.1 diff --git a/requirements_dev_npy.txt b/requirements_dev_npy.txt new file mode 100644 index 0000000000..78b80223e2 --- /dev/null +++ b/requirements_dev_npy.txt @@ -0,0 +1,4 @@ +# Break this out into a separate file to allow testing against +# different versions of numpy. This file should pin to the latest +# numpy version. +numpy==1.15.4 diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index a4e7c2a6bd..1ea71451d9 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -1,2 +1,3 @@ +# These packages are currently not available on Windows. bsddb3==6.2.6 lmdb==0.94 diff --git a/requirements_test.txt b/requirements_test.txt new file mode 100644 index 0000000000..6aebf7ec63 --- /dev/null +++ b/requirements_test.txt @@ -0,0 +1,10 @@ +coverage +coveralls +flake8 +h5py +msgpack-python +pytest +pytest-cov +s3fs +setuptools-scm +tox diff --git a/tox.ini b/tox.ini index 7435b01f45..e70bf85bfc 100644 --- a/tox.ini +++ b/tox.ini @@ -30,12 +30,15 @@ commands = py37: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst # pep8 checks py37: flake8 --max-line-length=100 zarr + # print environment for debugging + pip freeze deps = py27: backports.lzma py36-npy113: numpy==1.13.3 py36-npy114: numpy==1.14.6 - py27,py35,py36-npy115,py37: numpy==1.15.2 + py27,py35,py36-npy115,py37: -rrequirements_dev_npy.txt -rrequirements_dev.txt + -rrequirements_test.txt # linux only -rrequirements_dev_optional.txt From c8bb0073eb0448f06c1b98d37ebe2aafebf297ac Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 14 Nov 2018 09:15:28 +0300 Subject: [PATCH 091/168] include cython in test requirements --- requirements_test.txt | 1 + tox.ini | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements_test.txt b/requirements_test.txt index 6aebf7ec63..a668f130cc 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,5 +1,6 @@ coverage coveralls +cython flake8 h5py msgpack-python diff --git a/tox.ini b/tox.ini index e70bf85bfc..d443dec5df 100644 --- a/tox.ini +++ b/tox.ini @@ -37,8 +37,8 @@ deps = py36-npy113: numpy==1.13.3 py36-npy114: numpy==1.14.6 py27,py35,py36-npy115,py37: -rrequirements_dev_npy.txt - -rrequirements_dev.txt -rrequirements_test.txt + -rrequirements_dev.txt # linux only -rrequirements_dev_optional.txt From f304350bf3c56a6e70f12ae16e4b34a2aafb5e5c Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 14 Nov 2018 19:19:39 +0300 Subject: [PATCH 092/168] fix coverage --- zarr/tests/test_storage.py | 18 ++++++++++++++++-- 1 file changed, 16 
insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 79e6adaeac..b4eb671724 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -19,12 +19,13 @@ DirectoryStore, ZipStore, init_group, group_meta_key, getsize, migrate_1to2, TempStore, atexit_rmtree, NestedDirectoryStore, default_compressor, DBMStore, - LMDBStore, atexit_rmglob, LRUStoreCache) + LMDBStore, atexit_rmglob, LRUStoreCache, + ConsolidatedMetadataStore) from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT, decode_group_metadata, encode_group_metadata) from zarr.compat import PY2 from zarr.codecs import Zlib, Blosc, BZ2 -from zarr.errors import PermissionError +from zarr.errors import PermissionError, MetadataError from zarr.hierarchy import group from zarr.tests.util import CountingDict @@ -1251,3 +1252,16 @@ def test_format_compatibility(): else: assert compressor.codec_id == z.compressor.codec_id assert compressor.get_config() == z.compressor.get_config() + + +class TestConsolidatedMetadataStore(unittest.TestCase): + + def test_bad_format(self): + store = dict() + metadata = json.dumps({ + # bad format version + 'zarr_consolidated_format': 0, + }) + store['.zmetadata'] = metadata + with pytest.raises(MetadataError): + ConsolidatedMetadataStore(store) From 892470fc00cba504b271bd93d102ff9cdf19258d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 14 Nov 2018 19:29:51 +0300 Subject: [PATCH 093/168] add basic tests for reading and writing to ConsolidatedMetadataStore --- zarr/tests/test_storage.py | 39 +++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index b4eb671724..7636a7078c 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1257,11 +1257,44 @@ class TestConsolidatedMetadataStore(unittest.TestCase): def test_bad_format(self): + + # setup store with consolidated metadata store = dict() - metadata = json.dumps({ + consolidated = { # bad format version 'zarr_consolidated_format': 0, - }) - store['.zmetadata'] = metadata + } + store['.zmetadata'] = json.dumps(consolidated) + + # check appropriate error is raised with pytest.raises(MetadataError): ConsolidatedMetadataStore(store) + + def test_read_write(self): + + # setup store with consolidated metadata + store = dict() + consolidated = { + 'zarr_consolidated_format': 1, + 'metadata': { + 'foo': 'bar', + 'baz': 42, + } + } + store['.zmetadata'] = json.dumps(consolidated) + + # create consolidated store + cs = ConsolidatedMetadataStore(store) + + # test __contains__, __getitem__ + for key, value in consolidated['metadata'].items(): + assert key in cs + assert value == cs[key] + + # test __delitem__, __setitem__ + with pytest.raises(PermissionError): + del cs['foo'] + with pytest.raises(PermissionError): + cs['bar'] = 0 + with pytest.raises(PermissionError): + cs['spam'] = 'eggs' From 081dcd2e258026ee76a8c1ea14a5e7e97d38c6eb Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 14 Nov 2018 23:38:30 +0300 Subject: [PATCH 094/168] fix consolidated tests --- zarr/tests/test_storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 7636a7078c..33c65f36c9 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1264,7 +1264,7 @@ def test_bad_format(self): # bad format version 'zarr_consolidated_format': 0, } -
store['.zmetadata'] = json.dumps(consolidated) + store['.zmetadata'] = json.dumps(consolidated).encode() # check appropriate error is raised with pytest.raises(MetadataError): @@ -1281,7 +1281,7 @@ def test_read_write(self): 'baz': 42, } } - store['.zmetadata'] = json.dumps(consolidated) + store['.zmetadata'] = json.dumps(consolidated).encode() # create consolidated store cs = ConsolidatedMetadataStore(store) From 4ac24fcf949f651393b1cb8a4581f632d203a15b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 16 Nov 2018 11:35:36 -0500 Subject: [PATCH 095/168] Configure flake8's line length as 100 This allows users to run `flake8 zarr` and pick up the desired configuration by default. --- tox.ini | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index d443dec5df..29303876a6 100644 --- a/tox.ini +++ b/tox.ini @@ -29,7 +29,7 @@ commands = # run doctests in the tutorial and spec py37: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst # pep8 checks - py37: flake8 --max-line-length=100 zarr + py37: flake8 zarr # print environment for debugging pip freeze deps = @@ -49,3 +49,6 @@ deps = -rrequirements_rtfd.txt commands = sphinx-build -W -b html -d {envtmpdir}/doctrees . {envtmpdir}/html + +[flake8] +max-line-length = 100 From a0f34d331fe8c46c25cd090c6f054b11b140392c Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 19 Nov 2018 11:30:27 -0500 Subject: [PATCH 096/168] pass kwargs to consolidate --- zarr/convenience.py | 7 +++++-- zarr/tests/test_convenience.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 1bb99c92e4..c6e5432664 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1131,7 +1131,7 @@ def is_zarr_key(key): return open_consolidated(store, metadata_key=metadata_key) -def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): +def open_consolidated(store, metadata_key='.zmetadata', mode='r+', **kwargs): """Open group using metadata previously consolidated into a single key. This is an optimised method for opening a Zarr group, where instead of @@ -1158,6 +1158,9 @@ def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): read/write (must exist) although only writes to data are allowed, changes to metadata including creation of new arrays or group are not allowed. + **kwargs + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. 
Returns ------- @@ -1182,4 +1185,4 @@ def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key) # pass through - return open(store=meta_store, chunk_store=store, mode=mode) + return open(store=meta_store, chunk_store=store, mode=mode, **kwargs) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 12bfab4a5a..f64d27ed16 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -163,6 +163,9 @@ def test_consolidate_metadata(): with pytest.raises(ValueError): open_consolidated(store, mode='w') + # make sure keyword arguments are passed through without error + open_consolidated(store, cache_attrs=True, synchronizer=None) + class TestCopyStore(unittest.TestCase): From 6cb8e851f35a750df136fdcbc43439f77ded31f2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Wed, 21 Nov 2018 13:07:21 -0500 Subject: [PATCH 097/168] Mark decoding as not covered by Python 2 tests --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index b4da45cd99..97d1bdc0f8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1924,7 +1924,7 @@ def hexdigest(self, hashname="sha1"): checksum = binascii.hexlify(self.digest(hashname=hashname)) # This is a bytes object on Python 3 and we want a str. - if type(checksum) is not str: + if type(checksum) is not str: # pragma: py2 no cover checksum = checksum.decode('utf8') return checksum From 08fe155078d8956987bb2293cad5212792ee2538 Mon Sep 17 00:00:00 2001 From: shikhar Date: Tue, 27 Nov 2018 13:10:13 +0530 Subject: [PATCH 098/168] added azure-storage-blob --- requirements_test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_test.txt b/requirements_test.txt index a668f130cc..1492a673c1 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -9,3 +9,4 @@ pytest-cov s3fs setuptools-scm tox +azure-storage-blob From 957b405aa780697c6758506a395fcc6069048270 Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 13:44:12 +0530 Subject: [PATCH 099/168] first attempt at docker build with azurite --- .travis.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.travis.yml b/.travis.yml index 8a5e1fe521..72494e0080 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,9 @@ addons: packages: - libdb-dev +services: + - docker + matrix: include: - python: 2.7 @@ -20,6 +23,11 @@ matrix: dist: xenial sudo: true +before_install: + - docker pull arafato/azurite + - mkdir ~/blob_emulator + - docker run -e executable=blob -d -t -p 10000:10000 -v ~/blob_emulator:/opt/azurite/folder arafato/azurite + install: - pip install -U pip setuptools wheel tox-travis coveralls From 9c128dbab7d7ff619f81d43e2b287749da0c4272 Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 14:48:35 +0530 Subject: [PATCH 100/168] azure storage emulator in appveyor --- appveyor.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 67058550dc..c5326ec5b2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,6 +26,13 @@ environment: PYTHON_VERSION: "3.7" install: + - ps: | + $msiPath = "$($env:USERPROFILE)\MicrosoftAzureStorageEmulator.msi" + (New-Object Net.WebClient).DownloadFile('https://download.microsoft.com/download/B/4/A/B4A8422F-C564-4393-80DA-6865A8C4B32D/MicrosoftAzureStorageEmulator.msi', $msiPath) + cmd /c start /wait msiexec /i $msiPath /quiet + - cmd: | + "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage 
Emulator\AzureStorageEmulator.exe" start + "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" @@ -38,3 +45,7 @@ build: off test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" + +on_finish: + - cmd: | + "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" stop From 2da245306e27c6256834063f64bb2d464dde9fb1 Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 14:50:48 +0530 Subject: [PATCH 101/168] syntax correction --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index c5326ec5b2..482736e6f0 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,7 +30,7 @@ install: $msiPath = "$($env:USERPROFILE)\MicrosoftAzureStorageEmulator.msi" (New-Object Net.WebClient).DownloadFile('https://download.microsoft.com/download/B/4/A/B4A8422F-C564-4393-80DA-6865A8C4B32D/MicrosoftAzureStorageEmulator.msi', $msiPath) cmd /c start /wait msiexec /i $msiPath /quiet - - cmd: | + - cmd: | "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" From a09e2c9b8ae5e53efdeab60490c30fbf50c805be Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 15:33:35 +0530 Subject: [PATCH 102/168] checking if emulator is preinstalled --- appveyor.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 482736e6f0..7c15c0756e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,10 +26,6 @@ environment: PYTHON_VERSION: "3.7" install: - - ps: | - $msiPath = "$($env:USERPROFILE)\MicrosoftAzureStorageEmulator.msi" - (New-Object Net.WebClient).DownloadFile('https://download.microsoft.com/download/B/4/A/B4A8422F-C564-4393-80DA-6865A8C4B32D/MicrosoftAzureStorageEmulator.msi', $msiPath) - cmd /c start /wait msiexec /i $msiPath /quiet - cmd: | "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status From 5ce6a4c46f407315de1a48441471bf220fde0273 Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 15:36:52 +0530 Subject: [PATCH 103/168] syntax fix --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 7c15c0756e..38ee256444 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,7 +26,7 @@ environment: PYTHON_VERSION: "3.7" install: - - cmd: | + - cmd: "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" From bf8aa371a99b4db5622264b904d98a6132106deb Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 15:38:31 +0530 Subject: [PATCH 104/168] syntax fix --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 38ee256444..4977cce4ae 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -27,8 +27,8 @@ environment: install: - cmd: - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start - "C:\Program Files (x86)\Microsoft 
SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status + - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start + - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" From 730255c5edeccad1a694d10e859b1fc021b79210 Mon Sep 17 00:00:00 2001 From: shikhar Date: Fri, 30 Nov 2018 15:50:22 +0530 Subject: [PATCH 105/168] syntax fix --- appveyor.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 4977cce4ae..e33b697357 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,9 +26,7 @@ environment: PYTHON_VERSION: "3.7" install: - - cmd: - - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start - - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" status + - cmd: "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" From 7eed366397a4c8bf98f8bb4adac34fa022ac4532 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 30 Nov 2018 12:18:30 -0500 Subject: [PATCH 106/168] Bump Numcodecs requirement to 0.6.1 --- requirements_dev.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 2ad18f372c..d39ba9e9b8 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,3 @@ asciitree==0.3.3 fasteners==0.14.1 -numcodecs==0.5.5 +numcodecs==0.6.1 diff --git a/setup.py b/setup.py index a5e8334e43..903af3bc04 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'asciitree', 'numpy>=1.7', 'fasteners', - 'numcodecs>=0.5.3', + 'numcodecs>=0.6.1', ], package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], From 2552f620191cafa72429566f4a8ce4f49b4db4d3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 30 Nov 2018 13:06:29 -0500 Subject: [PATCH 107/168] Assert MsgPack round-trips bytes objects correctly Previously MsgPack was turning bytes objects to unicode objects when round-tripping them. However this has been fixed in the latest version of Numcodecs. So correct this test now that MsgPack is working correctly. 
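As a rough sketch of the behaviour the updated test relies on (the array
size and values here are arbitrary, for illustration only):

    import zarr
    from numcodecs import MsgPack

    z = zarr.empty(4, dtype=object, object_codec=MsgPack())
    z[0] = 'foo'
    z[1] = b'bar'
    assert z[0] == 'foo'
    assert z[1] == b'bar'  # bytes now survive the round-trip as bytes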
--- zarr/tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 11891f8fe9..544ec95c41 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -982,7 +982,7 @@ def test_object_arrays(self): z[0] = 'foo' assert z[0] == 'foo' z[1] = b'bar' - assert z[1] == 'bar' # msgpack gets this wrong + assert z[1] == b'bar' z[2] = 1 assert z[2] == 1 z[3] = [2, 4, 6, 'baz'] From aee5aceced5e5a3f2698f2363540f064c200f4a9 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 1 Dec 2018 14:09:40 +0000 Subject: [PATCH 108/168] properly guard against removal of object codec --- zarr/core.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index b4da45cd99..bcae03cb9f 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,6 +8,7 @@ import numpy as np +from numcodecs.compat import ensure_contiguous_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1743,18 +1744,25 @@ def _decode_chunk(self, cdata): for f in self._filters[::-1]: chunk = f.decode(chunk) - # view as correct dtype + # view as numpy array with correct dtype if self._dtype == object: - if isinstance(chunk, np.ndarray): - chunk = chunk.astype(self._dtype) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if isinstance(chunk, np.ndarray) and chunk.dtype == object: + # chunk is already of correct dtype, good to carry on + # flatten just to be sure we can reshape later + chunk = chunk.reshape(-1, order='A') else: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. 
raise RuntimeError('cannot read object array without object codec') - elif isinstance(chunk, np.ndarray): - chunk = chunk.view(self._dtype) else: - chunk = np.frombuffer(chunk, dtype=self._dtype) + chunk = ensure_contiguous_ndarray(chunk).view(self._dtype) - # reshape + # ensure correct chunk shape chunk = chunk.reshape(self._chunks, order=self._order) return chunk From bf4eee8cc763b1917e299fcfde04a5e5d9a0938b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 13:21:24 -0500 Subject: [PATCH 109/168] Ensure `chunk` in `_decode_chunk` is an `ndarray` --- zarr/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index bcae03cb9f..94bd94edde 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_contiguous_ndarray +from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1745,10 +1745,11 @@ def _decode_chunk(self, cdata): chunk = f.decode(chunk) # view as numpy array with correct dtype + chunk = ensure_ndarray(chunk) if self._dtype == object: # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening - if isinstance(chunk, np.ndarray) and chunk.dtype == object: + if chunk.dtype == object: # chunk is already of correct dtype, good to carry on # flatten just to be sure we can reshape later chunk = chunk.reshape(-1, order='A') From b741fe12a0099cdcc0697a80b3ace31c82738cce Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:44 -0500 Subject: [PATCH 110/168] Reshape `chunk` ourselves since it is an `ndarray` As we already ensured the `chunk` is an `ndarray` viewing the original data, there is no need for us to do that here as well. Plus the checks performed by `ensure_contiguous_ndarray` are not needed for our use case here. Particularly as we have already handled the unusual type cases above. We also don't need to constrain the buffer size. As such the only thing we really need is to flatten the array and make it contiguous, which is what we handle here directly. --- zarr/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 94bd94edde..b5d0185faf 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,7 +8,7 @@ import numpy as np -from numcodecs.compat import ensure_ndarray, ensure_contiguous_ndarray +from numcodecs.compat import ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -1761,7 +1761,7 @@ def _decode_chunk(self, cdata): # array during decoding. raise RuntimeError('cannot read object array without object codec') else: - chunk = ensure_contiguous_ndarray(chunk).view(self._dtype) + chunk = chunk.reshape(-1, order='A').view(self._dtype) # ensure correct chunk shape chunk = chunk.reshape(self._chunks, order=self._order) From f3144ae6b4fdc929eb1390b1ed87ee5a35e6862f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:50 -0500 Subject: [PATCH 111/168] Refactor `reshape` from `_decode_chunk` As both the expected `object` case and the non-`object` case perform a `reshape` to flatten the data, go ahead and refactor that out of both cases and handle it generally. Simplifies the code a bit. 
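For reference, a minimal numpy-only sketch of the flatten-then-reshape
pattern both branches now share (the dtype and shapes are arbitrary):

    import numpy as np

    chunk = np.arange(12, dtype='f8')
    flat = chunk.reshape(-1, order='A')    # flatten; a view for contiguous data
    out = flat.reshape((3, 4), order='C')  # then apply the target chunk shape
    assert np.shares_memory(chunk, out)    # no copy was made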
--- zarr/core.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index b5d0185faf..ab5de14512 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1749,11 +1749,7 @@ def _decode_chunk(self, cdata): if self._dtype == object: # special case object dtype, because incorrect handling can lead to # segfaults and other bad things happening - if chunk.dtype == object: - # chunk is already of correct dtype, good to carry on - # flatten just to be sure we can reshape later - chunk = chunk.reshape(-1, order='A') - else: + if chunk.dtype != object: # If we end up here, someone must have hacked around with the filters. # We cannot deal with object arrays unless there is an object # codec in the filter chain, i.e., a filter that converts from object @@ -1761,9 +1757,10 @@ def _decode_chunk(self, cdata): # array during decoding. raise RuntimeError('cannot read object array without object codec') else: - chunk = chunk.reshape(-1, order='A').view(self._dtype) + chunk = chunk.view(self._dtype) # ensure correct chunk shape + chunk = chunk.reshape(-1, order='A') chunk = chunk.reshape(self._chunks, order=self._order) return chunk From 3e3920af230e059e84f70563c4f215d60f845aed Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sat, 1 Dec 2018 15:47:53 -0500 Subject: [PATCH 112/168] Consolidate type checks in `_decode_chunk` As refactoring of the `reshape` step has effectively dropped the expected `object` type case, the checks for different types is a little more complicated than needed. To fix this, basically invert and swap the case ordering. This way we can handle all generally expected types first and simply cast them. Then we can raise if an `object` type shows up and is unexpected. --- zarr/core.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index ab5de14512..a2a07a29ba 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1746,18 +1746,17 @@ def _decode_chunk(self, cdata): # view as numpy array with correct dtype chunk = ensure_ndarray(chunk) - if self._dtype == object: - # special case object dtype, because incorrect handling can lead to - # segfaults and other bad things happening - if chunk.dtype != object: - # If we end up here, someone must have hacked around with the filters. - # We cannot deal with object arrays unless there is an object - # codec in the filter chain, i.e., a filter that converts from object - # array to something else during encoding, and converts back to object - # array during decoding. - raise RuntimeError('cannot read object array without object codec') - else: + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if self._dtype != object: chunk = chunk.view(self._dtype) + elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. 
+ raise RuntimeError('cannot read object array without object codec') # ensure correct chunk shape chunk = chunk.reshape(-1, order='A') From 9badf39ff815438244f131197457f815a51d56a6 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 18:56:44 -0500 Subject: [PATCH 113/168] Drop `ensure_bytes` definition from `zarr.storage` As Numcodecs now includes a very versatile and effective `ensure_bytes` function, there is no need to define our own in `zarr.storage` as well. So go ahead and drop it. --- zarr/storage.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 6720b42d12..a490f5e8fe 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -38,8 +38,9 @@ normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata -from zarr.compat import PY2, binary_type, OrderedDict_move_to_end +from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry +from numcodecs.compat import ensure_bytes from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -444,23 +445,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): store[key] = encode_group_metadata(meta) -def ensure_bytes(s): - if isinstance(s, binary_type): - return s - if isinstance(s, np.ndarray): - if PY2: # pragma: py3 no cover - # noinspection PyArgumentList - return s.tostring(order='A') - else: # pragma: py2 no cover - # noinspection PyArgumentList - return s.tobytes(order='A') - if hasattr(s, 'tobytes'): - return s.tobytes() - if PY2 and hasattr(s, 'tostring'): # pragma: py3 no cover - return s.tostring() - return memoryview(s).tobytes() - - def _dict_store_keys(d, prefix='', cls=dict): for k in d.keys(): v = d[k] From 7bd8a2a1a3608def7335f1ea1a1115da5b1b1cec Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 18:56:45 -0500 Subject: [PATCH 114/168] Take flattened array views to avoid some copies Make use of Numcodecs' `ensure_contiguous_ndarray` to take `ndarray` views onto buffers to be stored in a few cases so as to reshape them and avoid a copy (thanks to the buffer protocol). This ensures that datetime/timedeltas are handled by default. Also catches things like object arrays. Finally this handles flattening the array if needed. All-in-all this gets as close to a `bytes` object as possible while not copying and doing its best to preserve type information while constructing something that fits the buffer protocol. 
--- zarr/storage.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index a490f5e8fe..24bd4506d0 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -31,16 +31,13 @@ import warnings -import numpy as np - - from zarr.util import (normalize_shape, normalize_chunks, normalize_order, normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata from zarr.compat import PY2, OrderedDict_move_to_end from numcodecs.registry import codec_registry -from numcodecs.compat import ensure_bytes +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, err_fspath_exists_notdir, err_read_only, MetadataError) @@ -725,9 +722,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): - # handle F-contiguous numpy arrays - if isinstance(value, np.ndarray) and value.flags.f_contiguous: - value = ensure_bytes(value) + # coerce to flat, contiguous array (ideally without copying) + value = ensure_contiguous_ndarray(value) # destination path for key file_path = os.path.join(self.path, key) @@ -1176,7 +1172,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if self.mode == 'r': err_read_only() - value = ensure_bytes(value) + value = ensure_contiguous_ndarray(value) with self.mutex: self.zf.writestr(key, value) From 2c6ac778d6e07364e49c712f967cd2ab325011a5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 18:56:45 -0500 Subject: [PATCH 115/168] Simplify `buffer_size` by using `ensure_ndarray` Rewrite `buffer_size` to just use Numcodecs' `ensure_ndarray` to get an `ndarray` that views the data. Once the `ndarray` is gotten, all that is needed is to access its `nbytes` member, which returns the number of bytes that it takes up. --- zarr/util.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/zarr/util.py b/zarr/util.py index b79865bfe8..ad882c41d5 100644 --- a/zarr/util.py +++ b/zarr/util.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division -import operator from textwrap import TextWrapper, dedent import numbers import uuid @@ -10,10 +9,11 @@ from asciitree import BoxStyle, LeftAligned from asciitree.traversal import Traversal import numpy as np +from numcodecs.compat import ensure_ndarray from numcodecs.registry import codec_registry -from zarr.compat import PY2, reduce, text_type, binary_type +from zarr.compat import PY2, text_type, binary_type # codecs to use for object dtype convenience API @@ -314,17 +314,7 @@ def normalize_storage_path(path): def buffer_size(v): - from array import array as _stdlib_array - if PY2 and isinstance(v, _stdlib_array): # pragma: py3 no cover - # special case array.array because does not support buffer - # interface in PY2 - return v.buffer_info()[1] * v.itemsize - else: # pragma: py2 no cover - v = memoryview(v) - if v.shape: - return reduce(operator.mul, v.shape) * v.itemsize - else: - return v.itemsize + return ensure_ndarray(v).nbytes def info_text_report(items): From 398820f9137c9b2e4690f8cf7653436186af6fa9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 18:56:46 -0500 Subject: [PATCH 116/168] Simplify `ensure_str` in `zarr.meta` If the data is already a `str` instance, turn `ensure_str` into a no-op. 
For all other cases, make use of Numcodecs' `ensure_bytes` to aid `ensure_str` in coercing data through the buffer protocol. If we are on Python 3, then decode the `bytes` object to a `str`. --- zarr/meta.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/zarr/meta.py b/zarr/meta.py index 9ce580eff2..7984efb701 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -5,9 +5,10 @@ import numpy as np +from numcodecs.compat import ensure_bytes -from zarr.compat import PY2, binary_type, Mapping +from zarr.compat import PY2, Mapping from zarr.errors import MetadataError @@ -15,14 +16,9 @@ def ensure_str(s): - if PY2: # pragma: py3 no cover - # noinspection PyUnresolvedReferences - if isinstance(s, buffer): # noqa - s = str(s) - else: # pragma: py2 no cover - if isinstance(s, memoryview): - s = s.tobytes() - if isinstance(s, binary_type): + if not isinstance(s, str): + s = ensure_bytes(s) + if not PY2: # pragma: py2 no cover s = s.decode('ascii') return s From bc4d57907a0406cea92f3fa18c4c0daad54e44fc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 18:59:20 -0500 Subject: [PATCH 117/168] Bump to Numcodecs 0.6.2 --- requirements_dev.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index d39ba9e9b8..03eaa8e871 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,3 @@ asciitree==0.3.3 fasteners==0.14.1 -numcodecs==0.6.1 +numcodecs==0.6.2 diff --git a/setup.py b/setup.py index 903af3bc04..b6d237fe0a 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'asciitree', 'numpy>=1.7', 'fasteners', - 'numcodecs>=0.6.1', + 'numcodecs>=0.6.2', ], package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], From efacb5234573fae72661d50e15180db414fe6ff8 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 19:11:06 -0500 Subject: [PATCH 118/168] Update tutorial's info content As Blosc got upgraded and it contained an upgrade of Zstd, the results changed a little bit for this example. So update them accordingly. Should fix the doctest failure. --- docs/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 606b5acef5..29ce8b0935 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -178,8 +178,8 @@ print some diagnostics, e.g.:: : blocksize=0) Store type : builtins.dict No. bytes : 400000000 (381.5M) - No. bytes stored : 3242241 (3.1M) - Storage ratio : 123.4 + No. bytes stored : 3379344 (3.2M) + Storage ratio : 118.4 Chunks initialized : 100/100 If you don't specify a compressor, by default Zarr uses the Blosc From 3299c42abbf8a34d1cf2a7fc104ffd956675c8db Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 3 Dec 2018 23:28:48 -0500 Subject: [PATCH 119/168] Fix missing backslash in docs [ci skip] --- docs/spec/v2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 67941b7902..fa47e38e8d 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -253,7 +253,7 @@ keys like "foo/bar/0.0", "foo/bar/0.1", etc. 
To ensure consistent behaviour across different storage systems, logical paths MUST be normalized as follows: -* Replace all backward slash characters ("\\") with forward slash characters +* Replace all backward slash characters ("\\\\") with forward slash characters ("/") * Strip any leading "/" characters * Strip any trailing "/" characters From 15d0e126d845bcea3807b53d37316290dad3c174 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 4 Dec 2018 09:58:09 -0500 Subject: [PATCH 120/168] Skip coverage of temp file cleanup In the ideal case, this cleanup step never happens on CI as the file got moved into place and so no longer exists at the old location. Given this, we ignore coverage on this line. --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 6720b42d12..65afc58539 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -778,7 +778,7 @@ def __setitem__(self, key, value): finally: # clean up if temp file still exists for whatever reason - if temp_path is not None and os.path.exists(temp_path): + if temp_path is not None and os.path.exists(temp_path): # pragma: no cover os.remove(temp_path) def __delitem__(self, key): From cc1d7762e64a78cd0e8941430479de4a18d70382 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 4 Dec 2018 22:24:43 +0000 Subject: [PATCH 121/168] release notes [ci skip] --- docs/release.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index 96ac7c8f2f..335ec58fb3 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -22,6 +22,11 @@ Enhancements Maintenance ~~~~~~~~~~~ +* The required version of the `numcodecs `_ package has been upgraded + to 0.6.2, which has enabled some code simplification and fixes a failing test involving + msgpack encoding. By :user:`John Kirkham `, :issue:`352`, :issue:`355`, + :issue:`324`. + * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and upgrade all pinned package requirements. :issue:`308`. From 94f7a8d08de6f43b05fd79725c67845b9df91ec8 Mon Sep 17 00:00:00 2001 From: sbalmer Date: Wed, 5 Dec 2018 00:00:13 +0100 Subject: [PATCH 122/168] avoid race condition during chunk write (#327) * avoid race condition during chunk write When the chunk file is first removed before the new version is moved into place, racing reads may encounter a missing chunk. Using rename() or replace() without remove() avoids the issue on Posix-Systems as the methods are atomic. The fallback of remove() -> rename() is included for Windows pre Python 3.3. Fixes #263 * move feature-detection to init-time so it's not repeated on every write * use pyosreplace to get atomic replace() on Windows * disable coverage count for legacy branches * Use conditional instead of env marker Because the env markers didn't work. Just guessing at this point. * add pyosreplace package to requirements * release notes [ci skip] --- docs/release.rst | 20 +++++++++++++------- requirements.txt | 1 + requirements_dev.txt | 1 + setup.py | 19 ++++++++++++------- zarr/storage.py | 13 ++++++++++--- 5 files changed, 37 insertions(+), 17 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 335ec58fb3..45eb9c8a49 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -19,22 +19,28 @@ Enhancements * Support has been added for structured arrays with sub-array shape and/or nested fields. By :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. 
-Maintenance -~~~~~~~~~~~ +Bug fixes +~~~~~~~~~ + +* The implementation of the :class:`zarr.storage.DirectoryStore` class has been modified to + ensure that writes are atomic and there are no race conditions where a chunk might appear + transiently missing during a write operation. By :user:`sbalmer `, :issue:`327`, + :issue:`263`. * The required version of the `numcodecs `_ package has been upgraded to 0.6.2, which has enabled some code simplification and fixes a failing test involving msgpack encoding. By :user:`John Kirkham `, :issue:`352`, :issue:`355`, :issue:`324`. -* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and - upgrade all pinned package requirements. :issue:`308`. - * Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, :issue:`273`, :issue:`308`. -Acknowledgments -~~~~~~~~~~~~~~~ +Maintenance +~~~~~~~~~~~ + +* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and + upgrade all pinned package requirements. :issue:`308`. + .. _release_2.2.0: diff --git a/requirements.txt b/requirements.txt index ab79134cb7..e035a2fc72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ fasteners numcodecs numpy pytest +pyosreplace; python_version < '3.3' and sys.platform == 'win32' diff --git a/requirements_dev.txt b/requirements_dev.txt index 03eaa8e871..957edac387 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,3 +1,4 @@ asciitree==0.3.3 fasteners==0.14.1 numcodecs==0.6.2 +pyosreplace==0.1; python_version < '3.3' and sys.platform == 'win32' diff --git a/setup.py b/setup.py index b6d237fe0a..69f434756f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division from setuptools import setup +import sys DESCRIPTION = 'An implementation of chunked, compressed, ' \ @@ -9,6 +10,16 @@ with open('README.rst') as f: LONG_DESCRIPTION = f.read() +dependencies = [ + 'asciitree', + 'numpy>=1.7', + 'fasteners', + 'numcodecs>=0.6.2', +] + +if sys.version_info < (3, 3) and sys.platform == "win32": + dependencies.append('pyosreplace') + setup( name='zarr', description=DESCRIPTION, @@ -22,12 +33,7 @@ 'setuptools>18.0', 'setuptools-scm>1.5.4' ], - install_requires=[ - 'asciitree', - 'numpy>=1.7', - 'fasteners', - 'numcodecs>=0.6.2', - ], + install_requires=dependencies, package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], classifiers=[ @@ -42,7 +48,6 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', diff --git a/zarr/storage.py b/zarr/storage.py index 82ba1d308a..d173acf735 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -53,6 +53,15 @@ from zarr.codecs import Zlib default_compressor = Zlib() +# Find which function to use for atomic replace +if sys.version_info >= (3, 3): + from os import replace +elif sys.platform == "win32": # pragma: no cover + from osreplace import replace +else: # pragma: no cover + # POSIX rename() is always atomic + from os import rename as replace + def _path_to_prefix(path): # assume path already normalized @@ -752,9 +761,7 @@ def __setitem__(self, key, value): f.write(value) # move temporary file into place - if os.path.exists(file_path): - os.remove(file_path) - os.rename(temp_path, file_path) + replace(temp_path, 
file_path) finally: # clean up if temp file still exists for whatever reason From 8ebb16cc809df531fbead79c5a743d2fd767f149 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 7 Dec 2018 04:20:45 -0500 Subject: [PATCH 123/168] Ensure `DictStore` contains only `bytes` (#350) * Ensure `DictStore` gets `bytes` coercible data As the spec requires that the data in a store be a sequence of `bytes`, make sure that non-`DictStore` input meets this requirement when setting values. This effectively ensures that other `DictStore` meet this requirement as well. So we don't need to go through and check their values too. * Drop undefined buffer size case from `DictStore` As everything in `DictStore` must either be another `DictStore` or `bytes`, there shouldn't be any cases where the size is undefined nor cases that this exception should need handling. Given this go ahead and drop the special casing for unknown sizes in `DictStore`. * Drop `test_getsize_ext` While this test case does test a useful subset of the `getsize` API, the contents being added to the store here are non-conforming to our expectations of store contents. Namely the store should only contain values that are an "arbitrary sequence of bytes", which this test case is not. * Test `getsize` with an unknown size case This creates a non-conforming store to make sure that `getsize` handles its contents in the expected way. Namely that it returns `-1`. * Note `DictStore` only contains `bytes` now [ci skip] * Check for `TypeError` for non-buffer objects Add a test to ensure that a non-buffer supporting object when stored into a valid store, will raise a `TypeError` instead of storing it. Disable this checking for generic `MappingStore`s (e.g. `dict`) as they do not perform this sort of checking on the data they accept as values. * Check that `DictStore` coerces all data to `bytes` Provide a simple test for `DictStore` to ensure that non-`bytes` is coerced to `bytes` before storing it and is retrieved as `bytes`. * Disallow mutation of the internal `DictStore` Make sure that users are only able to add data to the `DictStore`. Disallow the storing of a nested `DictStore` though. --- docs/release.rst | 3 +++ zarr/storage.py | 11 +++-------- zarr/tests/test_storage.py | 27 +++++++++++++++++++-------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 45eb9c8a49..0c1440b9ba 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -35,6 +35,9 @@ Bug fixes * Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, :issue:`273`, :issue:`308`. +* Ensure ``DictStore`` contains only ``bytes`` to facilitate comparisons and protect against writes. 
+ By :user:`John Kirkham `, :issue:`350` + Maintenance ~~~~~~~~~~~ diff --git a/zarr/storage.py b/zarr/storage.py index d173acf735..e7d70ea7bc 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -544,6 +544,7 @@ def __getitem__(self, item): def __setitem__(self, item, value): with self.write_mutex: parent, key = self._require_parent(item) + value = ensure_bytes(value) parent[key] = value def __delitem__(self, item): @@ -642,17 +643,11 @@ def getsize(self, path=None): size = 0 for v in value.values(): if not isinstance(v, self.cls): - try: - size += buffer_size(v) - except TypeError: - return -1 + size += buffer_size(v) return size else: - try: - return buffer_size(value) - except TypeError: - return -1 + return buffer_size(value) def clear(self): with self.write_mutex: diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 33c65f36c9..0416470dd7 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -63,6 +63,12 @@ def test_get_set_del_contains(self): # noinspection PyStatementEffect del store['foo'] + def test_set_invalid_content(self): + store = self.create_store() + + with pytest.raises(TypeError): + store['baz'] = list(range(5)) + def test_clear(self): store = self.create_store() store['foo'] = b'bar' @@ -586,6 +592,10 @@ class TestMappingStore(StoreTests, unittest.TestCase): def create_store(self): return dict() + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + def setdel_hierarchy_checks(store): # these tests are for stores that are aware of hierarchy levels; this @@ -629,17 +639,14 @@ class TestDictStore(StoreTests, unittest.TestCase): def create_store(self): return DictStore() - def test_setdel(self): + def test_store_contains_bytes(self): store = self.create_store() - setdel_hierarchy_checks(store) + store['foo'] = np.array([97, 98, 99, 100, 101], dtype=np.uint8) + assert store['foo'] == b'abcde' - def test_getsize_ext(self): + def test_setdel(self): store = self.create_store() - store['a'] = list(range(10)) - store['b/c'] = list(range(10)) - assert -1 == store.getsize() - assert -1 == store.getsize('a') - assert -1 == store.getsize('b') + setdel_hierarchy_checks(store) class TestDirectoryStore(StoreTests, unittest.TestCase): @@ -1096,6 +1103,10 @@ def test_getsize(): assert 7 == getsize(store) assert 5 == getsize(store, 'baz') + store = dict() + store['boo'] = None + assert -1 == getsize(store) + def test_migrate_1to2(): from zarr import meta_v1 From 8e18c81703fc9fb4b7344033a59250d85d711096 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 10 Dec 2018 03:29:59 -0500 Subject: [PATCH 124/168] Cast datetime and timedelta to signed 64-bit int (#344) * Cast datetime and timedelta to signed 64-bit int The `NaT` type is represented as `-0`. As a result, casting to an unsigned integral fails and throws an error. However casting to a signed integral type does not have this problem and proceeds without issues. * Update datetime/timedelta test to use signed ints * Test encode/decode of a datetime/timedelta array Use a structured array with datetime and timedelta values and a fill value of NaT to test a bunch of different workarounds for encoding and decoding datetime and timedelta values and array. * Note improved `NaT` handling in the changelog * Use `dtype` to cast `NaT` explicitly * test NaT as fill_value for zarr.full() * Drop extra whitespace Should fix the flake8 error seen on CI. 
ref: https://travis-ci.org/zarr-developers/zarr/jobs/465075634 --- docs/release.rst | 5 ++++ zarr/meta.py | 2 +- zarr/tests/test_core.py | 5 ++-- zarr/tests/test_creation.py | 6 +++++ zarr/tests/test_meta.py | 54 +++++++++++++++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 0c1440b9ba..d42759e30c 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -44,6 +44,11 @@ Maintenance * CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and upgrade all pinned package requirements. :issue:`308`. +* Corrects handling of ``NaT`` in ``datetime64`` and ``timedelta64`` in various + compressors (by :user:`John Kirkham `; :issue:`344`). + +Acknowledgments +~~~~~~~~~~~~~~~ .. _release_2.2.0: diff --git a/zarr/meta.py b/zarr/meta.py index 7984efb701..c90c12ff38 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -209,6 +209,6 @@ def encode_fill_value(v, dtype): elif dtype.kind == 'U': return v elif dtype.kind in 'mM': - return int(v.view('u8')) + return int(v.view('i8')) else: return v diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 544ec95c41..cbad222edb 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -955,8 +955,9 @@ def test_dtypes(self): dtype = '{}8[{}]'.format(base_type, resolution) z = self.create_array(shape=100, dtype=dtype, fill_value=0) assert z.dtype == np.dtype(dtype) - a = np.random.randint(0, np.iinfo('u8').max, size=z.shape[0], - dtype='u8').view(dtype) + a = np.random.randint(np.iinfo('i8').min, np.iinfo('i8').max, + size=z.shape[0], + dtype='i8').view(dtype) z[:] = a assert_array_equal(a, z[:]) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index ef2232c234..4c2af854fb 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -135,6 +135,12 @@ def test_full(): z = full(100, chunks=10, fill_value=np.nan, dtype='f8') assert np.all(np.isnan(z[:])) + # NaT + z = full(100, chunks=10, fill_value='NaT', dtype='M8[s]') + assert np.all(np.isnat(z[:])) + z = full(100, chunks=10, fill_value='NaT', dtype='m8[s]') + assert np.all(np.isnat(z[:])) + # byte string dtype v = b'xxx' z = full(100, chunks=10, fill_value=v, dtype='S3') diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index 904c2146a7..12dda299c8 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,6 +116,60 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] +def test_encode_decode_array_datetime_timedelta(): + + # some variations + for k in ['m8[s]', 'M8[s]']: + compressor = Blosc(cname='lz4', clevel=3, shuffle=2) + dtype = np.dtype(k) + fill_value = dtype.type("NaT") + meta = dict( + shape=(100, 100), + chunks=(10, 10), + dtype=dtype, + compressor=compressor.get_config(), + fill_value=fill_value, + order=dtype.char, + filters=[] + ) + + meta_json = '''{ + "chunks": [10, 10], + "compressor": { + "id": "blosc", + "clevel": 3, + "cname": "lz4", + "shuffle": 2, + "blocksize": 0 + }, + "dtype": "%s", + "fill_value": -9223372036854775808, + "filters": [], + "order": "%s", + "shape": [100, 100], + "zarr_format": %s + }''' % (dtype.str, dtype.char, ZARR_FORMAT) + + # test encoding + meta_enc = encode_array_metadata(meta) + assert_json_equal(meta_json, meta_enc) + + # test decoding + meta_dec = decode_array_metadata(meta_enc) + assert ZARR_FORMAT == meta_dec['zarr_format'] + assert meta['shape'] == meta_dec['shape'] + assert meta['chunks'] == meta_dec['chunks'] + assert meta['dtype'] == 
meta_dec['dtype'] + assert meta['compressor'] == meta_dec['compressor'] + assert meta['order'] == meta_dec['order'] + # Based off of this SO answer: https://stackoverflow.com/a/49972198 + assert np.all( + fill_value.view((np.uint8, fill_value.itemsize)) == + meta_dec['fill_value'].view((np.uint8, meta_dec['fill_value'].itemsize)) + ) + assert [] == meta_dec['filters'] + + def test_encode_decode_array_dtype_shape(): meta = dict( From d747f4cbd965a31cda8cc6c0b7652402e2ce3620 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Mon, 10 Dec 2018 03:34:34 -0500 Subject: [PATCH 125/168] Use `ensure_ndarray` to view chunk as an array (#360) * Use `ensure_ndarray` to view chunk as an array Simplifies the process of constructing an `ndarray` to view the chunk data when writing the result to a destination. * Drop trailing whitespace * Link this PR to Numcodecs upgrade release note --- docs/release.rst | 2 +- zarr/core.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index d42759e30c..e07a82cef7 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -29,7 +29,7 @@ Bug fixes * The required version of the `numcodecs `_ package has been upgraded to 0.6.2, which has enabled some code simplification and fixes a failing test involving - msgpack encoding. By :user:`John Kirkham `, :issue:`352`, :issue:`355`, + msgpack encoding. By :user:`John Kirkham `, :issue:`360`, :issue:`352`, :issue:`355`, :issue:`324`. * Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, diff --git a/zarr/core.py b/zarr/core.py index 65bfff3cbb..b28f4e3419 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1604,10 +1604,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, if self._compressor: self._compressor.decode(cdata, dest) else: - if isinstance(cdata, np.ndarray): - chunk = cdata.view(self._dtype) - else: - chunk = np.frombuffer(cdata, dtype=self._dtype) + chunk = ensure_ndarray(cdata).view(self._dtype) chunk = chunk.reshape(self._chunks, order=self._order) np.copyto(dest, chunk) return From 8dc2f5d90a96358e4c0e568c5629ea1908814f9a Mon Sep 17 00:00:00 2001 From: shikharsg Date: Thu, 13 Dec 2018 01:10:36 +0530 Subject: [PATCH 126/168] removed wrong syntax --- appveyor.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index e33b697357..67058550dc 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,7 +26,6 @@ environment: PYTHON_VERSION: "3.7" install: - - cmd: "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" start - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" @@ -39,7 +38,3 @@ build: off test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" - -on_finish: - - cmd: | - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe" stop From 85a5670f01a64767b150a8f0c1cfd505d5852156 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Thu, 13 Dec 2018 01:15:14 +0530 Subject: [PATCH 127/168] storage emulator with docker --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index 67058550dc..69bded65eb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -26,6 +26,7 @@ environment: PYTHON_VERSION: "3.7" install: + - "docker run -d -p 10000:10000 -p 10001:10001 -p 10002:10002 microsoft/azure-storage-emulator" - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - 
"%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" From a09fb61ea0b206a94e42110c2dade7798302c98b Mon Sep 17 00:00:00 2001 From: shikharsg Date: Thu, 13 Dec 2018 12:47:33 +0530 Subject: [PATCH 128/168] trying different appveyor image --- appveyor.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 69bded65eb..34a964c218 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,6 +2,8 @@ branches: only: - master +image: Visual Studio 2017 + environment: global: From 168ba50c75776e36b228bb216a755c953c625691 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Thu, 13 Dec 2018 13:38:47 +0530 Subject: [PATCH 129/168] flake 8 fixes --- zarr/storage.py | 2 -- zarr/tests/test_storage.py | 1 - 2 files changed, 3 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 0db659aae7..8fe5963072 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -33,7 +33,6 @@ import array -import numpy as np from azure.storage.blob import BlockBlobService from azure.common import AzureMissingResourceHttpError @@ -2144,4 +2143,3 @@ def getsize(self, path): def listdir(self, path): return listdir(self.meta_store, path) - diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index c2ed7790e7..25546fa14e 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1322,4 +1322,3 @@ def test_read_write(self): cs['bar'] = 0 with pytest.raises(PermissionError): cs['spam'] = 'eggs' - From e0de99bd58803d827bf8dcccf72c2784509bf3bd Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 00:01:28 +0530 Subject: [PATCH 130/168] full coverage --- zarr/storage.py | 10 +++------- zarr/tests/test_core.py | 4 ++-- zarr/tests/test_hierarchy.py | 4 ++-- zarr/tests/test_storage.py | 10 ++++++++-- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8fe5963072..37b12408b2 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1913,7 +1913,7 @@ def __init__(self, container, prefix, account_name=None, account_key=None, self.account_key = account_key if blob_service_kwargs is not None: self.blob_service_kwargs = blob_service_kwargs - else: + else: # pragma: no cover self.blob_service_kwargs = dict() self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) @@ -1961,8 +1961,6 @@ def __setitem__(self, key, value): def __delitem__(self, key): if self.client.exists(self.container_name, '/'.join([self.prefix, key])): self.client.delete_blob(self.container_name, '/'.join([self.prefix, key])) - elif self.__contains__(key): - self.rmdir(key) else: raise KeyError @@ -2011,10 +2009,8 @@ def _strip_prefix_from_path(path, prefix): # normalized things will not have any leading or trailing slashes path_norm = normalize_storage_path(path) prefix_norm = normalize_storage_path(prefix) - if path_norm.startswith(prefix_norm): - return path_norm[(len(prefix_norm)+1):] - else: - return path + + return path_norm[(len(prefix_norm)+1):] def list_abs_directory(self, prefix, strip_prefix=True): """Return a list of all blobs and subdirectories from an abs prefix.""" diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 8f43dbc079..cfa1d732f6 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -1294,8 +1294,8 @@ class TestArrayWithABSStore(TestArray): @staticmethod def absstore(): blob_client = BlockBlobService(is_emulated=True) - if not blob_client.exists('test'): - blob_client.create_container('test') + 
blob_client.delete_container('test') + blob_client.create_container('test') store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 4038c917d7..bba21d767d 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -870,8 +870,8 @@ class TestGroupWithABSStore(TestGroup): @staticmethod def create_store(): blob_client = BlockBlobService(is_emulated=True) - if not blob_client.exists('test'): - blob_client.create_container('test') + blob_client.delete_container('test') + blob_client.create_container('test') store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 25546fa14e..e546b336c4 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1270,13 +1270,19 @@ class TestABSStore(StoreTests, unittest.TestCase): def create_store(self): blob_client = BlockBlobService(is_emulated=True) - if not blob_client.exists('test'): - blob_client.create_container('test') + blob_client.delete_container('test') + blob_client.create_container('test') store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', account_key='bar', blob_service_kwargs={'is_emulated': True}) store.rmdir() return store + def test_context_manager(self): + with self.create_store() as store: + store['foo'] = b'bar' + store['baz'] = b'qux' + assert 2 == len(store) + class TestConsolidatedMetadataStore(unittest.TestCase): From 3efe8025052e8ef7aa92cbf6dc6afcb68a66ec59 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 12:36:33 +0530 Subject: [PATCH 131/168] verbose logs for pip install to see appveyor error --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 34a964c218..3569bf4c45 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -33,7 +33,7 @@ install: - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" - "%CMD_IN_ENV% python -m pip install -rrequirements_dev_npy.txt" - - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -rrequirements_dev.txt" + - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -v -rrequirements_dev.txt" - "%CMD_IN_ENV% python setup.py install" - "%CMD_IN_ENV% python -m pip freeze" From 8f85315160efa41fb792d9a15e2ec9249843d150 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 13:03:15 +0530 Subject: [PATCH 132/168] trying to run locally installed emulator --- appveyor.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 3569bf4c45..dba8b0c895 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -28,7 +28,6 @@ environment: PYTHON_VERSION: "3.7" install: - - "docker run -d -p 10000:10000 -p 10001:10001 -p 10002:10002 microsoft/azure-storage-emulator" - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" @@ -39,5 +38,11 @@ install: build: off +before_test: + - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe start" + test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" + +after_test: + - "C:\Program Files (x86)\Microsoft 
SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe stop" From d1bb9ce41aae17ae7d9577785b1ef54b3e41519b Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 13:08:53 +0530 Subject: [PATCH 133/168] single-double quote yaml fix --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index dba8b0c895..549967fe8d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,10 +39,10 @@ install: build: off before_test: - - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe start" + - "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe start" test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" after_test: - - "C:\Program Files (x86)\Microsoft SDKs\Azure\Storage Emulator\AzureStorageEmulator.exe stop" + - "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe stop" From 735c6610ae43b6184c9c2af5091a7bc351dede3c Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 14:31:50 +0530 Subject: [PATCH 134/168] cmd prefix --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 549967fe8d..a87a844486 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,10 +39,10 @@ install: build: off before_test: - - "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe start" + - cmd: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe start test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" after_test: - - "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe stop" + - cmd: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe stop From 979a438aa7c2fc5bb390e49ea2713e92d7981df7 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 14:42:26 +0530 Subject: [PATCH 135/168] double quotes around exe file path --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index a87a844486..f2dde1ecd9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,10 +39,10 @@ install: build: off before_test: - - cmd: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe start + - cmd: "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe" start test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" after_test: - - cmd: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe stop + - cmd: "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe" stop From 5beace1c598d11b6d78108502ab540abd960670c Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 14:51:11 +0530 Subject: [PATCH 136/168] double quotes within single quotes with environment variable substitution --- appveyor.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index f2dde1ecd9..471f632812 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -11,6 +11,7 @@ environment: # /E:ON and /V:ON options are not enabled in the batch script intepreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build.cmd" + EMULATOR_LOC: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe matrix: @@ -39,10 +40,10 @@ install: build: off 
before_test: - - cmd: "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe" start + - '"%EMULATOR_LOC%" start' test_script: - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" after_test: - - cmd: "C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe" stop + - '"%EMULATOR_LOC%" stop' From 68bda4ec8f16f32ae1258aa81ba4d49fb2651873 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 16:00:21 +0530 Subject: [PATCH 137/168] trying appveyor build with VS2015 image ; --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 471f632812..36c3cc0547 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,7 +2,7 @@ branches: only: - master -image: Visual Studio 2017 +image: Visual Studio 2015 environment: From 77db63739f984d05b4e7ab86d86e87f7e69ea4e5 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 16:13:21 +0530 Subject: [PATCH 138/168] added comment and removed verbosity option for pip install --- appveyor.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 36c3cc0547..d04417d671 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,6 +2,8 @@ branches: only: - master +# the VS C++ compiler path, doesn't seem to exist in the PATH environment variable of +# the Visual Studio 2017 build VM, due to which the pyosreplace package fails to build image: Visual Studio 2015 environment: @@ -33,7 +35,7 @@ install: - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" - "%CMD_IN_ENV% python -m pip install -rrequirements_dev_npy.txt" - - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -v -rrequirements_dev.txt" + - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -rrequirements_dev.txt" - "%CMD_IN_ENV% python setup.py install" - "%CMD_IN_ENV% python -m pip freeze" From bcdc8393b002730b9a8643956f7582fb4bd178b4 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 16:25:15 +0530 Subject: [PATCH 139/168] list_abs_directory to list only directory blob using delimiter option in azure blob client --- zarr/storage.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 37b12408b2..89548d474b 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1991,7 +1991,7 @@ def __contains__(self, key): def list_abs_directory_blobs(self, prefix): """Return list of all blobs from an abs prefix.""" blobs = list() - for blob in self.client.list_blobs(self.container_name, prefix=prefix): + for blob in self.client.list_blobs(self.container_name, prefix=prefix, delimiter='/'): if '/' not in blob.name[len(prefix):]: blobs.append(blob.name) return blobs @@ -1999,7 +1999,7 @@ def list_abs_directory_blobs(self, prefix): def list_abs_subdirectories(self, prefix): """Return list of all "subdirectories" from an abs prefix.""" dirs = [] - for blob in self.client.list_blobs(self.container_name, prefix=prefix): + for blob in self.client.list_blobs(self.container_name, prefix=prefix, delimiter='/'): if '/' in blob.name[len(prefix):]: dirs.append(blob.name[:blob.name.find('/', len(prefix))]) return dirs @@ -2035,9 +2035,6 @@ def listdir(self, path=None): dir_path = self.dir_path(path) return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) - # def rename(self, src_path, dst_path): - # raise NotImplementedErrror - def rmdir(self, path=None): dir_path = 
normalize_storage_path(self.full_path(path)) + '/' for blob in self.client.list_blobs(self.container_name, prefix=dir_path): From ac286ce47e6a5b712794081fba9511edd6037f97 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 14 Dec 2018 20:11:29 +0530 Subject: [PATCH 140/168] fixed ABSStore docs --- zarr/storage.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 89548d474b..797c506c53 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1889,8 +1889,7 @@ class ABSStore(MutableMapping): Parameters ---------- container_name : string - The name of the ABS container to use. Currently this must exist in the - storage account. + The name of the ABS container to use. prefix : string Location of the "directory" to use as the root of the storage hierarchy within the container. @@ -1898,6 +1897,9 @@ class ABSStore(MutableMapping): The Azure blob storage account name. account_key : string The Azure blob storage account acess key. + blob_service_kwargs : dictionary + Extra arguments to be passed into the azure blob client, for e.g. when + using the emulator, pass in blob_service_kwargs={'is_emulated': True} Notes ----- From 2c1a6e09f729547d87a9f5b3fb5592706e01e64d Mon Sep 17 00:00:00 2001 From: jakirkham Date: Sat, 15 Dec 2018 18:32:02 -0500 Subject: [PATCH 141/168] Drop temporary workaround for `get_codec` (#361) * Drop temporary workaround for `get_codec` This workaround was added because `get_codec` was modifying its argument. However as of Numcodecs 0.6.0, `get_codec` does not modify its argument as it takes a copy and modifies the copy instead. Given as we now require Numcodecs 0.6.0, this workaround is no longer needed. Hence we drop it. * Link this to the Numcodecs upgrade release note --- docs/release.rst | 4 ++-- zarr/core.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index e07a82cef7..e3c0b85d59 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -29,8 +29,8 @@ Bug fixes * The required version of the `numcodecs `_ package has been upgraded to 0.6.2, which has enabled some code simplification and fixes a failing test involving - msgpack encoding. By :user:`John Kirkham `, :issue:`360`, :issue:`352`, :issue:`355`, - :issue:`324`. + msgpack encoding. By :user:`John Kirkham `, :issue:`361`, :issue:`360`, :issue:`352`, + :issue:`355`, :issue:`324`. * Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, :issue:`273`, :issue:`308`. 
diff --git a/zarr/core.py b/zarr/core.py index b28f4e3419..80d1830c07 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -166,9 +166,6 @@ def _load_metadata_nosync(self): if config is None: self._compressor = None else: - # temporary workaround for - # https://github.com/zarr-developers/numcodecs/issues/78 - config = dict(config) self._compressor = get_codec(config) # setup filters From cdaceb7f4adcc49c1f49e7f743814b6d6bd412f7 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sun, 16 Dec 2018 17:34:28 +0530 Subject: [PATCH 142/168] fixed windows path listdir error --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 797c506c53..ea9303a730 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -2029,7 +2029,7 @@ def dir_path(self, path=None): # prefix is normalized to not have a trailing slash dir_path = self.prefix if store_path: - dir_path = os.path.join(dir_path, store_path) + dir_path = dir_path + '/' + store_path dir_path += '/' return dir_path From b6b3024dae729d7baceb938c78017da24273fa87 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sun, 16 Dec 2018 22:03:13 +0530 Subject: [PATCH 143/168] ABSStore refactoring --- zarr/storage.py | 66 +++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 46 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index ea9303a730..3eb0385e74 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1942,8 +1942,12 @@ def _append_path_to_prefix(path, prefix): return '/'.join([normalize_storage_path(prefix), normalize_storage_path(path)]) - def full_path(self, path=None): - return self._append_path_to_prefix(path, self.prefix) + @staticmethod + def _strip_prefix_from_path(path, prefix): + # normalized things will not have any leading or trailing slashes + path_norm = normalize_storage_path(path) + prefix_norm = normalize_storage_path(prefix) + return path_norm[(len(prefix_norm)+1):] def __getitem__(self, key): blob_name = '/'.join([self.prefix, key]) @@ -1990,55 +1994,24 @@ def __contains__(self, key): else: return False - def list_abs_directory_blobs(self, prefix): - """Return list of all blobs from an abs prefix.""" - blobs = list() - for blob in self.client.list_blobs(self.container_name, prefix=prefix, delimiter='/'): - if '/' not in blob.name[len(prefix):]: - blobs.append(blob.name) - return blobs - - def list_abs_subdirectories(self, prefix): - """Return list of all "subdirectories" from an abs prefix.""" - dirs = [] - for blob in self.client.list_blobs(self.container_name, prefix=prefix, delimiter='/'): - if '/' in blob.name[len(prefix):]: - dirs.append(blob.name[:blob.name.find('/', len(prefix))]) - return dirs - - @staticmethod - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - - return path_norm[(len(prefix_norm)+1):] - - def list_abs_directory(self, prefix, strip_prefix=True): - """Return a list of all blobs and subdirectories from an abs prefix.""" - items = set() - items.update(self.list_abs_directory_blobs(prefix)) - items.update(self.list_abs_subdirectories(prefix)) - items = list(items) - if strip_prefix: - items = [self._strip_prefix_from_path(path, prefix) for path in items] - return items - - def dir_path(self, path=None): + def listdir(self, path=None): store_path = normalize_storage_path(path) # prefix is normalized to not have a trailing slash dir_path = self.prefix if store_path: 
dir_path = dir_path + '/' + store_path dir_path += '/' - return dir_path - - def listdir(self, path=None): - dir_path = self.dir_path(path) - return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) + items = list() + for blob in self.client.list_blobs(self.container_name, prefix=dir_path, delimiter='/'): + if '/' in blob.name[len(dir_path):]: + items.append(self._strip_prefix_from_path( + blob.name[:blob.name.find('/', len(dir_path))], dir_path)) + else: + items.append(self._strip_prefix_from_path(blob.name, dir_path)) + return items def rmdir(self, path=None): - dir_path = normalize_storage_path(self.full_path(path)) + '/' + dir_path = normalize_storage_path(self._append_path_to_prefix(path, self.prefix)) + '/' for blob in self.client.list_blobs(self.container_name, prefix=dir_path): self.client.delete_blob(self.container_name, blob.name) @@ -2052,9 +2025,10 @@ def getsize(self, path=None): fs_path).properties.content_length else: size = 0 - for blob_name in self.list_abs_directory_blobs(fs_path + '/'): - size += self.client.get_blob_properties(self.container_name, - blob_name).properties.content_length + for blob in self.client.list_blobs(self.container_name, prefix=fs_path + '/', + delimiter='/'): + if '/' not in blob.name[len(fs_path + '/'):]: + size += blob.properties.content_length return size def clear(self): From b6eebc8990302b00675905bdfe24190b008bdd73 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sun, 23 Dec 2018 15:05:40 +0530 Subject: [PATCH 144/168] moved py2 array.array checking to numcodecs ensure bytes --- zarr/storage.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 3eb0385e74..a32c7f6dbf 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1958,8 +1958,7 @@ def __getitem__(self, key): raise KeyError('Blob %s not found' % blob_name) def __setitem__(self, key, value): - if PY2 and isinstance(value, array.array): - value = value.tostring() + value = ensure_bytes(value) blob_name = '/'.join([self.prefix, key]) buffer = io.BytesIO(value) self.client.create_blob_from_stream(self.container_name, blob_name, buffer) From bf2b6721774709d9a2581e2d42264dbb7612aca2 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 3 Jan 2019 13:44:08 -0500 Subject: [PATCH 145/168] Use `reversed` to iterate through `list` backwards (#374) As using slicing syntax on `list`s makes a copy, use `reversed` instead to avoid the copy and just create a generator for iterating through the `list` backwards. --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index 80d1830c07..602891b68e 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1735,7 +1735,7 @@ def _decode_chunk(self, cdata): # apply filters if self._filters: - for f in self._filters[::-1]: + for f in reversed(self._filters): chunk = f.decode(chunk) # view as numpy array with correct dtype From d7ad9fbd87c214c2303c6e66f4879b2d1181bedf Mon Sep 17 00:00:00 2001 From: jakirkham Date: Thu, 3 Jan 2019 13:45:02 -0500 Subject: [PATCH 146/168] Always use `tuple`s for multidimensional indexing (#376) * Always use `tuple`s for multidimensional indexing NumPy 1.15.0 deprecates the use of sequences in `ndarray` selection for sequences other than `tuple`s. We have a few cases where we are using `list`s instead as they are mutable. Thus are easier to build up and change. However passing `list`s into `ndarray` selection will cause this `FutureWarning` to be emitted (and will eventually result in an error). 
To fix this simply, just convert these `list`s to `tuple`s after we are done changing their contents, but before we use them with `ndarray`s. This works just as well on older versions of NumPy. Avoids the `FutureWarning` in NumPy 1.15.0+. Not to mention this will continue to work on future versions of NumPy, which may change this warning into an error. * Note use of `tuple`s for slicing NumPy `ndarrays` --- docs/release.rst | 3 +++ zarr/core.py | 1 + zarr/indexing.py | 1 + 3 files changed, 5 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index e3c0b85d59..a1cc4286af 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -38,6 +38,9 @@ Bug fixes * Ensure ``DictStore`` contains only ``bytes`` to facilitate comparisons and protect against writes. By :user:`John Kirkham `, :issue:`350` +* Always use a ``tuple`` when indexing a NumPy ``ndarray``. + By :user:`John Kirkham `, :issue:`376` + Maintenance ~~~~~~~~~~~ diff --git a/zarr/core.py b/zarr/core.py index 602891b68e..cc04953a7c 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -1531,6 +1531,7 @@ def _set_selection(self, indexer, value, fields=None): item = [slice(None)] * self.ndim for a in indexer.drop_axes: item[a] = np.newaxis + item = tuple(item) chunk_value = chunk_value[item] # put data diff --git a/zarr/indexing.py b/zarr/indexing.py index 52e11fbf28..0cd116bc37 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -513,6 +513,7 @@ def oindex_set(a, selection, value): value_selection = [slice(None)] * len(a.shape) for i in drop_axes: value_selection[i] = np.newaxis + value_selection = tuple(value_selection) value = value[value_selection] a[selection] = value From 585f0f51c697acb4058e1b28da961438f96f43e2 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 15 Jan 2019 20:08:57 -0500 Subject: [PATCH 147/168] Test the `popitem` method of `MutableMapping`s too (#378) Adds a test for the `popitem` method of `MutableMapping`'s as well. Typically this is not used, but it does get used in the default `clear` method. Given this method doesn't guarantee an order, use a store with a single-key value pair to simplify the test logic. Also test the case of an empty store to make sure it errors out properly. Override the `popitem` method appropriate for stores that do not handle removal properly. 
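To see why ``popitem`` matters even for stores that never call it directly: the ``MutableMapping`` mixin's default ``clear`` is implemented as a loop over ``popitem`` until ``KeyError``. A toy Python 3 illustration with a hypothetical ``CountingStore`` (not part of the test suite)::

    from collections.abc import MutableMapping

    class CountingStore(MutableMapping):
        """Toy dict-backed store that counts popitem calls (illustration only)."""

        def __init__(self):
            self._d = {}
            self.popitem_calls = 0

        def __getitem__(self, key):
            return self._d[key]

        def __setitem__(self, key, value):
            self._d[key] = value

        def __delitem__(self, key):
            del self._d[key]

        def __iter__(self):
            return iter(self._d)

        def __len__(self):
            return len(self._d)

        def popitem(self):
            item = self._d.popitem()   # raises KeyError when empty
            self.popitem_calls += 1
            return item

    store = CountingStore()
    store['a'] = b'1'
    store['b'] = b'2'
    store.clear()   # the mixin's clear loops popitem() until KeyError
    assert len(store) == 0
    assert store.popitem_calls == 2
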
--- zarr/tests/test_storage.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 0416470dd7..d9f2cc5fd2 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -93,6 +93,16 @@ def test_pop(self): with pytest.raises(KeyError): store.pop('xxx') + def test_popitem(self): + store = self.create_store() + store['foo'] = b'bar' + k, v = store.popitem() + assert k == 'foo' + assert v == b'bar' + assert len(store) == 0 + with pytest.raises(KeyError): + store.popitem() + def test_writeable_values(self): store = self.create_store() @@ -762,6 +772,13 @@ def test_pop(self): with pytest.raises(NotImplementedError): store.pop('foo') + def test_popitem(self): + # override because not implemented + store = self.create_store() + store['foo'] = b'bar' + with pytest.raises(NotImplementedError): + store.popitem() + class TestDBMStore(StoreTests, unittest.TestCase): From fefce3b0ef03885ac59f8e0f71365763cfa2eaac Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 15 Jan 2019 20:20:05 -0500 Subject: [PATCH 148/168] Test `pop` with default argument (#380) * Test `pop` with default argument Adds another case to `test_pop` for stores generally, which merely tests if `pop` can handle the default argument correctly when no key can be found. * Test `pop` with the default value of `None` Some implementations of `pop` might carelessly set the `default` to `None` when not passed. However this would make it impossible to distinguish the case where the user passed `None` for the `default` intentionally versus not passing anything for the `default`. The result being both cases would raise a `KeyError`, but the error would be incorrect in the first case. The usual way of solving this is to create some dummy object and make that the `default` when if it is not set. That way one can compare if the dummy object is seen and only raise then. Thus passing `None` for the `default` would not error, but return `None` if the `key` does not exist as expected. This test is added to catch this potential oversight. * Test `pop` with a non-trivial `bytes` object --- zarr/tests/test_storage.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index d9f2cc5fd2..154ffbe3ed 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -92,6 +92,12 @@ def test_pop(self): assert len(store) == 0 with pytest.raises(KeyError): store.pop('xxx') + v = store.pop('xxx', b'default') + assert v == b'default' + v = store.pop('xxx', b'') + assert v == b'' + v = store.pop('xxx', None) + assert v is None def test_popitem(self): store = self.create_store() From 43f7faefcd7b4bd290b86305f5085e20a6add052 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 22 Jan 2019 23:52:54 +1100 Subject: [PATCH 149/168] Add SQLiteStore (#368) * Create an SQLite-backed mutable mapping Implements a key-value store using SQLite. As this is a builtin module in Python and a common database to use in various languages, this should have high utility and be very portable. Not to mention many databases provide an SQLite language on top regardless of the internal representation. So this can be a great template for users wishing to work with Zarr in their preferred database. * Test SQLiteStore Try using the `SQLiteStore` everywhere one would use another store and make sure that it behaves correctly. This includes simple key-value store usage, creating hierarchies, and storing arrays. 
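As a rough illustration of the underlying idea (a sketch only, not this
patch's exact implementation), a key-value store over the builtin
`sqlite3` module reduces to a two-column table::

    import sqlite3

    # isolation_level=None puts the connection in auto-commit mode
    db = sqlite3.connect(':memory:', isolation_level=None)
    db.execute('CREATE TABLE IF NOT EXISTS zarr (k TEXT PRIMARY KEY, v BLOB)')

    # REPLACE gives set-or-overwrite semantics for __setitem__
    db.execute('REPLACE INTO zarr VALUES (?, ?)', ('foo', b'bar'))

    # a parameterized SELECT implements __getitem__
    row = db.execute('SELECT v FROM zarr WHERE k = ?', ('foo',)).fetchone()
    assert row[0] == b'bar'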
* Export `SQLiteStore` to the top-level namespace * Include some SQLiteStore examples Provide a few examples of how one might use `SQLiteStore` to store arrays or groups. These examples are taken with minor modifications from the `LMDBStore` examples. * Demonstrate the `SQLiteStore` in the tutorial Includes a simple example borrowed from `LMDBStore`'s tutorial example, which shows how to create and use an `SQLiteStore`. * Provide API documentation for `SQLiteStore` * Make a release note for `SQLiteStore` * Use unique extension for `SQLiteStore` files Otherwise we may end up opening a different databases' files and try to use them with SQLite only to run into errors. This caused the doctests to fail previously. Changing the extension as we have done should avoid these conflicts. * Only close SQLite database when requested Instead of opening, committing, and closing the SQLite database for every operation, limit these to user requested operations. Namely commit only when the user calls `flush`. Also close only when the user calls `close`. This should make operations with SQLite much more performant than when we automatically committed and closed after every user operation. * Update docs to show how to close `SQLiteStore` As users need to explicitly close the `SQLiteStore` to commit changes and serialize them to the SQLite database, make sure to point this out in the docs. * Ensure all SQL commands are capitalized Appears some of these commands work without capitalization. However as the docs show commands as capitalized, ensure that we are doing the same thing as well. That way this won't run into issues with different SQL implementations or older versions of SQLite that are less forgiving. Plus this should match closer to what users familiar with SQL expect. * Simplify `SQLiteStore`'s `__delitem__` using `in` Make use of `in` instead of repeating the same logic in `__delitem__`. As we now keep the database open between operations, this is much simpler than duplicating the key check logic. Also makes it a bit easier to understand what is going on. * Drop no longer needed flake8 error suppression This was needed when the `import` of `sqlite3` was only here to ensure that it existed (even though it wasn't used). Now we make use of `sqlite3` where it is being imported. So there is no need to tell flake8 to not worry about the unused import as there isn't one. * Simplify `close` and use `flush` * Flush before pickling `SQLiteStore` Make sure that everything intended to be added to the `SQLiteStore` database has been written to disk before attempting to pickle it. That way we can be sure other processes constructing their own `SQLiteStore` have access to the same data and not some earlier representation. * Special case in-memory SQLite database No need to normalize the path when there isn't one (e.g. `:memory:`). * Drop unneeded empty `return` statement * Update docs/release.rst Fix a typo. Co-Authored-By: jakirkham * Update docs/release.rst Include author and original issue in changelog entry. Co-Authored-By: jakirkham * Correct default value for `check_same_thread` The default value for `check_same_thread` was previously set to `False` when in reality we want this check enabled. So set `check_same_thread` to `True`. * Flush after making any mutation to the database As users could change the setting of things like `check_same_thread` or they may try to access the same database from multiple threads or processes, make sure to flush any changes that would mutate the database. 
* Skip flushing data when pickling `SQLiteStore` As we now always commit after an operation that mutates the data, there is no need to commit before pickling the `SQLiteStore` object. After all the data should already be up-to-date in the database. * Skip using `flush` in `close` As everything should already be flushed to the database whenever the state is mutated, there is no need to perform this before closing. * Implement `update` for `SQLiteStore` While there is a default implementation of `update` for `MutableMapping`s, it means that we perform multiple `__setitem__` operations. However it would be better if we could commit all key-value pairs in one operation and commit them. Hence we implement `update` for this purpose. * Rewrite `__setitem__` to use `update` Simplifies `__setitem__` to an `update` operation with a dictionary that contains only one item. This works the same as before, but cuts out some redundancy, which simplifies the code a bit. * Disable `check_same_thread` by default again As we now make sure to commit after every mutating change to the database, disable `check_same_thread` again as it should be safe. * Force some parameters to defaults As some of these parameters no longer make sense to be user customizable, go ahead and just set their values as part of the `sqlite3.connect` call. This ensures that they are set the way we expect. Also it ensures that if users try to mess with them, an error will be raised due to duplicate keyword arguments. To elaborate on why these parameters are not user configurable any more, `detect_types` only makes sense if one is building their own table with specific types. Instead we build the table for users and have very generic types (i.e. text and blobs), which are not worth checking. As we commit after every modification to the database to make it more friendly for other threads and processes, the `isolation_level` might as well be to auto-commit. Setting it to anything else really has no effect. Finally there is no need for `check_same_thread` to be anything other than `False` as we are guaranteeing everything is committed after mutation, which ensures the database is thread and process safe. * Drop `flush` calls from `SQLiteStore` As we already enable auto-committing, any mutation is automatically written after performed. So there is no need for us to commit afterwards. Besides `commit` is turned into a no-op if auto-committing is enabled. * Drop the `flush` function from `SQLiteStore` As we auto-commit all changes, there is no need for a `flush` operation for the `SQLiteStore`. So go ahead and drop the `flush` function and its documentation. * Implement optimized `clear` for `SQLiteStore` As the default implementation of `clear` deletes each key-value pair, this will be considerably slower than an operation that can remove all of them at once. Here we do exactly that by using SQL's `DROP TABLE`. Unfortunately there is not a truncate table command, but doing a drop followed by a create has the same effect. We combine these two operations using `executescript`. Thus auto-commit won't run until after both have run, which will commit the table with all key-value pairs removed. * Implement optimized `rmdir` for `SQLiteStore` Provides an SQL implementation of `rmdir` that is a bit better optimized for removing anything that matches the specified path as opposed to doing multiple removals. 
If it is detected that the root directory is being removed, simply fallback to clear, which is optimized for that use case as it uses `DROP TABLE` instead of deleting rows. Otherwise remove any path that begins with the normalized user-provided path as long as it may contain at least one more character after. This stops `rmdir` from removing a key-value pair where the key exactly matches normalized user-provided path (i.e. not a "directory" as it contains data). * Implement optimized `getsize` for `SQLiteStore` Take advantage of SQLite's ability to query and filter tables quickly to implement `getsize` entirely in SQL (with the exception of path normalization to sanitize user input). Measures the `LENGTH` of all blobs in the column and calls `SUM` to get their aggregate size. In the event that there are no matches, use `COALESCE` to replace the `NULL` value returned by `SUM` with `0` instead. * Implement optimized `listdir` for `SQLiteStore` Take advantage of SQLite's ability to query and filter tables quickly to implement `listdir` entirely in SQL (with the exception of path normalization to sanitize user input). Makes use of a nested `SELECT`/`AS` to build a set of partial keys below top-level key. These are then further split to get only the portion directly under the top-level key and not any of their children. * Implement `rename` for `SQLiteStore` Creates an SQL implementation of `rename` for `SQLiteStore`. As there isn't a way to safely change the keys in `SQLiteStore` (since they are `PRIMARY`), simply create a temporary table that copies over the key-value pairs with keys renamed using a nested `SELECT` statement. Then delete all key-value pairs that match the keys to move. Finally copy all key-value pairs from the temporary table into our table and delete the temporary table. Perform all of this as a transaction so only the final result of the rename is visible to others. * Allow users to specify the SQLite table name Instead of just picking an arbitrary table name for users, allow them to pick a name for the table. Let it default to `zarr` though to make it easy to discover where it got stored if someone inspects the SQLite database. * Randomize temporary table name Use a UUID to generate a unique table name for the temporary table to hopefully avoid collisions even if multiple such operations are occurring and/or remnants of older operations stuck around. * Merge `SELECT`s in `rename` Fuses the two `SELECTS` in `SQLiteStore`'s `rename` function into one. * Tidy `rename` SQL code a bit * Fuse away one `SELECT` in `listdir` In `SQLiteStore`'s `listdir`, fuse the `SELECT` performing the ordering with the `SELECT` applying the `DISTINCT` criteria. As these can be combined and often `DISTINCT` already performs ordering, this may be a bit easier to optimize for different SQL engines. * Only use `k` in `SQLiteStore`'s `__contains__` We don't make use of the values only the keys when checking for existence. So drop the `v` column from the `SELECT` statement as it is unused and only grab the `k` column. * Fuse `SELECT`s in `SQLiteStore`'s `__contains__` Simplifies the SQL used in `SQLiteStore`'s `__contains__` method by fusing the two `SELECT` statements into one. Does this by using `COUNT(*)` to determine how many rows are left after the `SELECT`. As the selection checks for an exact match with the key (and keys are `PRIMARY`), there either is exactly `1` or `0`. So this works the same as `SELECT EXISTS`, but with a single `SELECT` instead. 
* Cast `has` to `bool` in `SQLiteStore.__contains__`

SQLite does not have a boolean type and merely represents them with
integers like `0` and `1` (much like C, which it is written in). While
Python will perform the conversion of the `__contains__` result to `bool`
for us, go ahead and perform the conversion explicitly for clarity.

* Prefer using single quotes in more places

* Wrap SQL table creation text

* Adjust wrapping of `SQLiteStore.clear`'s code

* Use parameters for SQL in `listdir`

Make sure to use parameters to pass in `path` used by `listdir`'s SQL code
to avoid problems caused by injection.

* Use parameters for SQL in `getsize`

Make sure to use parameters to pass in `path` used by `getsize`'s SQL code
to avoid problems caused by injection.

* Use parameters for SQL in `rmdir`

Make sure to use parameters to pass in `path` used by `rmdir`'s SQL code
to avoid problems caused by injection.

* Adjust formatting of `SQLiteStore.__contains__`

Make sure the command is run in the first line and the result stored. Then
unpack and return what it finds.

* Drop `SQLiteStore`'s implementation of `rename`

It's difficult to protect against injections, avoid copying, use a single
transaction, etc. in an SQL implementation of `rename`. So instead just
drop this implementation and allow the default `rename` implementation to
be used.

* Just name the SQL table "zarr"

Instead of allowing the user to customize where the table is stored, just
set it to "zarr". This avoids issues with the table name potentially
exploiting injection attacks. Besides, it's unclear this level of
flexibility is really needed given Zarr supports Groups and thus can store
many Arrays in the same key-value store.

* Unwrap some lines to compact the code a bit

* Simplify `SQLiteStore.__contains__` code wrapping

* Check SQLite Cursor's rowcount for deletion

Instead of checking if a particular key exists and then either raising a
`KeyError` or deleting it in `SQLiteStore`, go ahead with the deletion and
check the value of `rowcount`. As the keys are primary, they must be
unique and thus each one only occurs once. Thus if deletion worked, the
`rowcount` will be exactly `1` (it cannot be larger). Alternatively if
deletion failed, the `rowcount` would be `0`. Thus we can simply check if
the `rowcount` is not `1` and raise the `KeyError`. This should improve
the performance a bit.

* Parenthesize operations to `?` in SQL

To make sure that SQL prioritizes the right things, parenthesize some
operations with `?` to clarify to the reader and the parser what should be
prioritized. This is done particularly when concatenating special string
match symbols to user parameters.

* Check `rowcount` for values less than `1`

* Parenthesize a few other SQL commands with `?`

* Use one line for `SQLiteStore.rmdir`'s SQL

* Use 1 line for `SQLiteStore.rmdir`'s SQL & params

* Update docs/release.rst

Co-Authored-By: jakirkham

* `TestSQLiteStore` -> `TestGroupWithSQLiteStore`

* Drop `else` in `for`/`else` for clarity

* Ensure SQLite is new enough to enable threading

Adds a simple check to ensure SQLite is new enough to enable thread-safe
sharing of connections before setting `check_same_thread=False`. If SQLite
is not new enough, set `check_same_thread=True`.

* Add spacing around `=`

* Hold a lock for any DML operations in SQLiteStore

As there are some concerns about keeping operations on the SQLite database
sequential for thread-safety, acquire an internal lock when a DML
operation occurs.
This should ensure that only one modification can occur at a time regardless of whether the connection uses the serialized threading mode or not. * Raise when pickling an in-memory SQLite database * Test in-memory SQLiteStore's separately Uses all the same tests we use for SQLiteStore's on disk except it special cases the pickling test to ensure the `SQLiteStore` cannot be pickled if it is in-memory. * Drop explicit setting of `sqlite3` defaults Simply use the `Connection`'s default arguments implicitly instead of explicitly setting them in the constructor. * Adjust inheritance of `TestSQLiteStoreInMemory` Make sure to inherit directly from `unittest.TestCase` as well. --- docs/api/storage.rst | 4 + docs/release.rst | 4 + docs/tutorial.rst | 10 ++ zarr/__init__.py | 3 +- zarr/storage.py | 207 +++++++++++++++++++++++++++++++++++ zarr/tests/test_core.py | 27 ++++- zarr/tests/test_hierarchy.py | 20 +++- zarr/tests/test_storage.py | 38 ++++++- 8 files changed, 308 insertions(+), 5 deletions(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 74801d3115..24498b0d79 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -21,6 +21,10 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush +.. autoclass:: SQLiteStore + + .. automethod:: close + .. autoclass:: LRUStoreCache .. automethod:: invalidate diff --git a/docs/release.rst b/docs/release.rst index a1cc4286af..f0d5a559ab 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -19,6 +19,10 @@ Enhancements * Support has been added for structured arrays with sub-array shape and/or nested fields. By :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. +* Adds the SQLite-backed :class:`zarr.storage.SQLiteStore` class enabling an + SQLite database to be used as the backing store for an array or group. + By :user:`John Kirkham `, :issue:`368`, :issue:`365`. 
+
 Bug fixes
 ~~~~~~~~~

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 29ce8b0935..0fbefc3e2e 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -729,6 +729,16 @@ group (requires `lmdb `_ to be installed)::
     >>> z[:] = 42
     >>> store.close()

+New in Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class, which
+enables an SQLite database to be used for storing an array or group (requires
+Python to be built with SQLite support)::
+
+    >>> store = zarr.SQLiteStore('data/example.sqldb')
+    >>> root = zarr.group(store=store, overwrite=True)
+    >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
+    >>> z[:] = 42
+    >>> store.close()
+
 Distributed/cloud storage
 ~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/zarr/__init__.py b/zarr/__init__.py
index cf34d3d427..c9046f6bff 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -7,7 +7,8 @@ from zarr.creation import (empty, zeros, ones, full, array, empty_like,
                            zeros_like, ones_like, full_like, open_array, open_like, create)
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
-                          NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache)
+                          NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore,
+                          LRUStoreCache)
 from zarr.hierarchy import group, open_group, Group
 from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.codecs import *

diff --git a/zarr/storage.py b/zarr/storage.py
index e7d70ea7bc..2a07d9aa38 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import, print_function, division
 from collections import MutableMapping, OrderedDict
 import os
+import operator
 import tempfile
 import zipfile
 import shutil
@@ -26,6 +27,7 @@ import sys
 import json
 import multiprocessing
+from pickle import PicklingError
 from threading import Lock, RLock
 import glob
 import warnings
@@ -1877,6 +1879,211 @@ def __delitem__(self, key):
         self._invalidate_value(key)


+class SQLiteStore(MutableMapping):
+    """Storage class using SQLite.
+
+    Parameters
+    ----------
+    path : string
+        Location of database file.
+    **kwargs
+        Keyword arguments passed through to the `sqlite3.connect` function.
+
+    Examples
+    --------
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.SQLiteStore('data/array.sqldb')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()  # don't forget to call this when you're done
+
+    Store a group::
+
+        >>> store = zarr.SQLiteStore('data/group.sqldb')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...]
= 42 + >>> store.close() # don't forget to call this when you're done + """ + + def __init__(self, path, **kwargs): + import sqlite3 + + # normalize path + if path != ':memory:': + path = os.path.abspath(path) + + # store properties + self.path = path + self.kwargs = kwargs + + # allow threading if SQLite connections are thread-safe + # + # ref: https://www.sqlite.org/releaselog/3_3_1.html + # ref: https://bugs.python.org/issue27190 + check_same_thread = True + if sqlite3.sqlite_version_info >= (3, 3, 1): + check_same_thread = False + + # keep a lock for serializing mutable operations + self.lock = Lock() + + # open database + self.db = sqlite3.connect( + self.path, + detect_types=0, + isolation_level=None, + check_same_thread=check_same_thread, + **self.kwargs + ) + + # handle keys as `str`s + self.db.text_factory = str + + # get a cursor to read/write to the database + self.cursor = self.db.cursor() + + # initialize database with our table if missing + with self.lock: + self.cursor.execute( + 'CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)' + ) + + def __getstate__(self): + if self.path == ':memory:': + raise PicklingError('Cannot pickle in-memory SQLite databases') + return self.path, self.kwargs + + def __setstate__(self, state): + path, kwargs = state + self.__init__(path=path, **kwargs) + + def close(self): + """Closes the underlying database.""" + + # close cursor and db objects + self.cursor.close() + self.db.close() + + def __getitem__(self, key): + value = self.cursor.execute('SELECT v FROM zarr WHERE (k = ?)', (key,)) + for v, in value: + return v + raise KeyError(key) + + def __setitem__(self, key, value): + self.update({key: value}) + + def __delitem__(self, key): + with self.lock: + self.cursor.execute('DELETE FROM zarr WHERE (k = ?)', (key,)) + if self.cursor.rowcount < 1: + raise KeyError(key) + + def __contains__(self, key): + cs = self.cursor.execute( + 'SELECT COUNT(*) FROM zarr WHERE (k = ?)', (key,) + ) + for has, in cs: + has = bool(has) + return has + + def items(self): + kvs = self.cursor.execute('SELECT k, v FROM zarr') + for k, v in kvs: + yield k, v + + def keys(self): + ks = self.cursor.execute('SELECT k FROM zarr') + for k, in ks: + yield k + + def values(self): + vs = self.cursor.execute('SELECT v FROM zarr') + for v, in vs: + yield v + + def __iter__(self): + return self.keys() + + def __len__(self): + cs = self.cursor.execute('SELECT COUNT(*) FROM zarr') + for c, in cs: + return c + + def update(self, *args, **kwargs): + args += (kwargs,) + + kv_list = [] + for dct in args: + for k, v in dct.items(): + # Python 2 cannot store `memoryview`s, but it can store + # `buffer`s. However Python 2 won't return `bytes` then. So we + # coerce to `bytes`, which are handled correctly. Python 3 + # doesn't have these issues. + if PY2: # pragma: py3 no cover + v = ensure_bytes(v) + else: # pragma: py2 no cover + v = ensure_contiguous_ndarray(v) + + # Accumulate key-value pairs for storage + kv_list.append((k, v)) + + with self.lock: + self.cursor.executemany('REPLACE INTO zarr VALUES (?, ?)', kv_list) + + def listdir(self, path=None): + path = normalize_storage_path(path) + keys = self.cursor.execute( + ''' + SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( + SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m + FROM zarr WHERE k LIKE (? 
|| "_%") + ) ORDER BY l ASC + ''', + (path, path) + ) + keys = list(map(operator.itemgetter(0), keys)) + return keys + + def getsize(self, path=None): + path = normalize_storage_path(path) + size = self.cursor.execute( + ''' + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + ''', + (path, path) + ) + for s, in size: + return s + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + with self.lock: + self.cursor.execute( + 'DELETE FROM zarr WHERE k LIKE (? || "_%")', (path,) + ) + else: + self.clear() + + def clear(self): + with self.lock: + self.cursor.executescript( + ''' + BEGIN TRANSACTION; + DROP TABLE zarr; + CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); + COMMIT TRANSACTION; + ''' + ) + + class ConsolidatedMetadataStore(MutableMapping): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index cbad222edb..b2b6bb011e 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -15,7 +15,7 @@ from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore, - DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob, + DBMStore, LMDBStore, SQLiteStore, atexit_rmtree, atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.errors import PermissionError @@ -1390,6 +1390,31 @@ def test_nbytes_stored(self): pass # not implemented +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestArrayWithSQLiteStore(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + def test_nbytes_stored(self): + pass # not implemented + + class TestArrayWithNoCompressor(TestArray): def create_array(self, read_only=False, **kwargs): diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 7758976c8c..369cf4b55a 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -17,8 +17,8 @@ from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array, array_meta_key, group_meta_key, atexit_rmtree, - NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob, - LRUStoreCache) + NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, + atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.compat import PY2, text_type from zarr.hierarchy import Group, group, open_group @@ -928,6 +928,22 @@ def create_store(): return store, None +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestGroupWithSQLiteStore(TestGroup): + + def create_store(self): + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store, None + + class TestGroupWithChunkStore(TestGroup): @staticmethod diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 154ffbe3ed..ab3ea5c26d 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -8,6 
+8,7 @@ import array import shutil import os +from pickle import PicklingError import numpy as np @@ -19,7 +20,7 @@ DirectoryStore, ZipStore, init_group, group_meta_key, getsize, migrate_1to2, TempStore, atexit_rmtree, NestedDirectoryStore, default_compressor, DBMStore, - LMDBStore, atexit_rmglob, LRUStoreCache, + LMDBStore, SQLiteStore, atexit_rmglob, LRUStoreCache, ConsolidatedMetadataStore) from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT, decode_group_metadata, encode_group_metadata) @@ -894,6 +895,41 @@ def test_context_manager(self): assert 2 == len(store) +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestSQLiteStore(StoreTests, unittest.TestCase): + + def create_store(self): + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestSQLiteStoreInMemory(TestSQLiteStore, unittest.TestCase): + + def create_store(self): + store = SQLiteStore(':memory:') + return store + + def test_pickle(self): + + # setup store + store = self.create_store() + store['foo'] = b'bar' + store['baz'] = b'quux' + + # round-trip through pickle + with pytest.raises(PicklingError): + pickle.dumps(store) + + class TestLRUStoreCache(StoreTests, unittest.TestCase): def create_store(self): From 3abe79df41b0a1eca99197c8e3e40013c3e98b67 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 22 Jan 2019 17:21:26 +0000 Subject: [PATCH 150/168] syntax fix --- zarr/tests/test_hierarchy.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 19fa4bfe2c..37baecf1ae 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -18,8 +18,6 @@ from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array, array_meta_key, group_meta_key, atexit_rmtree, - NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob, - LRUStoreCache, ABSStore) NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, ABSStore, atexit_rmglob, LRUStoreCache) from zarr.core import Array From 3ad6d9c1fded1bfd1d59f205b5ab9cbcc8e4670f Mon Sep 17 00:00:00 2001 From: shikharsg Date: Tue, 22 Jan 2019 21:35:20 +0000 Subject: [PATCH 151/168] flake8 fix --- zarr/storage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 834bc8ab49..08c7274315 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -32,7 +32,6 @@ import glob import warnings import io -import array from azure.storage.blob import BlockBlobService From ab38119fa63acd3ad5294bafeb7f1984652411f9 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 23 Jan 2019 10:00:50 +0000 Subject: [PATCH 152/168] fixed ABSStore parameter name container --- zarr/storage.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 08c7274315..a0097f1c35 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1889,7 +1889,7 @@ class ABSStore(MutableMapping): Parameters ---------- - container_name : string + container : string The name of the ABS container to use. 
prefix : string Location of the "directory" to use as the root of the storage hierarchy @@ -1910,7 +1910,7 @@ class ABSStore(MutableMapping): def __init__(self, container, prefix, account_name=None, account_key=None, blob_service_kwargs=None): - self.container_name = container + self.container = container self.prefix = normalize_storage_path(prefix) self.account_name = account_name self.account_key = account_key @@ -1953,7 +1953,7 @@ def _strip_prefix_from_path(path, prefix): def __getitem__(self, key): blob_name = '/'.join([self.prefix, key]) try: - blob = self.client.get_blob_to_bytes(self.container_name, blob_name) + blob = self.client.get_blob_to_bytes(self.container, blob_name) return blob.content except AzureMissingResourceHttpError: raise KeyError('Blob %s not found' % blob_name) @@ -1962,18 +1962,18 @@ def __setitem__(self, key, value): value = ensure_bytes(value) blob_name = '/'.join([self.prefix, key]) buffer = io.BytesIO(value) - self.client.create_blob_from_stream(self.container_name, blob_name, buffer) + self.client.create_blob_from_stream(self.container, blob_name, buffer) def __delitem__(self, key): - if self.client.exists(self.container_name, '/'.join([self.prefix, key])): - self.client.delete_blob(self.container_name, '/'.join([self.prefix, key])) + if self.client.exists(self.container, '/'.join([self.prefix, key])): + self.client.delete_blob(self.container, '/'.join([self.prefix, key])) else: raise KeyError def __eq__(self, other): return ( isinstance(other, ABSStore) and - self.container_name == other.container_name and + self.container == other.container and self.prefix == other.prefix ) @@ -1981,7 +1981,7 @@ def keys(self): return list(self.__iter__()) def __iter__(self): - for blob in self.client.list_blobs(self.container_name, self.prefix + '/'): + for blob in self.client.list_blobs(self.container, self.prefix + '/'): yield self._strip_prefix_from_path(blob.name, self.prefix) def __len__(self): @@ -1989,7 +1989,7 @@ def __len__(self): def __contains__(self, key): blob_name = '/'.join([self.prefix, key]) - if self.client.exists(self.container_name, blob_name): + if self.client.exists(self.container, blob_name): return True else: return False @@ -2002,7 +2002,7 @@ def listdir(self, path=None): dir_path = dir_path + '/' + store_path dir_path += '/' items = list() - for blob in self.client.list_blobs(self.container_name, prefix=dir_path, delimiter='/'): + for blob in self.client.list_blobs(self.container, prefix=dir_path, delimiter='/'): if '/' in blob.name[len(dir_path):]: items.append(self._strip_prefix_from_path( blob.name[:blob.name.find('/', len(dir_path))], dir_path)) @@ -2012,20 +2012,20 @@ def listdir(self, path=None): def rmdir(self, path=None): dir_path = normalize_storage_path(self._append_path_to_prefix(path, self.prefix)) + '/' - for blob in self.client.list_blobs(self.container_name, prefix=dir_path): - self.client.delete_blob(self.container_name, blob.name) + for blob in self.client.list_blobs(self.container, prefix=dir_path): + self.client.delete_blob(self.container, blob.name) def getsize(self, path=None): store_path = normalize_storage_path(path) fs_path = self.prefix if store_path: fs_path = self._append_path_to_prefix(store_path, self.prefix) - if self.client.exists(self.container_name, fs_path): - return self.client.get_blob_properties(self.container_name, + if self.client.exists(self.container, fs_path): + return self.client.get_blob_properties(self.container, fs_path).properties.content_length else: size = 0 - for blob in 
self.client.list_blobs(self.container_name, prefix=fs_path + '/', + for blob in self.client.list_blobs(self.container, prefix=fs_path + '/', delimiter='/'): if '/' not in blob.name[len(fs_path + '/'):]: size += blob.properties.content_length From 05aab41a7803358df1cbae07a14de7e6a3dc5884 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 23 Jan 2019 10:56:34 +0000 Subject: [PATCH 153/168] removed context manager from ABSStore --- zarr/storage.py | 6 ------ zarr/tests/test_storage.py | 6 ------ 2 files changed, 12 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index a0097f1c35..699844ed4b 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1932,12 +1932,6 @@ def __setstate__(self, state): self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) - def __enter__(self): - return self - - def __exit__(self, *args): - pass - @staticmethod def _append_path_to_prefix(path, prefix): return '/'.join([normalize_storage_path(prefix), diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 7ef8aa8151..74115549b3 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1336,12 +1336,6 @@ def create_store(self): store.rmdir() return store - def test_context_manager(self): - with self.create_store() as store: - store['foo'] = b'bar' - store['baz'] = b'qux' - assert 2 == len(store) - class TestConsolidatedMetadataStore(unittest.TestCase): From 90b5e3a4d9bfab6c6e74da2c71745187d8b4a6ef Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 23 Jan 2019 11:59:24 +0000 Subject: [PATCH 154/168] ABSStore.__delitem__ now takes only 1 azure storage API call --- zarr/storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 699844ed4b..b5cb0bee4f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1959,10 +1959,10 @@ def __setitem__(self, key, value): self.client.create_blob_from_stream(self.container, blob_name, buffer) def __delitem__(self, key): - if self.client.exists(self.container, '/'.join([self.prefix, key])): + try: self.client.delete_blob(self.container, '/'.join([self.prefix, key])) - else: - raise KeyError + except AzureMissingResourceHttpError: + raise KeyError('Blob %s not found' % key) def __eq__(self, other): return ( From 4636d5d650186162042b3f2219e2759b0ddf82eb Mon Sep 17 00:00:00 2001 From: shikharsg Date: Wed, 23 Jan 2019 15:42:12 +0000 Subject: [PATCH 155/168] docs --- docs/api/storage.rst | 2 ++ docs/release.rst | 3 +++ zarr/storage.py | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 24498b0d79..a53a7ce7ba 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -31,6 +31,8 @@ Storage (``zarr.storage``) .. automethod:: invalidate_values .. automethod:: invalidate_keys +.. autoclass:: ABSStore + .. autoclass:: ConsolidatedMetadataStore .. autofunction:: init_array diff --git a/docs/release.rst b/docs/release.rst index f0d5a559ab..fceff051d5 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -9,6 +9,9 @@ Release notes Enhancements ~~~~~~~~~~~~ +* New storage backend, backed by Azure Blob Storage, :class:`zarr.storage.ABSStore`. + Chunks are stored as block blobs. 
+ * Add "consolidated" metadata as an experimental feature: use :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various metadata keys within a dataset hierarchy under a single key, and diff --git a/zarr/storage.py b/zarr/storage.py index b5cb0bee4f..54a042e055 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1897,7 +1897,7 @@ class ABSStore(MutableMapping): account_name : string The Azure blob storage account name. account_key : string - The Azure blob storage account acess key. + The Azure blob storage account access key. blob_service_kwargs : dictionary Extra arguments to be passed into the azure blob client, for e.g. when using the emulator, pass in blob_service_kwargs={'is_emulated': True} From 8c3863fb668f0536a312de0268a59c6b05828930 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 23 Jan 2019 15:43:51 +0000 Subject: [PATCH 156/168] Update zarr/storage.py Co-Authored-By: shikharsg --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 54a042e055..be6657d798 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1904,7 +1904,7 @@ class ABSStore(MutableMapping): Notes ----- - In order to use this store, you must install the Azure Blob Storage + In order to use this store, you must install the Microsoft Azure Storage SDK for Python https://github.com/Azure/azure-storage-python/tree/master/azure-storage-blob_ version >= 1.3.0. """ From b238f0b385c0dee5927b974d70ff421498bd8e39 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Fri, 1 Feb 2019 18:50:09 +0000 Subject: [PATCH 157/168] removed global import of azure storage library --- zarr/storage.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index be6657d798..47926acfa3 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -34,10 +34,6 @@ import io -from azure.storage.blob import BlockBlobService -from azure.common import AzureMissingResourceHttpError - - from zarr.util import (normalize_shape, normalize_chunks, normalize_order, normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) @@ -1910,6 +1906,7 @@ class ABSStore(MutableMapping): def __init__(self, container, prefix, account_name=None, account_key=None, blob_service_kwargs=None): + from azure.storage.blob import BlockBlobService self.container = container self.prefix = normalize_storage_path(prefix) self.account_name = account_name @@ -1928,6 +1925,7 @@ def __getstate__(self): return state def __setstate__(self, state): + from azure.storage.blob import BlockBlobService self.__dict__.update(state) self.client = BlockBlobService(self.account_name, self.account_key, **self.blob_service_kwargs) @@ -1945,6 +1943,7 @@ def _strip_prefix_from_path(path, prefix): return path_norm[(len(prefix_norm)+1):] def __getitem__(self, key): + from azure.common import AzureMissingResourceHttpError blob_name = '/'.join([self.prefix, key]) try: blob = self.client.get_blob_to_bytes(self.container, blob_name) @@ -1959,6 +1958,7 @@ def __setitem__(self, key, value): self.client.create_blob_from_stream(self.container, blob_name, buffer) def __delitem__(self, key): + from azure.common import AzureMissingResourceHttpError try: self.client.delete_blob(self.container, '/'.join([self.prefix, key])) except AzureMissingResourceHttpError: From 9770876272f7930674d5572014d5d7436d0e046f Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 13:06:53 +0000 Subject: [PATCH 158/168] added ABSStore to zarr root import 
--- zarr/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/__init__.py b/zarr/__init__.py index c9046f6bff..9eb46c3f43 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -8,7 +8,7 @@ ones_like, full_like, open_array, open_like, create) from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore, NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, - LRUStoreCache) + LRUStoreCache, ABSStore) from zarr.hierarchy import group, open_group, Group from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.codecs import * From 3ed4814ab42b5a4ee7e900dc650ef1895ca25fe3 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 13:31:55 +0000 Subject: [PATCH 159/168] added ABSStore to tutorial.rst --- docs/tutorial.rst | 13 +++++++++++++ zarr/storage.py | 1 - 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 0fbefc3e2e..008b8aa4ba 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -780,6 +780,19 @@ Here is an example using S3Map to read an array created previously:: >>> z[:].tostring() b'Hello from the cloud!' +Zarr now also has a builtin storage backend for Azure Blob Storage. +The class is :class:`zarr.storage.ABSStore` (requires + `azure-storage-blob `_ +to be installed):: + + >>> # when using a storage account, provide account_name and account_key arguments + >>> # to ABSStore + >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', + blob_service_kwargs={'is_emulated': True}) + >>> root = zarr.group(store=store, overwrite=True) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + >>> z[:] = 42 + Note that retrieving data from a remote service via the network can be significantly slower than retrieving data from a local file system, and will depend on network latency and bandwidth between the client and server systems. If you are experiencing poor diff --git a/zarr/storage.py b/zarr/storage.py index 47926acfa3..29fe3e7fcf 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1901,7 +1901,6 @@ class ABSStore(MutableMapping): Notes ----- In order to use this store, you must install the Microsoft Azure Storage SDK for Python - https://github.com/Azure/azure-storage-python/tree/master/azure-storage-blob_ version >= 1.3.0. 
""" def __init__(self, container, prefix, account_name=None, account_key=None, From 7b08aba1ff6106e24e68d3e9988811c41bf9f5b2 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 13:42:34 +0000 Subject: [PATCH 160/168] fixed docs --- docs/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 008b8aa4ba..b5c5955782 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -788,7 +788,7 @@ to be installed):: >>> # when using a storage account, provide account_name and account_key arguments >>> # to ABSStore >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', - blob_service_kwargs={'is_emulated': True}) + >>> blob_service_kwargs={'is_emulated': True}) >>> root = zarr.group(store=store, overwrite=True) >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') >>> z[:] = 42 From 6fc869de8219753b83150a32446250008fd6da04 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 13:54:45 +0000 Subject: [PATCH 161/168] trying to fix tutorial.rst --- docs/tutorial.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index b5c5955782..98e351907e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -787,8 +787,7 @@ to be installed):: >>> # when using a storage account, provide account_name and account_key arguments >>> # to ABSStore - >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', - >>> blob_service_kwargs={'is_emulated': True}) + >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', blob_service_kwargs={'is_emulated': True}) >>> root = zarr.group(store=store, overwrite=True) >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') >>> z[:] = 42 From e9a402eb58bb988656e8f3937bfe7d82eeeb10e6 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 14:04:05 +0000 Subject: [PATCH 162/168] flake8 fix --- zarr/storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/storage.py b/zarr/storage.py index 29fe3e7fcf..7f52a5223e 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1450,7 +1450,7 @@ def flush(self): if self.flag[0] != 'r': with self.write_mutex: if hasattr(self.db, 'sync'): - self.db.sync() + self.db.sync() else: # fall-back, close and re-open, needed for ndbm flag = self.flag From 8aa3a013cceeea44e64c3e0e248014a9350490e3 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Sat, 2 Feb 2019 14:09:57 +0000 Subject: [PATCH 163/168] fixing tutorial.rst --- docs/tutorial.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 98e351907e..09f6df8386 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -785,8 +785,7 @@ The class is :class:`zarr.storage.ABSStore` (requires `azure-storage-blob `_ to be installed):: - >>> # when using a storage account, provide account_name and account_key arguments - >>> # to ABSStore + >>> # when using a storage account, provide account_name and account_key arguments to ABSStore >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', blob_service_kwargs={'is_emulated': True}) >>> root = zarr.group(store=store, overwrite=True) >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') From a9940a248af2602d27737c54e4a2ca67dfd199a7 Mon Sep 17 00:00:00 2001 From: shikharsg Date: Mon, 4 Feb 2019 14:29:29 +0000 Subject: [PATCH 164/168] fixed ABSStore in tutorial --- docs/tutorial.rst | 6 +++++- 1 file changed, 5 
insertions(+), 1 deletion(-)

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 09f6df8386..502cfa2dd1 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -785,12 +785,16 @@ The class is :class:`zarr.storage.ABSStore` (requires
  `azure-storage-blob `_
 to be installed)::

-    >>> # when using a storage account, provide account_name and account_key arguments to ABSStore
     >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', blob_service_kwargs={'is_emulated': True})
     >>> root = zarr.group(store=store, overwrite=True)
     >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
     >>> z[:] = 42

+When using an actual storage account, provide ``account_name`` and
+``account_key`` arguments to :class:`zarr.storage.ABSStore`; the example
+above is just testing against the emulator. Please also note that this is
+an experimental feature.
+
 Note that retrieving data from a remote service via the network can be significantly
 slower than retrieving data from a local file system, and will depend on network latency
 and bandwidth between the client and server systems. If you are experiencing poor

From 4d5b6d13374c0c05395bece367b122f4f77862bc Mon Sep 17 00:00:00 2001
From: shikharsg
Date: Mon, 4 Feb 2019 14:31:28 +0000
Subject: [PATCH 165/168] docs

---
 docs/release.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/release.rst b/docs/release.rst
index fceff051d5..dd6f02573b 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -9,8 +9,9 @@ Release notes
 Enhancements
 ~~~~~~~~~~~~

-* New storage backend, backed by Azure Blob Storage, :class:`zarr.storage.ABSStore`.
-  Chunks are stored as block blobs.
+* New storage backend, backed by Azure Blob Storage,
+  class :class:`zarr.storage.ABSStore`.
+  All data is stored as Block blobs.

 * Add "consolidated" metadata as an experimental feature: use
   :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various

From 8495469f46bee0da4f55ea2270a9f11f53fcec36 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher
Date: Wed, 6 Feb 2019 13:38:16 +0000
Subject: [PATCH 166/168] Chunkwise iteration over arrays. (#399)

* Chunkwise iteration over arrays.

Closes #398.

* Fixed lint error from new flake8 version.

---
 docs/release.rst        | 11 +++++++----
 zarr/compat.py          |  2 ++
 zarr/core.py            | 12 ++++++++++++
 zarr/storage.py         |  2 +-
 zarr/tests/test_core.py | 36 +++++++++++++++++++++++++++++++++++-
 5 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/docs/release.rst b/docs/release.rst
index f0d5a559ab..d996240afa 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -23,15 +23,18 @@ Enhancements
   SQLite database to be used as the backing store for an array or group.
   By :user:`John Kirkham `, :issue:`368`, :issue:`365`.

+* Efficient iteration over arrays by decompressing chunkwise.
+  By :user:`Jerome Kelleher `, :issue:`398`.
+
 Bug fixes
 ~~~~~~~~~

-* The implementation of the :class:`zarr.storage.DirectoryStore` class has been modified to
-  ensure that writes are atomic and there are no race conditions where a chunk might appear
-  transiently missing during a write operation. By :user:`sbalmer `, :issue:`327`,
   :issue:`263`.
-* The required version of the `numcodecs `_ package has been upgraded +* The required version of the `numcodecs `_ package has been upgraded to 0.6.2, which has enabled some code simplification and fixes a failing test involving msgpack encoding. By :user:`John Kirkham `, :issue:`361`, :issue:`360`, :issue:`352`, :issue:`355`, :issue:`324`. diff --git a/zarr/compat.py b/zarr/compat.py index 117a8edf59..91a75548e6 100644 --- a/zarr/compat.py +++ b/zarr/compat.py @@ -12,6 +12,7 @@ text_type = unicode binary_type = str reduce = reduce + from itertools import izip_longest as zip_longest class PermissionError(Exception): pass @@ -27,6 +28,7 @@ def OrderedDict_move_to_end(od, key): text_type = str binary_type = bytes from functools import reduce + from itertools import zip_longest PermissionError = PermissionError def OrderedDict_move_to_end(od, key): diff --git a/zarr/core.py b/zarr/core.py index cc04953a7c..0838117b89 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -424,6 +424,18 @@ def __array__(self, *args): a = a.astype(args[0]) return a + def __iter__(self): + if len(self.shape) == 0: + # Same error as numpy + raise TypeError("iteration over a 0-d array") + # Avoid repeatedly decompressing chunks by iterating over the chunks + # in the first dimension. + chunk_size = self.chunks[0] + for j in range(self.shape[0]): + if j % chunk_size == 0: + chunk = self[j: j + chunk_size] + yield chunk[j % chunk_size] + def __len__(self): if self.shape: return self.shape[0] diff --git a/zarr/storage.py b/zarr/storage.py index 2a07d9aa38..75e4d7d04d 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1449,7 +1449,7 @@ def flush(self): if self.flag[0] != 'r': with self.write_mutex: if hasattr(self.db, 'sync'): - self.db.sync() + self.db.sync() else: # fall-back, close and re-open, needed for ndbm flag = self.flag diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index b2b6bb011e..1c7d526c0c 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -19,7 +19,7 @@ LRUStoreCache) from zarr.core import Array from zarr.errors import PermissionError -from zarr.compat import PY2, text_type, binary_type +from zarr.compat import PY2, text_type, binary_type, zip_longest from zarr.util import buffer_size from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle, Categorize, JSON, VLenUTF8, VLenBytes, VLenArray) @@ -1155,6 +1155,40 @@ def test_object_codec_warnings(self): # provide object_codec, but not object dtype self.create_array(shape=10, chunks=5, dtype='i4', object_codec=JSON()) + def test_zero_d_iter(self): + a = np.array(1, dtype=int) + z = self.create_array(shape=a.shape, dtype=int) + z[...] 
= a + with pytest.raises(TypeError): + # noinspection PyStatementEffect + list(a) + with pytest.raises(TypeError): + # noinspection PyStatementEffect + list(z) + + def test_iter(self): + params = ( + ((1,), (1,)), + ((2,), (1,)), + ((1,), (2,)), + ((3,), (3,)), + ((1000,), (100,)), + ((100,), (1000,)), + ((1, 100), (1, 1)), + ((1, 0), (1, 1)), + ((0, 1), (1, 1)), + ((0, 1), (2, 1)), + ((100, 1), (3, 1)), + ((100, 100), (10, 10)), + ((10, 10, 10), (3, 3, 3)), + ) + for shape, chunks in params: + z = self.create_array(shape=shape, chunks=chunks, dtype=int) + a = np.arange(np.product(shape)).reshape(shape) + z[:] = a + for expect, actual in zip_longest(a, z): + assert_array_equal(expect, actual) + class TestArrayWithPath(TestArray): From be1c606e33666a1ef48b8567cc20e65cc7228ef9 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 6 Feb 2019 15:23:48 +0000 Subject: [PATCH 167/168] add missing PR link --- docs/release.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index d996240afa..9493f273f8 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -24,7 +24,7 @@ Enhancements By :user:`John Kirkham `, :issue:`368`, :issue:`365`. * Efficient iteration over arrays by decompressing chunkwise. - By :user:`Jerome Kelleher `, :issue:`398`. + By :user:`Jerome Kelleher `, :issue:`398`, :issue:`399`. Bug fixes ~~~~~~~~~ From b76b2824d85b4ac1d24688a997d4ac66d6f5bcbe Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Fri, 8 Feb 2019 06:56:58 -0800 Subject: [PATCH 168/168] MongoDB and Redis stores (#372) * add mongodb and redis stores still needs docs and some CI setup * top level doc strings * fix host kwarg * pickle support * different way of launching dbs on travis * back to default travis configs * fixes to mapping classes for both redis and mongodb stores * default redis port * pep8 * decode for py2? * no decode for py2 * address comments * cast to binary type in mongo getitem * more doc strings * more docs * split release note into two bullets * whitespace fix in .travis.yml * lint after merge * pin mongo/redis versions and a few doc changes * use redis client.delete and check for deleted keys * fix typo in requirements * Update docs/release.rst Co-Authored-By: jhamman * Update docs/release.rst Co-Authored-By: jhamman * skip redis/mongodb tests when unable to connect * fix pep8 --- .travis.yml | 7 ++ docs/api/storage.rst | 2 + docs/release.rst | 8 ++ docs/tutorial.rst | 7 ++ requirements_dev_optional.txt | 2 + zarr/__init__.py | 2 +- zarr/storage.py | 184 +++++++++++++++++++++++++++++++++- zarr/tests/test_storage.py | 50 ++++++++- 8 files changed, 258 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8a5e1fe521..5ecf462419 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ addons: packages: - libdb-dev +services: + - redis-server + - mongodb + matrix: include: - python: 2.7 @@ -20,6 +24,9 @@ matrix: dist: xenial sudo: true +before_script: + - mongo mydb_test --eval 'db.createUser({user:"travis",pwd:"test",roles:["readWrite"]});' + install: - pip install -U pip setuptools wheel tox-travis coveralls diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 24498b0d79..9abe240379 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -25,6 +25,8 @@ Storage (``zarr.storage``) .. automethod:: close +.. autoclass:: MongoDBStore +.. autoclass:: RedisStore .. autoclass:: LRUStoreCache .. 
diff --git a/docs/release.rst b/docs/release.rst
index 9493f273f8..65bd94c45f 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -26,6 +26,14 @@ Enhancements
 * Efficient iteration over arrays by decompressing chunkwise.
   By :user:`Jerome Kelleher `, :issue:`398`, :issue:`399`.

+* Adds the Redis-backed :class:`zarr.storage.RedisStore` class, enabling a
+  Redis database to be used as the backing store for an array or group.
+  By :user:`Joe Hamman `, :issue:`299`, :issue:`372`.
+
+* Adds the MongoDB-backed :class:`zarr.storage.MongoDBStore` class, enabling a
+  MongoDB database to be used as the backing store for an array or group.
+  By :user:`Joe Hamman `, :issue:`299`, :issue:`372`.
+
 Bug fixes
 ~~~~~~~~~

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 0fbefc3e2e..3e8e9bac66 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -739,6 +739,13 @@ Python is built with SQLite support)::
     >>> z[:] = 42
     >>> store.close()

+Also added in Zarr version 2.3 are two storage classes for interfacing with client-server
+databases. The :class:`zarr.storage.RedisStore` class interfaces with `Redis `_
+(an in-memory data structure store), and the :class:`zarr.storage.MongoDBStore` class interfaces
+with `MongoDB `_ (an object-oriented NoSQL database). These stores
+respectively require the `redis `_ and
+`pymongo `_ packages to be installed.
+
 Distributed/cloud storage
 ~~~~~~~~~~~~~~~~~~~~~~~~~
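Both new stores behave like any other ``MutableMapping``-based store, so they can be passed wherever the tutorial passes a store object. A short sketch, assuming servers are reachable on their default local ports (the database name below is an arbitrary placeholder)::

    import zarr

    # Redis assumed to be running at localhost:6379
    store = zarr.RedisStore(port=6379)
    root = zarr.group(store=store, overwrite=True)
    z = root.zeros('data', shape=(1000, 1000), chunks=(100, 100), dtype='i4')
    z[:] = 42

    # MongoDB assumed to be running at localhost:27017
    store = zarr.MongoDBStore(database='zarr_example')
    root = zarr.group(store=store, overwrite=True)
    store.close()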
diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt
index 1ea71451d9..2e4feddce0 100644
--- a/requirements_dev_optional.txt
+++ b/requirements_dev_optional.txt
@@ -1,3 +1,5 @@
 # These packages are currently not available on Windows.
 bsddb3==6.2.6
 lmdb==0.94
+redis==3.0.1
+pymongo==3.7.1
\ No newline at end of file

diff --git a/zarr/__init__.py b/zarr/__init__.py
index c9046f6bff..e208b8ae82 100644
--- a/zarr/__init__.py
+++ b/zarr/__init__.py
@@ -8,7 +8,7 @@
                        ones_like, full_like, open_array, open_like, create)
 from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore,
                           NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore,
-                          LRUStoreCache)
+                          LRUStoreCache, RedisStore, MongoDBStore)
 from zarr.hierarchy import group, open_group, Group
 from zarr.sync import ThreadSynchronizer, ProcessSynchronizer
 from zarr.codecs import *

diff --git a/zarr/storage.py b/zarr/storage.py
index 75e4d7d04d..d71ee3a18a 100644
--- a/zarr/storage.py
+++ b/zarr/storage.py
@@ -37,7 +37,7 @@
                        normalize_storage_path, buffer_size, normalize_fill_value, nolock,
                        normalize_dtype)
 from zarr.meta import encode_array_metadata, encode_group_metadata
-from zarr.compat import PY2, OrderedDict_move_to_end
+from zarr.compat import PY2, OrderedDict_move_to_end, binary_type
 from numcodecs.registry import codec_registry
 from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray
 from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor,
@@ -2084,6 +2084,188 @@ def clear(self):
         )


+class MongoDBStore(MutableMapping):
+    """Storage class using MongoDB.
+
+    .. note:: This is an experimental feature.
+
+    Requires the `pymongo `_
+    package to be installed.
+
+    Parameters
+    ----------
+    database : string
+        Name of database
+    collection : string
+        Name of collection
+    **kwargs
+        Keyword arguments passed through to the `pymongo.MongoClient` function.
+
+    Examples
+    --------
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.MongoDBStore(host='localhost')
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+        >>> store.close()
+
+    Store a group::
+
+        >>> store = zarr.MongoDBStore(host='localhost')
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+        >>> store.close()
+
+    Notes
+    -----
+    MongoDB documents are limited to 16 MB, which caps the size of any
+    single chunk stored through this class.
+
+    """
+
+    _key = 'key'
+    _value = 'value'
+
+    def __init__(self, database='mongodb_zarr', collection='zarr_collection',
+                 **kwargs):
+        import pymongo
+
+        self._database = database
+        self._collection = collection
+        self._kwargs = kwargs
+
+        self.client = pymongo.MongoClient(**self._kwargs)
+        self.db = self.client.get_database(self._database)
+        self.collection = self.db.get_collection(self._collection)
+
+    def __getitem__(self, key):
+        doc = self.collection.find_one({self._key: key})
+
+        if doc is None:
+            raise KeyError(key)
+        else:
+            return binary_type(doc[self._value])
+
+    def __setitem__(self, key, value):
+        value = ensure_bytes(value)
+        self.collection.replace_one({self._key: key},
+                                    {self._key: key, self._value: value},
+                                    upsert=True)
+
+    def __delitem__(self, key):
+        result = self.collection.delete_many({self._key: key})
+        if not result.deleted_count == 1:
+            raise KeyError(key)
+
+    def __iter__(self):
+        for f in self.collection.find({}):
+            yield f[self._key]
+
+    def __len__(self):
+        return self.collection.count_documents({})
+
+    def __getstate__(self):
+        return self._database, self._collection, self._kwargs
+
+    def __setstate__(self, state):
+        database, collection, kwargs = state
+        self.__init__(database=database, collection=collection, **kwargs)
+
+    def close(self):
+        """Cleanup client resources and disconnect from MongoDB."""
+        self.client.close()
+
+    def clear(self):
+        """Remove all items from store."""
+        self.collection.delete_many({})
+
+
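Note that ``__getstate__``/``__setstate__`` above serialize only the connection parameters, not the live ``MongoClient``, so unpickling reconnects from scratch; this is what allows a store to be shipped to worker processes. A sketch of the round trip (the database name is a placeholder)::

    import pickle
    import zarr

    store = zarr.MongoDBStore(database='zarr_example')
    # Pickling captures (database, collection, kwargs); unpickling calls
    # __init__ again, which opens a fresh client connection.
    store2 = pickle.loads(pickle.dumps(store))
    assert store2._database == store._database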
+class RedisStore(MutableMapping):
+    """Storage class using Redis.
+
+    .. note:: This is an experimental feature.
+
+    Requires the `redis `_
+    package to be installed.
+
+    Parameters
+    ----------
+    prefix : string
+        Name of prefix for Redis keys
+    **kwargs
+        Keyword arguments passed through to the `redis.Redis` function.
+
+    Examples
+    --------
+    Store a single array::
+
+        >>> import zarr
+        >>> store = zarr.RedisStore(port=6379)
+        >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True)
+        >>> z[...] = 42
+
+    Store a group::
+
+        >>> store = zarr.RedisStore(port=6379)
+        >>> root = zarr.group(store=store, overwrite=True)
+        >>> foo = root.create_group('foo')
+        >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5))
+        >>> bar[...] = 42
+
+    """
+    def __init__(self, prefix='zarr', **kwargs):
+        import redis
+        self._prefix = prefix
+        self._kwargs = kwargs
+
+        self.client = redis.Redis(**kwargs)
+
+    def _key(self, key):
+        return '{prefix}:{key}'.format(prefix=self._prefix, key=key)
+
+    def __getitem__(self, key):
+        return self.client[self._key(key)]
+
+    def __setitem__(self, key, value):
+        value = ensure_bytes(value)
+        self.client[self._key(key)] = value
+
+    def __delitem__(self, key):
+        count = self.client.delete(self._key(key))
+        if not count:
+            raise KeyError(key)
+
+    def keylist(self):
+        offset = len(self._key(''))  # length of prefix
+        return [key[offset:].decode('utf-8')
+                for key in self.client.keys(self._key('*'))]
+
+    def keys(self):
+        for key in self.keylist():
+            yield key
+
+    def __iter__(self):
+        for key in self.keys():
+            yield key
+
+    def __len__(self):
+        return len(self.keylist())
+
+    def __getstate__(self):
+        return self._prefix, self._kwargs
+
+    def __setstate__(self, state):
+        prefix, kwargs = state
+        self.__init__(prefix=prefix, **kwargs)
+
+    def clear(self):
+        for key in self.keys():
+            del self[key]
+
+
 class ConsolidatedMetadataStore(MutableMapping):
     """A layer over other storage, where the metadata has been consolidated
     into a single key.

diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py
index ab3ea5c26d..87273d140c 100644
--- a/zarr/tests/test_storage.py
+++ b/zarr/tests/test_storage.py
@@ -20,8 +20,8 @@
                           DirectoryStore, ZipStore, init_group, group_meta_key,
                           getsize, migrate_1to2, TempStore, atexit_rmtree,
                           NestedDirectoryStore, default_compressor, DBMStore,
-                          LMDBStore, SQLiteStore, atexit_rmglob, LRUStoreCache,
-                          ConsolidatedMetadataStore)
+                          LMDBStore, SQLiteStore, MongoDBStore, RedisStore,
+                          atexit_rmglob, LRUStoreCache, ConsolidatedMetadataStore)
 from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT,
                        decode_group_metadata, encode_group_metadata)
 from zarr.compat import PY2
@@ -900,6 +900,29 @@ def test_context_manager(self):
 except ImportError:  # pragma: no cover
     sqlite3 = None

+try:
+    import pymongo
+    from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
+    try:
+        client = pymongo.MongoClient(host='127.0.0.1',
+                                     serverSelectionTimeoutMS=1e3)
+        client.server_info()
+    except (ConnectionFailure, ServerSelectionTimeoutError):  # pragma: no cover
+        pymongo = None
+except ImportError:  # pragma: no cover
+    pymongo = None
+
+try:
+    import redis
+    from redis import ConnectionError
+    try:
+        rs = redis.Redis("localhost", port=6379)
+        rs.ping()
+    except ConnectionError:  # pragma: no cover
+        redis = None
+except ImportError:  # pragma: no cover
+    redis = None
+

 @unittest.skipIf(sqlite3 is None, 'python built without sqlite')
 class TestSQLiteStore(StoreTests, unittest.TestCase):
@@ -930,6 +953,29 @@ def test_pickle(self):
         pickle.dumps(store)


+@unittest.skipIf(pymongo is None, 'test requires pymongo')
+class TestMongoDBStore(StoreTests, unittest.TestCase):
+
+    def create_store(self):
+        store = MongoDBStore(host='127.0.0.1', database='zarr_tests',
+                             collection='zarr_tests')
+        # start with an empty store
+        store.clear()
+        return store
+
+
+@unittest.skipIf(redis is None, 'test requires redis')
+class TestRedisStore(StoreTests, unittest.TestCase):
+
+    def create_store(self):
+        # TODO: this is the default host for Redis on Travis,
+        # we probably want to generalize this though
+        store = RedisStore(host='localhost', port=6379)
+        # start with an empty store
+        store.clear()
+        return store
+
+
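For reference, ``RedisStore`` namespaces every zarr key under its prefix, which is why the tests above can safely ``clear()`` a shared server. A sketch of how keys land in Redis (assuming the default ``'zarr'`` prefix and a local server)::

    import zarr

    store = zarr.RedisStore(port=6379)
    root = zarr.group(store=store, overwrite=True)
    a = root.zeros('arr', shape=(10,), chunks=(5,))
    a[:] = 1

    # Redis now holds 'zarr:.zgroup', 'zarr:arr/.zarray', 'zarr:arr/0'
    # and 'zarr:arr/1'; keylist() strips the prefix back off:
    print(sorted(store.keylist()))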
 class TestLRUStoreCache(StoreTests, unittest.TestCase):

     def create_store(self):