diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e31c477477..88234e69e8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,15 +2,9 @@ TODO: * [ ] Add unit tests and/or doctests in docstrings -* [ ] Unit tests and doctests pass locally under Python 3.6 (e.g., run ``tox -e py36`` or - ``pytest -v --doctest-modules zarr``) -* [ ] Unit tests pass locally under Python 2.7 (e.g., run ``tox -e py27`` or - ``pytest -v zarr``) -* [ ] PEP8 checks pass (e.g., run ``tox -e py36`` or ``flake8 --max-line-length=100 zarr``) * [ ] Add docstrings and API docs for any new/modified user-facing classes and functions * [ ] New/modified features documented in docs/tutorial.rst -* [ ] Doctests in tutorial pass (e.g., run ``tox -e py36`` or ``python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst``) * [ ] Changes documented in docs/release.rst * [ ] Docs build locally (e.g., run ``tox -e docs``) * [ ] AppVeyor and Travis CI passes -* [ ] Test coverage to 100% (Coveralls passes) +* [ ] Test coverage is 100% (Coveralls passes) diff --git a/.pyup.yml b/.pyup.yml new file mode 100644 index 0000000000..0c85ee8e03 --- /dev/null +++ b/.pyup.yml @@ -0,0 +1,24 @@ +# pyup.io config file +# see https://pyup.io/docs/configuration/ for all available options + +schedule: every month + +requirements: + - requirements.txt: + pin: False + update: False + - requirements_test.txt: + pin: False + update: False + - requirements_rtfd.txt: + pin: False + update: False + - requirements_dev.txt: + pin: True + update: all + - requirements_dev_npy.txt: + pin: True + update: all + - requirements_dev_optional.txt: + pin: True + update: all diff --git a/.travis.yml b/.travis.yml index d126fb755a..2c73212d0e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,15 +11,29 @@ addons: packages: - libdb-dev -python: - - 2.7 - - 3.4 - - 3.5 - - 3.6 +services: + - docker + - redis-server + - mongodb + +matrix: + include: + - python: 2.7 + - python: 3.5 + - python: 3.6 + - python: 3.7 + dist: xenial + sudo: true + +before_install: + - docker pull arafato/azurite + - mkdir ~/blob_emulator + - docker run -e executable=blob -d -t -p 10000:10000 -v ~/blob_emulator:/opt/azurite/folder arafato/azurite +before_script: + - mongo mydb_test --eval 'db.createUser({user:"travis",pwd:"test",roles:["readWrite"]});' install: - - pip install -U pip setuptools wheel - - pip install -U tox-travis coveralls + - pip install -U pip setuptools wheel tox-travis coveralls script: - tox diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..93175dd661 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at zarr.conduct@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/appveyor.yml b/appveyor.yml index ef94c37a54..d04417d671 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,6 +2,10 @@ branches: only: - master +# the VS C++ compiler path, doesn't seem to exist in the PATH environment variable of +# the Visual Studio 2017 build VM, due to which the pyosreplace package fails to build +image: Visual Studio 2015 + environment: global: @@ -9,60 +13,39 @@ environment: # /E:ON and /V:ON options are not enabled in the batch script intepreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build.cmd" + EMULATOR_LOC: C:\\Program Files (x86)\\Microsoft SDKs\\Azure\\Storage Emulator\\AzureStorageEmulator.exe matrix: - - PYTHON: "C:\\Python27" - PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.13.3" - - PYTHON: "C:\\Python27-x64" PYTHON_VERSION: "2.7" - NUMPY_VERSION: "1.13.3" - DISTUTILS_USE_SDK: "1" - - - PYTHON: "C:\\Python34" - NUMPY_VERSION: "1.13.3" - PYTHON_VERSION: "3.4" - - - PYTHON: "C:\\Python34-x64" - PYTHON_VERSION: "3.4" - NUMPY_VERSION: "1.13.3" DISTUTILS_USE_SDK: "1" - - PYTHON: "C:\\Python35" - PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.13.3" - - PYTHON: "C:\\Python35-x64" PYTHON_VERSION: "3.5" - NUMPY_VERSION: "1.13.3" - - - PYTHON: "C:\\Python36" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.13.3" - PYTHON: "C:\\Python36-x64" PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.13.3" - - PYTHON: "C:\\Python36" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.14.0" - - - PYTHON: "C:\\Python36-x64" - PYTHON_VERSION: "3.6" - NUMPY_VERSION: "1.14.0" + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7" install: - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" + - "%CMD_IN_ENV% python -m pip install -rrequirements_test.txt" + - "%CMD_IN_ENV% python -m pip install -rrequirements_dev_npy.txt" + - "%CMD_IN_ENV% python -m pip install --no-binary=numcodecs -rrequirements_dev.txt" + - "%CMD_IN_ENV% python setup.py install" + - "%CMD_IN_ENV% python -m pip freeze" build: off +before_test: + - '"%EMULATOR_LOC%" start' + test_script: - - "%CMD_IN_ENV% python -m pip install -U pip setuptools wheel" - - "%CMD_IN_ENV% python -m pip install numpy==%NUMPY_VERSION%" - - "%CMD_IN_ENV% python -m pip install -rrequirements_dev.txt" - - "%CMD_IN_ENV% python setup.py install" - "%CMD_IN_ENV% python -m pytest -v --pyargs zarr" - + +after_test: + - '"%EMULATOR_LOC%" stop' diff --git a/docs/api/convenience.rst b/docs/api/convenience.rst index 51997a4dc2..a70a90ce7c 100644 --- a/docs/api/convenience.rst +++ b/docs/api/convenience.rst @@ -10,3 +10,5 @@ Convenience functions (``zarr.convenience``) .. autofunction:: copy_all .. autofunction:: copy_store .. autofunction:: tree +.. autofunction:: consolidate_metadata +.. autofunction:: open_consolidated diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 2365359fa9..85d85f40aa 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -21,12 +21,22 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush +.. autoclass:: SQLiteStore + + .. automethod:: close + +.. autoclass:: MongoDBStore +.. autoclass:: RedisStore .. autoclass:: LRUStoreCache .. automethod:: invalidate .. automethod:: invalidate_values .. automethod:: invalidate_keys +.. 
autoclass:: ABSStore + +.. autoclass:: ConsolidatedMetadataStore + .. autofunction:: init_array .. autofunction:: init_group .. autofunction:: contains_array diff --git a/docs/release.rst b/docs/release.rst index fdcd3cb0e2..cfd139f6b2 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -1,6 +1,77 @@ Release notes ============= +.. _release_2.3.0: + +2.3.0 (Work in Progress) +------------------------ + +Enhancements +~~~~~~~~~~~~ + +* New storage backend, backed by Azure Blob Storage, + class :class:`zarr.storage.ABSStore`. + All data is stored as Block blobs. + +* Add "consolidated" metadata as an experimental feature: use + :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various + metadata keys within a dataset hierarchy under a single key, and + :func:`zarr.convenience.open_consolidated` to use this single key. This can greatly + cut down the number of calls to the storage backend, and so remove a lot of overhead + for reading remote data. By :user:`Martin Durant `, :issue:`268`. + +* Support has been added for structured arrays with sub-array shape and/or nested fields. By + :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. + +* Adds the SQLite-backed :class:`zarr.storage.SQLiteStore` class enabling an + SQLite database to be used as the backing store for an array or group. + By :user:`John Kirkham `, :issue:`368`, :issue:`365`. + +* Efficient iteration over arrays by decompressing chunkwise. + By :user:`Jerome Kelleher `, :issue:`398`, :issue:`399`. + +* Adds the Redis-backed :class:`zarr.storage.RedisStore` class enabling a + Redis database to be used as the backing store for an array or group. + By :user:`Joe Hamman `, :issue:`299`, :issue:`372`. + +* Adds the MongoDB-backed :class:`zarr.storage.MongoDBStore` class enabling a + MongoDB database to be used as the backing store for an array or group. + By :user:`Joe Hamman `, :issue:`299`, :issue:`372`. + +Bug fixes +~~~~~~~~~ + +* The implementation of the :class:`zarr.storage.DirectoryStore` class has been modified to + ensure that writes are atomic and there are no race conditions where a chunk might appear + transiently missing during a write operation. By :user:`sbalmer `, :issue:`327`, + :issue:`263`. + +* The required version of the `numcodecs `_ package has been upgraded + to 0.6.2, which has enabled some code simplification and fixes a failing test involving + msgpack encoding. By :user:`John Kirkham `, :issue:`361`, :issue:`360`, :issue:`352`, + :issue:`355`, :issue:`324`. + +* Failing tests related to pickling/unpickling have been fixed. By :user:`Ryan Williams `, + :issue:`273`, :issue:`308`. + +* Ensure ``DictStore`` contains only ``bytes`` to facilitate comparisons and protect against writes. + By :user:`John Kirkham `, :issue:`350` + +* Always use a ``tuple`` when indexing a NumPy ``ndarray``. + By :user:`John Kirkham `, :issue:`376` + +Maintenance +~~~~~~~~~~~ + +* CI and test environments have been upgraded to include Python 3.7, drop Python 3.4, and + upgrade all pinned package requirements. :issue:`308`. + +* Corrects handling of ``NaT`` in ``datetime64`` and ``timedelta64`` in various + compressors (by :user:`John Kirkham `; :issue:`344`). + +Acknowledgments +~~~~~~~~~~~~~~~ + .. _release_2.2.0: 2.2.0 diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst index 2a3bbd9a54..fa47e38e8d 100644 --- a/docs/spec/v2.rst +++ b/docs/spec/v2.rst @@ -140,13 +140,31 @@ the `NumPy documentation on Datetimes and Timedeltas `_. For example, ``"`_. 
For -example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]`` defines a -data type composed of three single-byte unsigned integers labelled "r", "g" and -"b". - +Structured data types (i.e., with multiple named fields) are encoded +as a list of lists, following `NumPy array protocol type descriptions +(descr) +`_. Each +sub-list has the form ``[fieldname, datatype, shape]`` where ``shape`` +is optional. ``fieldname`` is a string, ``datatype`` is a string +specifying a simple data type (see above), and ``shape`` is a list of +integers specifying subarray shape. For example, the JSON list below +defines a data type composed of three single-byte unsigned integer +fields named "r", "g" and "b":: + + [["r", "|u1"], ["g", "|u1"], ["b", "|u1"]] + +For example, the JSON list below defines a data type composed of three +fields named "x", "y" and "z", where "x" and "y" each contain 32-bit +floats, and each item in "z" is a 2 by 2 array of floats:: + + [["x", "`_ to be installed):: >>> z[:] = 42 >>> store.close() +Zarr version 2.3 adds the :class:`zarr.storage.SQLiteStore` class, which +enables an SQLite database to be used as the backing store for an array or group (requires +Python to be built with SQLite support):: + + >>> store = zarr.SQLiteStore('data/example.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + >>> z[:] = 42 + >>> store.close() + +Also added in Zarr version 2.3 are two storage classes for interfacing with server-client +databases. The :class:`zarr.storage.RedisStore` class interfaces with `Redis `_ +(an in-memory data structure store), and the :class:`zarr.storage.MongoDBStore` class interfaces +with `MongoDB `_ (an object-oriented NoSQL database). These stores +respectively require the `redis `_ and +`pymongo `_ packages to be installed. + Distributed/cloud storage ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -770,6 +787,21 @@ Here is an example using S3Map to read an array created previously:: >>> z[:].tostring() b'Hello from the cloud!' +Zarr now also has a built-in storage backend for Azure Blob Storage. +The class is :class:`zarr.storage.ABSStore` (requires + `azure-storage-blob `_ +to be installed):: + + >>> store = zarr.ABSStore(container='test', prefix='zarr-testing', blob_service_kwargs={'is_emulated': True}) + >>> root = zarr.group(store=store, overwrite=True) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + >>> z[:] = 42 + +When using an actual storage account, provide ``account_name`` and +``account_key`` arguments to :class:`zarr.storage.ABSStore`; the +example above is just testing against the emulator. Please also note +that this is an experimental feature. + Note that retrieving data from a remote service via the network can be significantly slower than retrieving data from a local file system, and will depend on network latency and bandwidth between the client and server systems. If you are experiencing poor @@ -778,9 +810,11 @@ chunk size, which will reduce the number of chunks and thus reduce the number of round-trips required to retrieve data for an array (and thus reduce the impact of network latency). Another option is to try to increase the compression ratio by changing compression options or trying a different compressor (which will reduce the impact of -limited network bandwidth). As of version 2.2, Zarr also provides the -:class:`zarr.storage.LRUStoreCache` which can be used to implement a local in-memory cache -layer over a remote store.
E.g.:: +limited network bandwidth). + +As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache` +which can be used to implement a local in-memory cache layer over a remote +store. E.g.:: >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) @@ -797,13 +831,51 @@ layer over a remote store. E.g.:: b'Hello from the cloud!' 0.0009490990014455747 -If you are still experiencing poor performance with distributed/cloud storage, please -raise an issue on the GitHub issue tracker with any profiling data you can provide, as -there may be opportunities to optimise further either within Zarr or within the mapping -interface to the storage. +If you are still experiencing poor performance with distributed/cloud storage, +please raise an issue on the GitHub issue tracker with any profiling data you +can provide, as there may be opportunities to optimise further either within +Zarr or within the mapping interface to the storage. .. _tutorial_copy: +Consolidating metadata +~~~~~~~~~~~~~~~~~~~~~~ + +(This is an experimental feature.) + +Since there is a significant overhead for every connection to a cloud object +store such as S3, the pattern described in the previous section may incur +significant latency while scanning the metadata of the dataset hierarchy, even +though each individual metadata object is small. For cases such as these, once +the data are static and can be regarded as read-only, at least for the +metadata/structure of the dataset hierarchy, the many metadata objects can be +consolidated into a single one via +:func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase +the speed of reading the dataset metadata, e.g.:: + + >>> zarr.consolidate_metadata(store) # doctest: +SKIP + +This creates a special key with a copy of all of the metadata from all of the +metadata objects in the store. + +Later, to open a Zarr store with consolidated metadata, use +:func:`zarr.convenience.open_consolidated`, e.g.:: + + >>> root = zarr.open_consolidated(store) # doctest: +SKIP + +This uses the special key to read all of the metadata in a single call to the +backend storage. + +Note that the hierarchy could still be opened in the normal way and altered, +causing the consolidated metadata to become out of sync with the real state of +the dataset hierarchy. In this case, +:func:`zarr.convenience.consolidate_metadata` would need to be called again. + +To protect against consolidated metadata accidentally getting out of sync, the +root group returned by :func:`zarr.convenience.open_consolidated` is read-only +for the metadata, meaning that no new groups or arrays can be created, and +arrays cannot be resized. However, data values within arrays can still be updated. + Copying/migrating data ---------------------- @@ -1331,7 +1403,7 @@ internal threads.
The number of Blosc threads can be changed to increase or decrease this number, e.g.:: >>> from zarr import blosc - >>> blosc.set_nthreads(2) + >>> blosc.set_nthreads(2) # doctest: +SKIP 8 When a Zarr array is being used within a multi-threaded program, Zarr diff --git a/requirements.txt b/requirements.txt index 8720210cf5..e035a2fc72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ asciitree -pytest -numpy fasteners numcodecs +numpy +pytest +pyosreplace; python_version < '3.3' and sys.platform == 'win32' diff --git a/requirements_dev.txt b/requirements_dev.txt index 95e4a556b4..957edac387 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,38 +1,4 @@ -appdirs==1.4.3 -args==0.1.0 asciitree==0.3.3 -certifi==2017.7.27.1 -chardet==3.0.4 -clint==0.5.1 -coverage==4.4.1 -coveralls==1.2.0 -Cython==0.27.2 -docopt==0.6.2 fasteners==0.14.1 -flake8==3.5.0 -h5py==2.7.1 -idna==2.6 -mccabe==0.6.1 -monotonic==1.3 -msgpack-python==0.4.8 -numcodecs==0.5.4 -packaging==16.8 -pkginfo==1.4.1 -pluggy==0.5.2 -py==1.4.34 -py-cpuinfo==3.3.0 -pycodestyle==2.3.1 -pyflakes==1.6.0 -pyparsing==2.2.0 -pytest==3.2.3 -pytest-cov==2.5.1 -requests==2.18.4 -requests-toolbelt==0.8.0 -setuptools-scm==1.15.6 -s3fs==0.1.2 -tox==2.9.1 -tox-travis==0.8 -tqdm==4.19.4 -twine==1.9.1 -urllib3==1.22 -virtualenv==15.1.0 +numcodecs==0.6.2 +pyosreplace==0.1; python_version < '3.3' and sys.platform == 'win32' diff --git a/requirements_dev_npy.txt b/requirements_dev_npy.txt new file mode 100644 index 0000000000..78b80223e2 --- /dev/null +++ b/requirements_dev_npy.txt @@ -0,0 +1,4 @@ +# Break this out into a separate file to allow testing against +# different versions of numpy. This file should pin to the latest +# numpy version. +numpy==1.15.4 diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt index ad6f7064c6..2e4feddce0 100644 --- a/requirements_dev_optional.txt +++ b/requirements_dev_optional.txt @@ -1,2 +1,5 @@ -bsddb3==6.2.5 -lmdb==0.93 +# These packages are currently not available on Windows. 
+bsddb3==6.2.6 +lmdb==0.94 +redis==3.0.1 +pymongo==3.7.1 \ No newline at end of file diff --git a/requirements_test.txt b/requirements_test.txt new file mode 100644 index 0000000000..1492a673c1 --- /dev/null +++ b/requirements_test.txt @@ -0,0 +1,12 @@ +coverage +coveralls +cython +flake8 +h5py +msgpack-python +pytest +pytest-cov +s3fs +setuptools-scm +tox +azure-storage-blob diff --git a/setup.py b/setup.py index ae89a66ceb..69f434756f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division from setuptools import setup +import sys DESCRIPTION = 'An implementation of chunked, compressed, ' \ @@ -9,6 +10,16 @@ with open('README.rst') as f: LONG_DESCRIPTION = f.read() +dependencies = [ + 'asciitree', + 'numpy>=1.7', + 'fasteners', + 'numcodecs>=0.6.2', +] + +if sys.version_info < (3, 3) and sys.platform == "win32": + dependencies.append('pyosreplace') + setup( name='zarr', description=DESCRIPTION, @@ -22,12 +33,7 @@ 'setuptools>18.0', 'setuptools-scm>1.5.4' ], - install_requires=[ - 'asciitree', - 'numpy>=1.7', - 'fasteners', - 'numcodecs>=0.5.3', - ], + install_requires=dependencies, package_dir={'': '.'}, packages=['zarr', 'zarr.tests'], classifiers=[ @@ -42,9 +48,9 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], maintainer='Alistair Miles', maintainer_email='alimanfoo@googlemail.com', diff --git a/tox.ini b/tox.ini index 21e869df54..29303876a6 100644 --- a/tox.ini +++ b/tox.ini @@ -4,27 +4,40 @@ # and then run "tox" from this directory. 
[tox] -envlist = py27, py34, py35, py36-npy{113,114}, docs +# N.B., test different versions of numpy under py36 rather than py37 +# because wheels for npy113 are not available for py37 +envlist = py27, py35, py36-npy{113,114,115}, py37, docs [testenv] +install_command = pip install --no-binary=numcodecs {opts} {packages} setenv = PYTHONHASHSEED = 42 # hooks for coverage exclusions based on Python major version - py34,py35,py36: PY_MAJOR_VERSION = py3 + py35,py36,py37: PY_MAJOR_VERSION = py3 py27: PY_MAJOR_VERSION = py2 commands = + # clear out any data files generated during tests python -c 'import glob; import shutil; import os; [(shutil.rmtree(d) if os.path.isdir(d) else os.remove(d) if os.path.isfile(d) else None) for d in glob.glob("./example*")]' - py27,py34,py35: pytest -v --cov=zarr zarr - # don't run py36-npy114 with coverage because it is run together with py35-npy113 on travis + # main unit test runner + # N.B., don't run npy113 or npy114 with coverage because they are run together with npy115 on travis + py27,py35,py36-npy115: pytest -v --cov=zarr --cov-config=.coveragerc zarr + py36-npy113: pytest -v zarr py36-npy114: pytest -v zarr - py36-npy113: pytest -v --cov=zarr --doctest-modules zarr - py27,py34,py35,py36-npy113: coverage report -m - py36-npy113: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst - py36-npy113: flake8 --max-line-length=100 zarr + py37: pytest -v --cov=zarr --cov-config=.coveragerc --doctest-modules zarr + # generate a coverage report + py27,py35,py36-npy115,py37: coverage report -m + # run doctests in the tutorial and spec + py37: python -m doctest -o NORMALIZE_WHITESPACE -o ELLIPSIS docs/tutorial.rst docs/spec/v2.rst + # pep8 checks + py37: flake8 zarr + # print environment for debugging + pip freeze deps = py27: backports.lzma - py27,py34,py35,py36-npy113: numpy==1.13.3 - py36-npy114: numpy==1.14.0 + py36-npy113: numpy==1.13.3 + py36-npy114: numpy==1.14.6 + py27,py35,py36-npy115,py37: -rrequirements_dev_npy.txt + -rrequirements_test.txt -rrequirements_dev.txt # linux only -rrequirements_dev_optional.txt @@ -36,3 +49,6 @@ deps = -rrequirements_rtfd.txt commands = sphinx-build -W -b html -d {envtmpdir}/doctrees .
{envtmpdir}/html + +[flake8] +max-line-length = 100 diff --git a/zarr/__init__.py b/zarr/__init__.py index 56d060fdac..178e857983 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -7,11 +7,13 @@ from zarr.creation import (empty, zeros, ones, full, array, empty_like, zeros_like, ones_like, full_like, open_array, open_like, create) from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore, - NestedDirectoryStore, DBMStore, LMDBStore, LRUStoreCache) + NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, + LRUStoreCache, ABSStore, RedisStore, MongoDBStore) from zarr.hierarchy import group, open_group, Group from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.codecs import * from zarr.convenience import (open, save, save_array, save_group, load, copy_store, - copy, copy_all, tree) + copy, copy_all, tree, consolidate_metadata, + open_consolidated) from zarr.errors import CopyError, MetadataError, PermissionError from zarr.version import version as __version__ diff --git a/zarr/attrs.py b/zarr/attrs.py index 6d74d6479a..21cb77bc10 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -4,8 +4,8 @@ from collections import MutableMapping -from zarr.compat import text_type from zarr.errors import PermissionError +from zarr.meta import parse_metadata class Attributes(MutableMapping): @@ -43,7 +43,7 @@ def _get_nosync(self): except KeyError: d = dict() else: - d = json.loads(text_type(data, 'ascii')) + d = parse_metadata(data) return d def asdict(self): diff --git a/zarr/compat.py b/zarr/compat.py index 9be3384123..91a75548e6 100644 --- a/zarr/compat.py +++ b/zarr/compat.py @@ -12,6 +12,7 @@ text_type = unicode binary_type = str reduce = reduce + from itertools import izip_longest as zip_longest class PermissionError(Exception): pass @@ -19,13 +20,18 @@ class PermissionError(Exception): def OrderedDict_move_to_end(od, key): od[key] = od.pop(key) + from collections import Mapping + else: # pragma: py2 no cover text_type = str binary_type = bytes from functools import reduce + from itertools import zip_longest PermissionError = PermissionError def OrderedDict_move_to_end(od, key): od.move_to_end(key) + + from collections.abc import Mapping diff --git a/zarr/convenience.py b/zarr/convenience.py index 19de7b2826..c6e5432664 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -15,15 +15,16 @@ from zarr.errors import err_path_not_found, CopyError from zarr.util import normalize_storage_path, TreeViewer, buffer_size from zarr.compat import PY2, text_type +from zarr.meta import ensure_str, json_dumps # noinspection PyShadowingBuiltins -def open(store, mode='a', **kwargs): +def open(store=None, mode='a', **kwargs): """Convenience function to open a group or array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means @@ -31,12 +32,17 @@ def open(store, mode='a', **kwargs): exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). **kwargs - Additional parameters are passed through to :func:`zarr.open_array` or - :func:`zarr.open_group`. + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. 
+ + Returns + ------- + z : :class:`zarr.core.Array` or :class:`zarr.hierarchy.Group` + Array or group, depending on what exists in the given store. See Also -------- - zarr.open_array, zarr.open_group + zarr.creation.open_array, zarr.hierarchy.open_group Examples -------- @@ -68,7 +74,8 @@ def open(store, mode='a', **kwargs): path = kwargs.get('path', None) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) path = normalize_storage_path(path) if mode in {'w', 'w-', 'x'}: @@ -1069,3 +1076,113 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) return n_copied, n_skipped, n_bytes_copied + + +def consolidate_metadata(store, metadata_key='.zmetadata'): + """ + Consolidate all metadata for groups and arrays within the given store + into a single resource and put it under the given key. + + This produces a single object in the backend store, containing all the + metadata read from all the zarr-related keys that can be found. After + metadata have been consolidated, use :func:`open_consolidated` to open + the root group in optimised, read-only mode, using the consolidated + metadata to reduce the number of read operations on the backend store. + + Note that if the metadata in the store is changed after this + consolidation, then the metadata read by :func:`open_consolidated` + would be incorrect unless this function is called again. + + .. note:: This is an experimental feature. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to put the consolidated metadata under. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the new consolidated metadata. + + See Also + -------- + open_consolidated + + """ + import json + + store = normalize_store_arg(store) + + def is_zarr_key(key): + return (key.endswith('.zarray') or key.endswith('.zgroup') or + key.endswith('.zattrs')) + + out = { + 'zarr_consolidated_format': 1, + 'metadata': { + key: json.loads(ensure_str(store[key])) + for key in store if is_zarr_key(key) + } + } + store[metadata_key] = json_dumps(out).encode() + return open_consolidated(store, metadata_key=metadata_key) + + +def open_consolidated(store, metadata_key='.zmetadata', mode='r+', **kwargs): + """Open group using metadata previously consolidated into a single key. + + This is an optimised method for opening a Zarr group, where instead of + traversing the group/array hierarchy by accessing the metadata keys at + each level, a single key contains all of the metadata for everything. + This is particularly useful for remote data sources, where the overhead of + accessing a key is large compared to the time to read the data. + + The group accessed must have already had its metadata consolidated into a + single key using the function :func:`consolidate_metadata`. + + This optimised method only works in modes which do not change the + metadata, although the data may still be written/updated. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to read the consolidated metadata from. The default (.zmetadata) + corresponds to the default used by :func:`consolidate_metadata`.
+ mode : {'r', 'r+'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist) although only writes to data are allowed, + changes to metadata including creation of new arrays or group + are not allowed. + **kwargs + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the consolidated metadata. + + See Also + -------- + consolidate_metadata + + """ + + from .storage import ConsolidatedMetadataStore + + # normalize parameters + store = normalize_store_arg(store) + if mode not in {'r', 'r+'}: + raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}" + .format(mode)) + + # setup metadata sotre + meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key) + + # pass through + return open(store=meta_store, chunk_store=store, mode=mode, **kwargs) diff --git a/zarr/core.py b/zarr/core.py index 03d9bdc667..0838117b89 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -8,6 +8,7 @@ import numpy as np +from numcodecs.compat import ensure_ndarray from zarr.util import (is_total_slice, human_readable_size, normalize_resize_args, @@ -423,6 +424,18 @@ def __array__(self, *args): a = a.astype(args[0]) return a + def __iter__(self): + if len(self.shape) == 0: + # Same error as numpy + raise TypeError("iteration over a 0-d array") + # Avoid repeatedly decompressing chunks by iterating over the chunks + # in the first dimension. + chunk_size = self.chunks[0] + for j in range(self.shape[0]): + if j % chunk_size == 0: + chunk = self[j: j + chunk_size] + yield chunk[j % chunk_size] + def __len__(self): if self.shape: return self.shape[0] @@ -1530,6 +1543,7 @@ def _set_selection(self, indexer, value, fields=None): item = [slice(None)] * self.ndim for a in indexer.drop_axes: item[a] = np.newaxis + item = tuple(item) chunk_value = chunk_value[item] # put data @@ -1568,7 +1582,11 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, except KeyError: # chunk not initialized if self._fill_value is not None: - out[out_selection] = self._fill_value + if fields: + fill_value = self._fill_value[fields] + else: + fill_value = self._fill_value + out[out_selection] = fill_value else: @@ -1596,10 +1614,7 @@ def _chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, if self._compressor: self._compressor.decode(cdata, dest) else: - if isinstance(cdata, np.ndarray): - chunk = cdata.view(self._dtype) - else: - chunk = np.frombuffer(cdata, dtype=self._dtype) + chunk = ensure_ndarray(cdata).view(self._dtype) chunk = chunk.reshape(self._chunks, order=self._order) np.copyto(dest, chunk) return @@ -1733,21 +1748,25 @@ def _decode_chunk(self, cdata): # apply filters if self._filters: - for f in self._filters[::-1]: + for f in reversed(self._filters): chunk = f.decode(chunk) - # view as correct dtype - if self._dtype == object: - if isinstance(chunk, np.ndarray): - chunk = chunk.astype(self._dtype) - else: - raise RuntimeError('cannot read object array without object codec') - elif isinstance(chunk, np.ndarray): + # view as numpy array with correct dtype + chunk = ensure_ndarray(chunk) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if self._dtype != object: chunk = chunk.view(self._dtype) - else: - chunk = np.frombuffer(chunk, dtype=self._dtype) - - # reshape + elif chunk.dtype != object: + # If we end up 
here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. + raise RuntimeError('cannot read object array without object codec') + + # ensure correct chunk shape + chunk = chunk.reshape(-1, order='A') chunk = chunk.reshape(self._chunks, order=self._order) return chunk @@ -1917,7 +1936,7 @@ def hexdigest(self, hashname="sha1"): checksum = binascii.hexlify(self.digest(hashname=hashname)) # This is a bytes object on Python 3 and we want a str. - if type(checksum) is not str: + if type(checksum) is not str: # pragma: py2 no cover checksum = checksum.decode('utf8') return checksum @@ -2157,7 +2176,7 @@ def view(self, shape=None, chunks=None, dtype=None, array([0, 0, 1, ..., 1, 0, 0], dtype=uint8) >>> v = a.view(dtype=bool) >>> v[:] - array([False, False, True, ..., True, False, False], dtype=bool) + array([False, False, True, ..., True, False, False]) >>> np.all(a[:].view(dtype=bool) == v[:]) True diff --git a/zarr/creation.py b/zarr/creation.py index 004b0e4ad1..0184a4a5da 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -224,8 +224,8 @@ def zeros(shape, **kwargs): >>> z >>> z[:2, :2] - array([[ 0., 0.], - [ 0., 0.]]) + array([[0., 0.], + [0., 0.]]) """ @@ -245,8 +245,8 @@ def ones(shape, **kwargs): >>> z >>> z[:2, :2] - array([[ 1., 1.], - [ 1., 1.]]) + array([[1., 1.], + [1., 1.]]) """ @@ -266,8 +266,8 @@ def full(shape, fill_value, **kwargs): >>> z >>> z[:2, :2] - array([[ 42., 42.], - [ 42., 42.]]) + array([[42., 42.], + [42., 42.]]) """ @@ -346,15 +346,15 @@ def array(data, **kwargs): return z -def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default', - fill_value=0, order='C', synchronizer=None, filters=None, - cache_metadata=True, cache_attrs=True, path=None, object_codec=None, - **kwargs): +def open_array(store=None, mode='a', shape=None, chunks=True, dtype=None, + compressor='default', fill_value=0, order='C', synchronizer=None, + filters=None, cache_metadata=True, cache_attrs=True, path=None, + object_codec=None, chunk_store=None, **kwargs): """Open an array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means @@ -391,6 +391,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= Array path within store. object_codec : Codec, optional A codec to encode object arrays, only needed if dtype=object. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
Returns ------- @@ -426,7 +428,10 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= # a : read/write if exists, create otherwise (default) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, clobber=clobber) path = normalize_storage_path(path) # API compatibility with h5py @@ -448,7 +453,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, overwrite=True, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode == 'a': if contains_group(store, path=path): @@ -457,7 +462,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_group(store, path=path): @@ -468,14 +473,15 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' # instantiate array z = Array(store, read_only=read_only, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path, + chunk_store=chunk_store) return z diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index e9565caa13..17821130eb 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -91,7 +91,6 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): - self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) @@ -1059,12 +1058,13 @@ def group(store=None, overwrite=False, chunk_store=None, cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) -def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): +def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None, + chunk_store=None): """Open a group using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means @@ -1079,6 +1079,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): Array synchronizer. path : string, optional Group path within store. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
Returns ------- @@ -1102,6 +1104,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): # handle polymorphic store arg store = _normalize_store_arg(store) + if chunk_store is not None: + chunk_store = _normalize_store_arg(chunk_store) path = normalize_storage_path(path) # ensure store is initialized @@ -1113,13 +1117,13 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): err_group_not_found(path) elif mode == 'w': - init_group(store, overwrite=True, path=path) + init_group(store, overwrite=True, path=path, chunk_store=chunk_store) elif mode == 'a': if contains_array(store, path=path): err_contains_array(path) if not contains_group(store, path=path): - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_array(store, path=path): @@ -1127,10 +1131,10 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): elif contains_group(store, path=path): err_contains_group(path) else: - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' return Group(store, read_only=read_only, cache_attrs=cache_attrs, - synchronizer=synchronizer, path=path) + synchronizer=synchronizer, path=path, chunk_store=chunk_store) diff --git a/zarr/indexing.py b/zarr/indexing.py index ecbc403509..0cd116bc37 100644 --- a/zarr/indexing.py +++ b/zarr/indexing.py @@ -513,6 +513,7 @@ def oindex_set(a, selection, value): value_selection = [slice(None)] * len(a.shape) for i in drop_axes: value_selection[i] = np.newaxis + value_selection = tuple(value_selection) value = value[value_selection] a[selection] = value @@ -678,7 +679,7 @@ def __init__(self, selection, array): else: sel_sort = None - # store atrributes + # store attributes self.selection = selection self.sel_sort = sel_sort self.shape = selection[0].shape if selection[0].shape else (1,) diff --git a/zarr/meta.py b/zarr/meta.py index 3f48f3f3e1..c90c12ff38 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -5,34 +5,58 @@ import numpy as np +from numcodecs.compat import ensure_bytes -from zarr.compat import PY2, binary_type +from zarr.compat import PY2, Mapping from zarr.errors import MetadataError ZARR_FORMAT = 2 -def _ensure_str(s): - if PY2: # pragma: py3 no cover - # noinspection PyUnresolvedReferences - if isinstance(s, buffer): # noqa - s = str(s) - else: # pragma: py2 no cover - if isinstance(s, memoryview): - s = s.tobytes() - if isinstance(s, binary_type): +def ensure_str(s): + if not isinstance(s, str): + s = ensure_bytes(s) + if not PY2: # pragma: py2 no cover s = s.decode('ascii') return s +def json_dumps(o): + """Write JSON in a consistent, human-readable way.""" + return json.dumps(o, indent=4, sort_keys=True, ensure_ascii=True, + separators=(',', ': ')) + + +def parse_metadata(s): + + # Here we allow that a store may return an already-parsed metadata object, + # or a string of JSON that we will parse here. We allow for an already-parsed + # object to accommodate a consolidated metadata store, where all the metadata for + # all groups and arrays will already have been parsed from JSON. 
+ + if isinstance(s, Mapping): + # assume metadata has already been parsed into a mapping object + meta = s + + else: + # assume metadata needs to be parsed as JSON + s = ensure_str(s) + meta = json.loads(s) + + return meta + + def decode_array_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) + + # extract array metadata fields try: dtype = decode_dtype(meta['dtype']) fill_value = decode_fill_value(meta['fill_value'], dtype) @@ -54,9 +78,12 @@ def decode_array_metadata(s): def encode_array_metadata(meta): dtype = meta['dtype'] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype meta = dict( zarr_format=ZARR_FORMAT, - shape=meta['shape'], + shape=meta['shape'] + sdshape, chunks=meta['chunks'], dtype=encode_dtype(dtype), compressor=meta['compressor'], @@ -64,8 +91,7 @@ def encode_array_metadata(meta): order=meta['order'], filters=meta['filters'], ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True, - separators=(',', ': ')) + s = json_dumps(meta) b = s.encode('ascii') return b @@ -83,10 +109,9 @@ def _decode_dtype_descr(d): # recurse to handle nested structures if PY2: # pragma: py3 no cover # under PY2 numpy rejects unicode field names - d = [(f.encode('ascii'), _decode_dtype_descr(v)) - for f, v in d] + d = [(k[0].encode("ascii"), _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] else: # pragma: py2 no cover - d = [(f, _decode_dtype_descr(v)) for f, v in d] + d = [(k[0], _decode_dtype_descr(k[1])) + tuple(k[2:]) for k in d] return d @@ -96,14 +121,14 @@ def decode_dtype(d): def decode_group_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format version zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) - meta = dict( - zarr_format=ZARR_FORMAT, - ) + + meta = dict(zarr_format=zarr_format) return meta @@ -113,7 +138,7 @@ def encode_group_metadata(meta=None): meta = dict( zarr_format=ZARR_FORMAT, ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + s = json_dumps(meta) b = s.encode('ascii') return b @@ -184,6 +209,6 @@ def encode_fill_value(v, dtype): elif dtype.kind == 'U': return v elif dtype.kind in 'mM': - return int(v.view('u8')) + return int(v.view('i8')) else: return v diff --git a/zarr/storage.py b/zarr/storage.py index 7964e3dd01..d8f8433e15 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -18,29 +18,31 @@ from __future__ import absolute_import, print_function, division from collections import MutableMapping, OrderedDict import os +import operator import tempfile import zipfile import shutil import atexit import re import sys +import json import multiprocessing +from pickle import PicklingError from threading import Lock, RLock import glob import warnings - - -import numpy as np +import io from zarr.util import (normalize_shape, normalize_chunks, normalize_order, normalize_storage_path, buffer_size, normalize_fill_value, nolock, normalize_dtype) from zarr.meta import encode_array_metadata, encode_group_metadata -from zarr.compat import PY2, binary_type, OrderedDict_move_to_end +from zarr.compat import PY2, OrderedDict_move_to_end, binary_type from numcodecs.registry import codec_registry +from numcodecs.compat import ensure_bytes, ensure_contiguous_ndarray from 
zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, - err_fspath_exists_notdir, err_read_only) + err_fspath_exists_notdir, err_read_only, MetadataError) array_meta_key = '.zarray' @@ -54,6 +56,15 @@ from zarr.codecs import Zlib default_compressor = Zlib() +# Find which function to use for atomic replace +if sys.version_info >= (3, 3): + from os import replace +elif sys.platform == "win32": # pragma: no cover + from osreplace import replace +else: # pragma: no cover + # POSIX rename() is always atomic + from os import rename as replace + def _path_to_prefix(path): # assume path already normalized @@ -329,8 +340,9 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa err_contains_group(path) # normalize metadata - shape = normalize_shape(shape) dtype, object_codec = normalize_dtype(dtype, object_codec) + shape = normalize_shape(shape) + dtype.shape + dtype = dtype.base chunks = normalize_chunks(chunks, shape, dtype.itemsize) order = normalize_order(order) fill_value = normalize_fill_value(fill_value, dtype) @@ -442,23 +454,6 @@ def _init_group_metadata(store, overwrite=False, path=None, chunk_store=None): store[key] = encode_group_metadata(meta) -def ensure_bytes(s): - if isinstance(s, binary_type): - return s - if isinstance(s, np.ndarray): - if PY2: # pragma: py3 no cover - # noinspection PyArgumentList - return s.tostring(order='A') - else: # pragma: py2 no cover - # noinspection PyArgumentList - return s.tobytes(order='A') - if hasattr(s, 'tobytes'): - return s.tobytes() - if PY2 and hasattr(s, 'tostring'): # pragma: py3 no cover - return s.tostring() - return memoryview(s).tobytes() - - def _dict_store_keys(d, prefix='', cls=dict): for k in d.keys(): v = d[k] @@ -552,6 +547,7 @@ def __getitem__(self, item): def __setitem__(self, item, value): with self.write_mutex: parent, key = self._require_parent(item) + value = ensure_bytes(value) parent[key] = value def __delitem__(self, item): @@ -650,17 +646,11 @@ def getsize(self, path=None): size = 0 for v in value.values(): if not isinstance(v, self.cls): - try: - size += buffer_size(v) - except TypeError: - return -1 + size += buffer_size(v) return size else: - try: - return buffer_size(value) - except TypeError: - return -1 + return buffer_size(value) def clear(self): with self.write_mutex: @@ -739,9 +729,8 @@ def __getitem__(self, key): def __setitem__(self, key, value): - # handle F-contiguous numpy arrays - if isinstance(value, np.ndarray) and value.flags.f_contiguous: - value = ensure_bytes(value) + # coerce to flat, contiguous array (ideally without copying) + value = ensure_contiguous_ndarray(value) # destination path for key file_path = os.path.join(self.path, key) @@ -770,13 +759,11 @@ def __setitem__(self, key, value): f.write(value) # move temporary file into place - if os.path.exists(file_path): - os.remove(file_path) - os.rename(temp_path, file_path) + replace(temp_path, file_path) finally: # clean up if temp file still exists for whatever reason - if temp_path is not None and os.path.exists(temp_path): + if temp_path is not None and os.path.exists(temp_path): # pragma: no cover os.remove(temp_path) def __delitem__(self, key): @@ -1190,7 +1177,7 @@ def __getitem__(self, key): def __setitem__(self, key, value): if self.mode == 'r': err_read_only() - value = ensure_bytes(value) + value = ensure_contiguous_ndarray(value) with self.mutex: self.zf.writestr(key, value) @@ -1437,7 +1424,11 @@ def __init__(self, path, flag='c', mode=0o666, open=None, write_lock=True, 
self.open_kwargs = open_kwargs def __getstate__(self): - self.flush() # needed for py2 and ndbm + try: + self.flush() # needed for py2 and ndbm + except Exception: + # flush may fail if db has already been closed + pass return (self.path, self.flag, self.mode, self.open, self.write_lock, self.open_kwargs) @@ -1459,7 +1450,7 @@ def flush(self): if self.flag[0] != 'r': with self.write_mutex: if hasattr(self.db, 'sync'): - self.db.sync() + self.db.sync() else: # fall-back, close and re-open, needed for ndbm flag = self.flag @@ -1631,7 +1622,11 @@ def __init__(self, path, buffers=True, **kwargs): self.kwargs = kwargs def __getstate__(self): - self.flush() # just in case + try: + self.flush() # just in case + except Exception: + # flush may fail if db has already been closed + pass return self.path, self.buffers, self.kwargs def __setstate__(self, state): @@ -1890,149 +1885,610 @@ class ABSStore(MutableMapping): Parameters ---------- - container_name : string - The name of the ABS container to use. Currently this must exist in the - storage account. + container : string + The name of the ABS container to use. prefix : string Location of the "directory" to use as the root of the storage hierarchy within the container. account_name : string The Azure blob storage account name. account_key : string - The Azure blob storage account acess key. + The Azure blob storage account access key. + blob_service_kwargs : dictionary + Extra arguments to be passed into the azure blob client, for e.g. when + using the emulator, pass in blob_service_kwargs={'is_emulated': True} Notes ----- - In order to use this store, you must install the Azure Blob Storage - `Python Client Library `_ version >= 1.3.0. + In order to use this store, you must install the Microsoft Azure Storage SDK for Python """ - def __init__(self, container_name, prefix, account_name, account_key): + def __init__(self, container, prefix, account_name=None, account_key=None, + blob_service_kwargs=None): + from azure.storage.blob import BlockBlobService + self.container = container + self.prefix = normalize_storage_path(prefix) self.account_name = account_name self.account_key = account_key - self.container_name = container_name - self.prefix = normalize_storage_path(prefix) - self.initialize_container() - - def initialize_container(self): - from azure.storage.blob import BlockBlobService - self.client = BlockBlobService(self.account_name, self.account_key) + if blob_service_kwargs is not None: + self.blob_service_kwargs = blob_service_kwargs + else: # pragma: no cover + self.blob_service_kwargs = dict() + self.client = BlockBlobService(self.account_name, self.account_key, + **self.blob_service_kwargs) # needed for pickling def __getstate__(self): state = self.__dict__.copy() + del state['client'] return state def __setstate__(self, state): + from azure.storage.blob import BlockBlobService self.__dict__.update(state) - self.initialize_container() - - def __enter__(self): - return self - - def __exit__(self, *args): - pass + self.client = BlockBlobService(self.account_name, self.account_key, + **self.blob_service_kwargs) + @staticmethod def _append_path_to_prefix(path, prefix): return '/'.join([normalize_storage_path(prefix), normalize_storage_path(path)]) - def full_path(self, path=None): - return _append_path_to_prefix(path, self.prefix) + @staticmethod + def _strip_prefix_from_path(path, prefix): + # normalized things will not have any leading or trailing slashes + path_norm = normalize_storage_path(path) + prefix_norm = 
normalize_storage_path(prefix) + return path_norm[(len(prefix_norm)+1):] def __getitem__(self, key): + from azure.common import AzureMissingResourceHttpError blob_name = '/'.join([self.prefix, key]) - blob = self.client.get_blob_to_bytes(self.container_name, blob_name) - if blob: + try: + blob = self.client.get_blob_to_bytes(self.container, blob_name) return blob.content - else: + except AzureMissingResourceHttpError: raise KeyError('Blob %s not found' % blob_name) def __setitem__(self, key, value): + value = ensure_bytes(value) blob_name = '/'.join([self.prefix, key]) - self.client.create_blob_from_text(self.container_name, blob_name, value) + buffer = io.BytesIO(value) + self.client.create_blob_from_stream(self.container, blob_name, buffer) def __delitem__(self, key): - raise NotImplementedError + from azure.common import AzureMissingResourceHttpError + try: + self.client.delete_blob(self.container, '/'.join([self.prefix, key])) + except AzureMissingResourceHttpError: + raise KeyError('Blob %s not found' % key) def __eq__(self, other): return ( isinstance(other, ABSStore) and - self.container_name == other.container_name and + self.container == other.container and self.prefix == other.prefix ) def keys(self): - raise NotImplementedError + return list(self.__iter__()) def __iter__(self): - raise NotImplementedError + for blob in self.client.list_blobs(self.container, self.prefix + '/'): + yield self._strip_prefix_from_path(blob.name, self.prefix) def __len__(self): - raise NotImplementedError + return len(self.keys()) def __contains__(self, key): blob_name = '/'.join([self.prefix, key]) - if self.client.exists(self.container_name, blob_name): + if self.client.exists(self.container, blob_name): return True else: return False - def list_abs_directory_blobs(self, prefix): - """Return list of all blobs from an abs prefix.""" - return [blob.name for blob in self.client.list_blobs(self.container_name)] - - def list_abs_subdirectories(self, prefix): - """Return list of all "subdirectories" from an abs prefix.""" - return list(set([blob.name.rsplit('/', 1)[0] for blob in self.client.list_blobs(self.container_name) if '/' in blob.name])) - - def _strip_prefix_from_path(path, prefix): - # normalized things will not have any leading or trailing slashes - path_norm = normalize_storage_path(path) - prefix_norm = normalize_storage_path(prefix) - if path_norm.startswith(prefix_norm): - return path_norm[(len(prefix_norm)+1):] - else: - return path - - def list_abs_directory(self, prefix, strip_prefix=True): - """Return a list of all blobs and subdirectories from an abs prefix.""" - items = set() - items.update(self.list_abs_directory_blobs(prefix)) - items.update(self.list_abs_subdirectories(prefix)) - items = list(items) - if strip_prefix: - items = [_strip_prefix_from_path(path, prefix) for path in items] - return items - - def dir_path(self, path=None): + def listdir(self, path=None): store_path = normalize_storage_path(path) # prefix is normalized to not have a trailing slash dir_path = self.prefix if store_path: - dir_path = os.path.join(dir_path, store_path) + dir_path = dir_path + '/' + store_path + dir_path += '/' + items = list() + for blob in self.client.list_blobs(self.container, prefix=dir_path, delimiter='/'): + if '/' in blob.name[len(dir_path):]: + items.append(self._strip_prefix_from_path( + blob.name[:blob.name.find('/', len(dir_path))], dir_path)) + else: + items.append(self._strip_prefix_from_path(blob.name, dir_path)) + return items + + def rmdir(self, path=None): + dir_path = 
normalize_storage_path(self._append_path_to_prefix(path, self.prefix)) + '/' + for blob in self.client.list_blobs(self.container, prefix=dir_path): + self.client.delete_blob(self.container, blob.name) + + def getsize(self, path=None): + store_path = normalize_storage_path(path) + fs_path = self.prefix + if store_path: + fs_path = self._append_path_to_prefix(store_path, self.prefix) + if self.client.exists(self.container, fs_path): + return self.client.get_blob_properties(self.container, + fs_path).properties.content_length else: - dir_path += '/' - return dir_path + size = 0 + for blob in self.client.list_blobs(self.container, prefix=fs_path + '/', + delimiter='/'): + if '/' not in blob.name[len(fs_path + '/'):]: + size += blob.properties.content_length + return size + + def clear(self): + self.rmdir() + + +class SQLiteStore(MutableMapping): + """Storage class using SQLite. + + Parameters + ---------- + path : string + Location of database file. + **kwargs + Keyword arguments passed through to the `sqlite3.connect` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.SQLiteStore('data/array.sqldb') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.SQLiteStore('data/group.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + >>> store.close() # don't forget to call this when you're done + """ + + def __init__(self, path, **kwargs): + import sqlite3 + + # normalize path + if path != ':memory:': + path = os.path.abspath(path) + + # store properties + self.path = path + self.kwargs = kwargs + + # allow threading if SQLite connections are thread-safe + # + # ref: https://www.sqlite.org/releaselog/3_3_1.html + # ref: https://bugs.python.org/issue27190 + check_same_thread = True + if sqlite3.sqlite_version_info >= (3, 3, 1): + check_same_thread = False + + # keep a lock for serializing mutable operations + self.lock = Lock() + + # open database + self.db = sqlite3.connect( + self.path, + detect_types=0, + isolation_level=None, + check_same_thread=check_same_thread, + **self.kwargs + ) + + # handle keys as `str`s + self.db.text_factory = str + + # get a cursor to read/write to the database + self.cursor = self.db.cursor() + + # initialize database with our table if missing + with self.lock: + self.cursor.execute( + 'CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)' + ) + + def __getstate__(self): + if self.path == ':memory:': + raise PicklingError('Cannot pickle in-memory SQLite databases') + return self.path, self.kwargs + + def __setstate__(self, state): + path, kwargs = state + self.__init__(path=path, **kwargs) + + def close(self): + """Closes the underlying database.""" + + # close cursor and db objects + self.cursor.close() + self.db.close() + + def __getitem__(self, key): + value = self.cursor.execute('SELECT v FROM zarr WHERE (k = ?)', (key,)) + for v, in value: + return v + raise KeyError(key) + + def __setitem__(self, key, value): + self.update({key: value}) + + def __delitem__(self, key): + with self.lock: + self.cursor.execute('DELETE FROM zarr WHERE (k = ?)', (key,)) + if self.cursor.rowcount < 1: + raise KeyError(key) + + def __contains__(self, key): + cs = self.cursor.execute( + 'SELECT COUNT(*) FROM zarr WHERE (k = ?)', (key,) + ) + for has, in cs: + has = 
bool(has) + return has + + def items(self): + kvs = self.cursor.execute('SELECT k, v FROM zarr') + for k, v in kvs: + yield k, v + + def keys(self): + ks = self.cursor.execute('SELECT k FROM zarr') + for k, in ks: + yield k + + def values(self): + vs = self.cursor.execute('SELECT v FROM zarr') + for v, in vs: + yield v + + def __iter__(self): + return self.keys() + + def __len__(self): + cs = self.cursor.execute('SELECT COUNT(*) FROM zarr') + for c, in cs: + return c + + def update(self, *args, **kwargs): + args += (kwargs,) + + kv_list = [] + for dct in args: + for k, v in dct.items(): + # Python 2 cannot store `memoryview`s, but it can store + # `buffer`s. However Python 2 won't return `bytes` then. So we + # coerce to `bytes`, which are handled correctly. Python 3 + # doesn't have these issues. + if PY2: # pragma: py3 no cover + v = ensure_bytes(v) + else: # pragma: py2 no cover + v = ensure_contiguous_ndarray(v) + + # Accumulate key-value pairs for storage + kv_list.append((k, v)) + + with self.lock: + self.cursor.executemany('REPLACE INTO zarr VALUES (?, ?)', kv_list) def listdir(self, path=None): - dir_path = self.dir_path(path) - return sorted(self.list_abs_directory(dir_path, strip_prefix=True)) + path = normalize_storage_path(path) + keys = self.cursor.execute( + ''' + SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( + SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m + FROM zarr WHERE k LIKE (? || "_%") + ) ORDER BY l ASC + ''', + (path, path) + ) + keys = list(map(operator.itemgetter(0), keys)) + return keys - def rename(self, src_path, dst_path): - raise NotImplementedErrror + def getsize(self, path=None): + path = normalize_storage_path(path) + size = self.cursor.execute( + ''' + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + ''', + (path, path) + ) + for s, in size: + return s def rmdir(self, path=None): - dir_path = normalize_storage_path(self.full_path(path)) + '/' - for blob in self.client.list_blobs(self.container_name, prefix=dir_path): - self.client.delete_blob(self.container_name, blob.name) + path = normalize_storage_path(path) + if path: + with self.lock: + self.cursor.execute( + 'DELETE FROM zarr WHERE k LIKE (? || "_%")', (path,) + ) + else: + self.clear() - def getsize(self, path=None): - dir_path = self.dir_path(path) - size = 0 - for blob in self.client.list_blobs(self.container_name, prefix=dir_path): - size += blob.properties.content_length - return size + def clear(self): + with self.lock: + self.cursor.executescript( + ''' + BEGIN TRANSACTION; + DROP TABLE zarr; + CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); + COMMIT TRANSACTION; + ''' + ) + + +class MongoDBStore(MutableMapping): + """Storage class using MongoDB. + + .. note:: This is an experimental feature. + + Requires the `pymongo `_ + package to be installed. + + Parameters + ---------- + database : string + Name of database + collection : string + Name of collection + **kwargs + Keyword arguments passed through to the `pymongo.MongoClient` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.MongoDBStore('localhost') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() + + Store a group:: + + >>> store = zarr.MongoDBStore('localhost') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + >>> store.close() + + Notes + ----- + The maximum chunksize in MongoDB documents is 16 MB. + + """ + + _key = 'key' + _value = 'value' + + def __init__(self, database='mongodb_zarr', collection='zarr_collection', + **kwargs): + import pymongo + + self._database = database + self._collection = collection + self._kwargs = kwargs + + self.client = pymongo.MongoClient(**self._kwargs) + self.db = self.client.get_database(self._database) + self.collection = self.db.get_collection(self._collection) + + def __getitem__(self, key): + doc = self.collection.find_one({self._key: key}) + + if doc is None: + raise KeyError(key) + else: + return binary_type(doc[self._value]) + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.collection.replace_one({self._key: key}, + {self._key: key, self._value: value}, + upsert=True) + + def __delitem__(self, key): + result = self.collection.delete_many({self._key: key}) + if not result.deleted_count == 1: + raise KeyError(key) + + def __iter__(self): + for f in self.collection.find({}): + yield f[self._key] + + def __len__(self): + return self.collection.count_documents({}) + + def __getstate__(self): + return self._database, self._collection, self._kwargs + + def __setstate__(self, state): + database, collection, kwargs = state + self.__init__(database=database, collection=collection, **kwargs) + + def close(self): + """Cleanup client resources and disconnect from MongoDB.""" + self.client.close() def clear(self): - raise NotImplementedError + """Remove all items from store.""" + self.collection.delete_many({}) + + +class RedisStore(MutableMapping): + """Storage class using Redis. + + .. note:: This is an experimental feature. + + Requires the `redis `_ + package to be installed. + + Parameters + ---------- + prefix : string + Name of prefix for Redis keys + **kwargs + Keyword arguments passed through to the `redis.Redis` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.RedisStore(port=6379) + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + + Store a group:: + + >>> store = zarr.RedisStore(port=6379) + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + + """ + def __init__(self, prefix='zarr', **kwargs): + import redis + self._prefix = prefix + self._kwargs = kwargs + + self.client = redis.Redis(**kwargs) + + def _key(self, key): + return '{prefix}:{key}'.format(prefix=self._prefix, key=key) + + def __getitem__(self, key): + return self.client[self._key(key)] + + def __setitem__(self, key, value): + value = ensure_bytes(value) + self.client[self._key(key)] = value + + def __delitem__(self, key): + count = self.client.delete(self._key(key)) + if not count: + raise KeyError(key) + + def keylist(self): + offset = len(self._key('')) # length of prefix + return [key[offset:].decode('utf-8') + for key in self.client.keys(self._key('*'))] + + def keys(self): + for key in self.keylist(): + yield key + + def __iter__(self): + for key in self.keys(): + yield key + + def __len__(self): + return len(self.keylist()) + + def __getstate__(self): + return self._prefix, self._kwargs + + def __setstate__(self, state): + prefix, kwargs = state + self.__init__(prefix=prefix, **kwargs) + + def clear(self): + for key in self.keys(): + del self[key] + + +class ConsolidatedMetadataStore(MutableMapping): + """A layer over other storage, where the metadata has been consolidated into + a single key. + + The purpose of this class, is to be able to get all of the metadata for + a given dataset in a single read operation from the underlying storage. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the dataset metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. versionadded:: 2.3 + + .. note:: This is an experimental feature. + + Parameters + ---------- + store: MutableMapping + Containing the zarr dataset. + metadata_key: str + The target in the store where all of the metadata are stored. We + assume JSON encoding. 
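+
+    Examples
+    --------
+    A minimal sketch of the intended write-once, read-many workflow, assuming
+    an in-memory ``DictStore`` (imports follow the accompanying tests; any
+    underlying store should behave the same)::
+
+        >>> import zarr
+        >>> from zarr.storage import DictStore
+        >>> from zarr.convenience import consolidate_metadata, open_consolidated
+        >>> store = DictStore()
+        >>> root = zarr.group(store=store)
+        >>> arr = root.zeros('arr', shape=(10, 10), chunks=(5, 5))
+        >>> arr[...] = 42
+        >>> # gather all metadata into a single '.zmetadata' key ...
+        >>> newroot = consolidate_metadata(store)
+        >>> # ... then reopen with metadata served from that one key
+        >>> z = open_consolidated(store)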
+ + See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + + """ + def __init__(self, store, metadata_key='.zmetadata'): + self.store = store + + # retrieve consolidated metadata + if sys.version_info.major == 3 and sys.version_info.minor < 6: + d = store[metadata_key].decode() # pragma: no cover + else: # pragma: no cover + d = store[metadata_key] + meta = json.loads(d) + + # check format of consolidated metadata + consolidated_format = meta.get('zarr_consolidated_format', None) + if consolidated_format != 1: + raise MetadataError('unsupported zarr consolidated metadata format: %s' % + consolidated_format) + + # decode metadata + self.meta_store = meta['metadata'] + + def __getitem__(self, key): + return self.meta_store[key] + + def __contains__(self, item): + return item in self.meta_store + + def __iter__(self): + return iter(self.meta_store) + + def __len__(self): + return len(self.meta_store) + + def __delitem__(self, key): + err_read_only() + + def __setitem__(self, key, value): + err_read_only() + + def getsize(self, path): + return getsize(self.meta_store, path) + + def listdir(self, path): + return listdir(self.meta_store, path) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index c77006c4f6..f64d27ed16 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -4,6 +4,7 @@ import atexit import os import unittest +from numbers import Integral import numpy as np @@ -12,11 +13,12 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy -from zarr.storage import atexit_rmtree +from zarr.convenience import (open, save, save_group, load, copy_store, copy, + consolidate_metadata, open_consolidated) +from zarr.storage import atexit_rmtree, DictStore, getsize, ConsolidatedMetadataStore from zarr.core import Array from zarr.hierarchy import Group, group -from zarr.errors import CopyError +from zarr.errors import CopyError, PermissionError def test_open_array(): @@ -91,6 +93,80 @@ def test_lazy_loader(): assert_array_equal(bar, loader['bar']) +def test_consolidate_metadata(): + + # setup initial data + store = DictStore() + z = group(store) + z.create_group('g1') + g2 = z.create_group('g2') + g2.attrs['hello'] = 'world' + arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8') + assert 16 == arr.nchunks + assert 0 == arr.nchunks_initialized + arr.attrs['data'] = 1 + arr[:] = 1.0 + assert 16 == arr.nchunks_initialized + + # perform consolidation + out = consolidate_metadata(store) + assert isinstance(out, Group) + assert '.zmetadata' in store + for key in ['.zgroup', + 'g1/.zgroup', + 'g2/.zgroup', + 'g2/.zattrs', + 'g2/arr/.zarray', + 'g2/arr/.zattrs']: + del store[key] + + # open consolidated + z2 = open_consolidated(store) + assert ['g1', 'g2'] == list(z2) + assert 'world' == z2.g2.attrs['hello'] + assert 1 == z2.g2.arr.attrs['data'] + assert (z2.g2.arr[:] == 1.0).all() + assert 16 == z2.g2.arr.nchunks + assert 16 == z2.g2.arr.nchunks_initialized + + # tests del/write on the store + cmd = ConsolidatedMetadataStore(store) + with pytest.raises(PermissionError): + del cmd['.zgroup'] + with pytest.raises(PermissionError): + cmd['.zgroup'] = None + + # test getsize on the store + assert isinstance(getsize(cmd), Integral) + + # test new metadata are not writeable + with pytest.raises(PermissionError): + z2.create_group('g3') + with pytest.raises(PermissionError): + z2.create_dataset('spam', shape=42, chunks=7, dtype='i4') + with 
pytest.raises(PermissionError): + del z2['g2'] + + # test consolidated metadata are not writeable + with pytest.raises(PermissionError): + z2.g2.attrs['hello'] = 'universe' + with pytest.raises(PermissionError): + z2.g2.arr.attrs['foo'] = 'bar' + + # test the data are writeable + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() + + # test invalid modes + with pytest.raises(ValueError): + open_consolidated(store, mode='a') + with pytest.raises(ValueError): + open_consolidated(store, mode='w') + + # make sure keyword arguments are passed through without error + open_consolidated(store, cache_attrs=True, synchronizer=None) + + class TestCopyStore(unittest.TestCase): def setUp(self): diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 390f888287..d8f566fe3c 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -12,14 +12,15 @@ import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal import pytest +from azure.storage.blob import BlockBlobService from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore, - DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob, - LRUStoreCache) + DBMStore, LMDBStore, SQLiteStore, ABSStore, atexit_rmtree, + atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.errors import PermissionError -from zarr.compat import PY2, text_type, binary_type +from zarr.compat import PY2, text_type, binary_type, zip_longest from zarr.util import buffer_size from numcodecs import (Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle, Categorize, JSON, VLenUTF8, VLenBytes, VLenArray) @@ -656,19 +657,39 @@ def test_read_only(self): def test_pickle(self): + # setup array z = self.create_array(shape=1000, chunks=100, dtype=int, cache_metadata=False, cache_attrs=False) - z[:] = np.random.randint(0, 1000, 1000) - z2 = pickle.loads(pickle.dumps(z)) - assert z.shape == z2.shape - assert z.chunks == z2.chunks - assert z.dtype == z2.dtype + shape = z.shape + chunks = z.chunks + dtype = z.dtype + compressor_config = None if z.compressor: - assert z.compressor.get_config() == z2.compressor.get_config() - assert z.fill_value == z2.fill_value - assert z._cache_metadata == z2._cache_metadata - assert z.attrs.cache == z2.attrs.cache - assert_array_equal(z[:], z2[:]) + compressor_config = z.compressor.get_config() + fill_value = z.fill_value + cache_metadata = z._cache_metadata + attrs_cache = z.attrs.cache + a = np.random.randint(0, 1000, 1000) + z[:] = a + + # round trip through pickle + dump = pickle.dumps(z) + # some stores cannot be opened twice at the same time, need to close + # store before can round-trip through pickle + if hasattr(z.store, 'close'): + z.store.close() + z2 = pickle.loads(dump) + + # verify + assert shape == z2.shape + assert chunks == z2.chunks + assert dtype == z2.dtype + if z2.compressor: + assert compressor_config == z2.compressor.get_config() + assert fill_value == z2.fill_value + assert cache_metadata == z2._cache_metadata + assert attrs_cache == z2.attrs.cache + assert_array_equal(a, z2[:]) def test_np_ufuncs(self): z = self.create_array(shape=(100, 100), chunks=(10, 10)) @@ -826,17 +847,36 @@ def test_nchunks_initialized(self): z[:] = 42 assert 10 == z.nchunks_initialized - def test_structured_array(self): + def test_array_dtype_shape(self): + dt = "(2, 2)f4" # setup some data - d = np.array([(b'aaa', 1, 4.2), - (b'bbb', 2, 8.4), - (b'ccc', 3, 12.6)], - dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + d = np.array([((0, 1), + (1, 2)), + ((1, 2), + 
(2, 3)), + ((2, 3), + (3, 4))], + dtype=dt) + for a in (d, d[:0]): - for fill_value in None, b'', (b'zzz', 42, 16.8): + for fill_value in None, 0: + z = self.create_array(shape=a.shape[:-2], chunks=2, dtype=dt, fill_value=fill_value) + assert len(a) == len(z) + if fill_value is not None: + assert fill_value == z.fill_value + z[...] = a + assert_array_equal(a, z[...]) + + def check_structured_array(self, d, fill_values): + for a in (d, d[:0]): + for fill_value in fill_values: z = self.create_array(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=fill_value) assert len(a) == len(z) + assert a.shape == z.shape + assert a.dtype == z.dtype + + # check use of fill value before array is initialised with data if fill_value is not None: if fill_value == b'': # numpy 1.14 compatibility @@ -844,16 +884,53 @@ def test_structured_array(self): else: np_fill_value = np.array(fill_value, dtype=a.dtype)[()] assert np_fill_value == z.fill_value - if len(z): + if len(a): assert np_fill_value == z[0] assert np_fill_value == z[-1] + empty = np.empty_like(a) + empty[:] = np_fill_value + assert empty[0] == z[0] + assert_array_equal(empty[0:2], z[0:2]) + assert_array_equal(empty, z[...]) + for f in a.dtype.names: + assert_array_equal(empty[f], z[f]) + + # store data in array z[...] = a + + # check stored data if len(a): assert a[0] == z[0] - assert_array_equal(a, z[...]) - assert_array_equal(a['foo'], z['foo']) - assert_array_equal(a['bar'], z['bar']) - assert_array_equal(a['baz'], z['baz']) + assert a[-1] == z[-1] + assert_array_equal(a[0:2], z[0:2]) + assert_array_equal(a, z[...]) + for f in a.dtype.names: + assert_array_equal(a[f], z[f]) + + def test_structured_array(self): + d = np.array([(b'aaa', 1, 4.2), + (b'bbb', 2, 8.4), + (b'ccc', 3, 12.6)], + dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) + fill_values = None, b'', (b'zzz', 42, 16.8) + self.check_structured_array(d, fill_values) + + def test_structured_array_subshapes(self): + d = np.array([(0, ((0, 1, 2), (1, 2, 3)), b'aaa'), + (1, ((1, 2, 3), (2, 3, 4)), b'bbb'), + (2, ((2, 3, 4), (3, 4, 5)), b'ccc')], + dtype=[('foo', 'i8'), ('bar', '(2, 3)f4'), ('baz', 'S3')]) + fill_values = None, b'', (0, ((0, 0, 0), (1, 1, 1)), b'zzz') + self.check_structured_array(d, fill_values) + + def test_structured_array_nested(self): + d = np.array([(0, (0, ((0, 1), (1, 2), (2, 3)), 0), b'aaa'), + (1, (1, ((1, 2), (2, 3), (3, 4)), 1), b'bbb'), + (2, (2, ((2, 3), (3, 4), (4, 5)), 2), b'ccc')], + dtype=[('foo', 'i8'), ('bar', [('foo', 'i4'), ('bar', '(3, 2)f4'), + ('baz', 'u1')]), ('baz', 'S3')]) + fill_values = None, b'', (0, (0, ((0, 0), (1, 1), (2, 2)), 0), b'zzz') + self.check_structured_array(d, fill_values) def test_dtypes(self): @@ -879,8 +956,9 @@ def test_dtypes(self): dtype = '{}8[{}]'.format(base_type, resolution) z = self.create_array(shape=100, dtype=dtype, fill_value=0) assert z.dtype == np.dtype(dtype) - a = np.random.randint(0, np.iinfo('u8').max, size=z.shape[0], - dtype='u8').view(dtype) + a = np.random.randint(np.iinfo('i8').min, np.iinfo('i8').max, + size=z.shape[0], + dtype='i8').view(dtype) z[:] = a assert_array_equal(a, z[:]) @@ -906,7 +984,7 @@ def test_object_arrays(self): z[0] = 'foo' assert z[0] == 'foo' z[1] = b'bar' - assert z[1] == 'bar' # msgpack gets this wrong + assert z[1] == b'bar' z[2] = 1 assert z[2] == 1 z[3] = [2, 4, 6, 'baz'] @@ -1078,6 +1156,40 @@ def test_object_codec_warnings(self): # provide object_codec, but not object dtype self.create_array(shape=10, chunks=5, dtype='i4', object_codec=JSON()) + def 
test_zero_d_iter(self): + a = np.array(1, dtype=int) + z = self.create_array(shape=a.shape, dtype=int) + z[...] = a + with pytest.raises(TypeError): + # noinspection PyStatementEffect + list(a) + with pytest.raises(TypeError): + # noinspection PyStatementEffect + list(z) + + def test_iter(self): + params = ( + ((1,), (1,)), + ((2,), (1,)), + ((1,), (2,)), + ((3,), (3,)), + ((1000,), (100,)), + ((100,), (1000,)), + ((1, 100), (1, 1)), + ((1, 0), (1, 1)), + ((0, 1), (1, 1)), + ((0, 1), (2, 1)), + ((100, 1), (3, 1)), + ((100, 100), (10, 10)), + ((10, 10, 10), (3, 3, 3)), + ) + for shape, chunks in params: + z = self.create_array(shape=shape, chunks=chunks, dtype=int) + a = np.arange(np.product(shape)).reshape(shape) + z[:] = a + for expect, actual in zip_longest(a, z): + assert_array_equal(expect, actual) + class TestArrayWithPath(TestArray): @@ -1211,6 +1323,28 @@ def test_nbytes_stored(self): assert expect_nbytes_stored == z.nbytes_stored +class TestArrayWithABSStore(TestArray): + + @staticmethod + def absstore(): + blob_client = BlockBlobService(is_emulated=True) + blob_client.delete_container('test') + blob_client.create_container('test') + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', + account_key='bar', blob_service_kwargs={'is_emulated': True}) + store.rmdir() + return store + + def create_array(self, read_only=False, **kwargs): + store = self.absstore() + kwargs.setdefault('compressor', Zlib(1)) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + class TestArrayWithNestedDirectoryStore(TestArrayWithDirectoryStore): @staticmethod @@ -1313,6 +1447,31 @@ def test_nbytes_stored(self): pass # not implemented +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestArrayWithSQLiteStore(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + def test_nbytes_stored(self): + pass # not implemented + + class TestArrayWithNoCompressor(TestArray): def create_array(self, read_only=False, **kwargs): @@ -1539,10 +1698,22 @@ def test_astype(self): expected = data.astype(astype) assert_array_equal(expected, z2) + def test_array_dtype_shape(self): + # skip this one, cannot do delta on unstructured array + pass + def test_structured_array(self): # skip this one, cannot do delta on structured array pass + def test_structured_array_subshapes(self): + # skip this one, cannot do delta on structured array + pass + + def test_structured_array_nested(self): + # skip this one, cannot do delta on structured array + pass + def test_dtypes(self): # skip this one, delta messes up floats pass diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index eb437706f0..4c2af854fb 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -3,7 +3,7 @@ import tempfile import shutil import atexit -import warnings +import os.path import numpy as np @@ -22,12 +22,6 @@ from zarr.compat 
import PY2 -# needed for PY2/PY3 consistent behaviour -if PY2: # pragma: py3 no cover - warnings.resetwarnings() - warnings.simplefilter('always') - - # something bcolz-like class MockBcolzArray(object): @@ -141,6 +135,12 @@ def test_full(): z = full(100, chunks=10, fill_value=np.nan, dtype='f8') assert np.all(np.isnan(z[:])) + # NaT + z = full(100, chunks=10, fill_value='NaT', dtype='M8[s]') + assert np.all(np.isnat(z[:])) + z = full(100, chunks=10, fill_value='NaT', dtype='m8[s]') + assert np.all(np.isnat(z[:])) + # byte string dtype v = b'xxx' z = full(100, chunks=10, fill_value=v, dtype='S3') @@ -247,6 +247,14 @@ def test_open_array(): assert isinstance(z, Array) assert 'foo/bar' == z.path + # with chunk store + meta_store = 'data/meta.zarr' + chunk_store = 'data/chunks.zarr' + z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode='w') + z[:] = 42 + assert os.path.abspath(meta_store) == z.store.path + assert os.path.abspath(chunk_store) == z.chunk_store.path + def test_empty_like(): @@ -457,12 +465,15 @@ def test_compression_args(): assert 'zlib' == z.compressor.codec_id assert 9 == z.compressor.level - with pytest.warns(UserWarning): - # 'compressor' overrides 'compression' - create(100, compressor=Zlib(9), compression='bz2', compression_opts=1) - with pytest.warns(UserWarning): - # 'compressor' ignores 'compression_opts' - create(100, compressor=Zlib(9), compression_opts=1) + # cannot get warning tests to work on PY2 + if not PY2: # pragma: py2 no cover + + with pytest.warns(UserWarning): + # 'compressor' overrides 'compression' + create(100, compressor=Zlib(9), compression='bz2', compression_opts=1) + with pytest.warns(UserWarning): + # 'compressor' ignores 'compression_opts' + create(100, compressor=Zlib(9), compression_opts=1) def test_create_read_only(): diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index f47012cf88..37baecf1ae 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -13,12 +13,13 @@ import numpy as np from numpy.testing import assert_array_equal import pytest +from azure.storage.blob import BlockBlobService from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, init_array, array_meta_key, group_meta_key, atexit_rmtree, - NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob, - LRUStoreCache) + NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, + ABSStore, atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.compat import PY2, text_type from zarr.hierarchy import Group, group, open_group @@ -820,23 +821,31 @@ def test_paths(self): g1['foo/../bar'] def test_pickle(self): - # setup + + # setup group g = self.create_group() d = g.create_dataset('foo/bar', shape=100, chunks=10) d[:] = np.arange(100) - - # needed for zip store - if hasattr(g.store, 'flush'): - g.store.flush() - - # pickle round trip - g2 = pickle.loads(pickle.dumps(g)) - assert g.path == g2.path - assert g.name == g2.name - assert len(g) == len(g2) - assert list(g) == list(g2) - assert g['foo'] == g2['foo'] - assert g['foo/bar'] == g2['foo/bar'] + path = g.path + name = g.name + n = len(g) + keys = list(g) + + # round-trip through pickle + dump = pickle.dumps(g) + # some stores cannot be opened twice at the same time, need to close + # store before can round-trip through pickle + if hasattr(g.store, 'close'): + g.store.close() + g2 = pickle.loads(dump) + + # verify + assert path == g2.path + assert name == g2.name + assert n == len(g2) + assert keys == list(g2) + assert isinstance(g2['foo'], 
Group) + assert isinstance(g2['foo/bar'], Array) class TestGroupWithDictStore(TestGroup): @@ -856,6 +865,19 @@ def create_store(): return store, None +class TestGroupWithABSStore(TestGroup): + + @staticmethod + def create_store(): + blob_client = BlockBlobService(is_emulated=True) + blob_client.delete_container('test') + blob_client.create_container('test') + store = ABSStore(container='test', prefix='zarrtesting/', account_name='foo', + account_key='bar', blob_service_kwargs={'is_emulated': True}) + store.rmdir() + return store, None + + class TestGroupWithNestedDirectoryStore(TestGroup): @staticmethod @@ -920,6 +942,22 @@ def create_store(): return store, None +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestGroupWithSQLiteStore(TestGroup): + + def create_store(self): + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store, None + + class TestGroupWithChunkStore(TestGroup): @staticmethod diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index a8c781421f..12dda299c8 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -116,6 +116,142 @@ def test_encode_decode_array_2(): assert [df.get_config()] == meta_dec['filters'] +def test_encode_decode_array_datetime_timedelta(): + + # some variations + for k in ['m8[s]', 'M8[s]']: + compressor = Blosc(cname='lz4', clevel=3, shuffle=2) + dtype = np.dtype(k) + fill_value = dtype.type("NaT") + meta = dict( + shape=(100, 100), + chunks=(10, 10), + dtype=dtype, + compressor=compressor.get_config(), + fill_value=fill_value, + order=dtype.char, + filters=[] + ) + + meta_json = '''{ + "chunks": [10, 10], + "compressor": { + "id": "blosc", + "clevel": 3, + "cname": "lz4", + "shuffle": 2, + "blocksize": 0 + }, + "dtype": "%s", + "fill_value": -9223372036854775808, + "filters": [], + "order": "%s", + "shape": [100, 100], + "zarr_format": %s + }''' % (dtype.str, dtype.char, ZARR_FORMAT) + + # test encoding + meta_enc = encode_array_metadata(meta) + assert_json_equal(meta_json, meta_enc) + + # test decoding + meta_dec = decode_array_metadata(meta_enc) + assert ZARR_FORMAT == meta_dec['zarr_format'] + assert meta['shape'] == meta_dec['shape'] + assert meta['chunks'] == meta_dec['chunks'] + assert meta['dtype'] == meta_dec['dtype'] + assert meta['compressor'] == meta_dec['compressor'] + assert meta['order'] == meta_dec['order'] + # Based off of this SO answer: https://stackoverflow.com/a/49972198 + assert np.all( + fill_value.view((np.uint8, fill_value.itemsize)) == + meta_dec['fill_value'].view((np.uint8, meta_dec['fill_value'].itemsize)) + ) + assert [] == meta_dec['filters'] + + +def test_encode_decode_array_dtype_shape(): + + meta = dict( + shape=(100,), + chunks=(10,), + dtype=np.dtype('(10, 10)f8'), + compressor=Zlib(1).get_config(), + fill_value=None, + filters=None, + order='C' + ) + + meta_json = '''{ + "chunks": [10], + "compressor": {"id": "zlib", "level": 1}, + "dtype": "