diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 74801d3115..24498b0d79 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -21,6 +21,10 @@ Storage (``zarr.storage``) .. automethod:: close .. automethod:: flush +.. autoclass:: SQLiteStore + + .. automethod:: close + .. autoclass:: LRUStoreCache .. automethod:: invalidate diff --git a/docs/release.rst b/docs/release.rst index a1cc4286af..f0d5a559ab 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -19,6 +19,10 @@ Enhancements * Support has been added for structured arrays with sub-array shape and/or nested fields. By :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. +* Adds the SQLite-backed :class:`zarr.storage.SQLiteStore` class enabling an + SQLite database to be used as the backing store for an array or group. + By :user:`John Kirkham `, :issue:`368`, :issue:`365`. + Bug fixes ~~~~~~~~~ diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 29ce8b0935..0fbefc3e2e 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -729,6 +729,16 @@ group (requires `lmdb `_ to be installed):: >>> z[:] = 42 >>> store.close() +New in Zarr version 2.3 is the :class:`zarr.storage.SQLiteStore` class, which +enables an SQLite database to be used for storing an array or group (requires +Python to be built with SQLite support):: + + >>> store = zarr.SQLiteStore('data/example.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> z = root.zeros('foo/bar', shape=(1000, 1000), chunks=(100, 100), dtype='i4') + >>> z[:] = 42 + >>> store.close() + Distributed/cloud storage ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/zarr/__init__.py b/zarr/__init__.py index cf34d3d427..c9046f6bff 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -7,7 +7,8 @@ from zarr.creation import (empty, zeros, ones, full, array, empty_like, zeros_like, ones_like, full_like, open_array, open_like, create) from zarr.storage import (DictStore, DirectoryStore, ZipStore, TempStore, - NestedDirectoryStore, DBMStore, LMDBStore, 
LRUStoreCache) + NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, + LRUStoreCache) from zarr.hierarchy import group, open_group, Group from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.codecs import * diff --git a/zarr/storage.py b/zarr/storage.py index e7d70ea7bc..2a07d9aa38 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -18,6 +18,7 @@ from __future__ import absolute_import, print_function, division from collections import MutableMapping, OrderedDict import os +import operator import tempfile import zipfile import shutil @@ -26,6 +27,7 @@ import sys import json import multiprocessing +from pickle import PicklingError from threading import Lock, RLock import glob import warnings @@ -1877,6 +1879,211 @@ def __delitem__(self, key): self._invalidate_value(key) +class SQLiteStore(MutableMapping): + """Storage class using SQLite. + + Parameters + ---------- + path : string + Location of database file. + **kwargs + Keyword arguments passed through to the `sqlite3.connect` function. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.SQLiteStore('data/array.sqldb') + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] = 42 + >>> store.close() # don't forget to call this when you're done + + Store a group:: + + >>> store = zarr.SQLiteStore('data/group.sqldb') + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] 
= 42 + >>> store.close() # don't forget to call this when you're done + """ + + def __init__(self, path, **kwargs): + import sqlite3 + + # normalize path + if path != ':memory:': + path = os.path.abspath(path) + + # store properties + self.path = path + self.kwargs = kwargs + + # allow threading if SQLite connections are thread-safe + # + # ref: https://www.sqlite.org/releaselog/3_3_1.html + # ref: https://bugs.python.org/issue27190 + check_same_thread = True + if sqlite3.sqlite_version_info >= (3, 3, 1): + check_same_thread = False + + # keep a lock for serializing mutable operations + self.lock = Lock() + + # open database + self.db = sqlite3.connect( + self.path, + detect_types=0, + isolation_level=None, + check_same_thread=check_same_thread, + **self.kwargs + ) + + # handle keys as `str`s + self.db.text_factory = str + + # get a cursor to read/write to the database + self.cursor = self.db.cursor() + + # initialize database with our table if missing + with self.lock: + self.cursor.execute( + 'CREATE TABLE IF NOT EXISTS zarr(k TEXT PRIMARY KEY, v BLOB)' + ) + + def __getstate__(self): + if self.path == ':memory:': + raise PicklingError('Cannot pickle in-memory SQLite databases') + return self.path, self.kwargs + + def __setstate__(self, state): + path, kwargs = state + self.__init__(path=path, **kwargs) + + def close(self): + """Closes the underlying database.""" + + # close cursor and db objects + self.cursor.close() + self.db.close() + + def __getitem__(self, key): + value = self.cursor.execute('SELECT v FROM zarr WHERE (k = ?)', (key,)) + for v, in value: + return v + raise KeyError(key) + + def __setitem__(self, key, value): + self.update({key: value}) + + def __delitem__(self, key): + with self.lock: + self.cursor.execute('DELETE FROM zarr WHERE (k = ?)', (key,)) + if self.cursor.rowcount < 1: + raise KeyError(key) + + def __contains__(self, key): + cs = self.cursor.execute( + 'SELECT COUNT(*) FROM zarr WHERE (k = ?)', (key,) + ) + for has, in cs: + has = 
bool(has) + return has + + def items(self): + kvs = self.cursor.execute('SELECT k, v FROM zarr') + for k, v in kvs: + yield k, v + + def keys(self): + ks = self.cursor.execute('SELECT k FROM zarr') + for k, in ks: + yield k + + def values(self): + vs = self.cursor.execute('SELECT v FROM zarr') + for v, in vs: + yield v + + def __iter__(self): + return self.keys() + + def __len__(self): + cs = self.cursor.execute('SELECT COUNT(*) FROM zarr') + for c, in cs: + return c + + def update(self, *args, **kwargs): + args += (kwargs,) + + kv_list = [] + for dct in args: + for k, v in dct.items(): + # Python 2 cannot store `memoryview`s, but it can store + # `buffer`s. However Python 2 won't return `bytes` then. So we + # coerce to `bytes`, which are handled correctly. Python 3 + # doesn't have these issues. + if PY2: # pragma: py3 no cover + v = ensure_bytes(v) + else: # pragma: py2 no cover + v = ensure_contiguous_ndarray(v) + + # Accumulate key-value pairs for storage + kv_list.append((k, v)) + + with self.lock: + self.cursor.executemany('REPLACE INTO zarr VALUES (?, ?)', kv_list) + + def listdir(self, path=None): + path = normalize_storage_path(path) + keys = self.cursor.execute( + ''' + SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( + SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m + FROM zarr WHERE k LIKE (? || "_%") + ) ORDER BY l ASC + ''', + (path, path) + ) + keys = list(map(operator.itemgetter(0), keys)) + return keys + + def getsize(self, path=None): + path = normalize_storage_path(path) + size = self.cursor.execute( + ''' + SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + WHERE k LIKE (? || "%") AND + 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + ''', + (path, path) + ) + for s, in size: + return s + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + with self.lock: + self.cursor.execute( + 'DELETE FROM zarr WHERE k LIKE (? 
|| "_%")', (path,) + ) + else: + self.clear() + + def clear(self): + with self.lock: + self.cursor.executescript( + ''' + BEGIN TRANSACTION; + DROP TABLE zarr; + CREATE TABLE zarr(k TEXT PRIMARY KEY, v BLOB); + COMMIT TRANSACTION; + ''' + ) + + class ConsolidatedMetadataStore(MutableMapping): """A layer over other storage, where the metadata has been consolidated into a single key. diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index cbad222edb..b2b6bb011e 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -15,7 +15,7 @@ from zarr.storage import (DirectoryStore, init_array, init_group, NestedDirectoryStore, - DBMStore, LMDBStore, atexit_rmtree, atexit_rmglob, + DBMStore, LMDBStore, SQLiteStore, atexit_rmtree, atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.errors import PermissionError @@ -1390,6 +1390,31 @@ def test_nbytes_stored(self): pass # not implemented +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestArrayWithSQLiteStore(TestArray): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + def test_nbytes_stored(self): + pass # not implemented + + class TestArrayWithNoCompressor(TestArray): def create_array(self, read_only=False, **kwargs): diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index 7758976c8c..369cf4b55a 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -17,8 +17,8 @@ from zarr.storage import (DictStore, DirectoryStore, ZipStore, init_group, 
init_array, array_meta_key, group_meta_key, atexit_rmtree, - NestedDirectoryStore, DBMStore, LMDBStore, atexit_rmglob, - LRUStoreCache) + NestedDirectoryStore, DBMStore, LMDBStore, SQLiteStore, + atexit_rmglob, LRUStoreCache) from zarr.core import Array from zarr.compat import PY2, text_type from zarr.hierarchy import Group, group, open_group @@ -928,6 +928,22 @@ def create_store(): return store, None +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestGroupWithSQLiteStore(TestGroup): + + def create_store(self): + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store, None + + class TestGroupWithChunkStore(TestGroup): @staticmethod diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 154ffbe3ed..ab3ea5c26d 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -8,6 +8,7 @@ import array import shutil import os +from pickle import PicklingError import numpy as np @@ -19,7 +20,7 @@ DirectoryStore, ZipStore, init_group, group_meta_key, getsize, migrate_1to2, TempStore, atexit_rmtree, NestedDirectoryStore, default_compressor, DBMStore, - LMDBStore, atexit_rmglob, LRUStoreCache, + LMDBStore, SQLiteStore, atexit_rmglob, LRUStoreCache, ConsolidatedMetadataStore) from zarr.meta import (decode_array_metadata, encode_array_metadata, ZARR_FORMAT, decode_group_metadata, encode_group_metadata) @@ -894,6 +895,41 @@ def test_context_manager(self): assert 2 == len(store) +try: + import sqlite3 +except ImportError: # pragma: no cover + sqlite3 = None + + +@unittest.skipIf(sqlite3 is None, 'python built without sqlite') +class TestSQLiteStore(StoreTests, unittest.TestCase): + + def create_store(self): + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStore(path) + return store + + +@unittest.skipIf(sqlite3 is None, 'python 
built without sqlite') +class TestSQLiteStoreInMemory(TestSQLiteStore, unittest.TestCase): + + def create_store(self): + store = SQLiteStore(':memory:') + return store + + def test_pickle(self): + + # setup store + store = self.create_store() + store['foo'] = b'bar' + store['baz'] = b'quux' + + # round-trip through pickle + with pytest.raises(PicklingError): + pickle.dumps(store) + + class TestLRUStoreCache(StoreTests, unittest.TestCase): def create_store(self):