-
-
Notifications
You must be signed in to change notification settings - Fork 373
Description
Problem description
I have a use case downstream where we want to access "blocks" of chunks. I have implemented a prototype that functions like dask.array.Array.blocks.
It uses the existing zarr.indexing machinery and matches the API of existing indexers.
This allows us to pull a "block" of chunks from data using slicing logic.
For instance, if we have an array with shape (10, 20, 30) with chunk sizes (5, 4, 10)
array.blocks[0]maps toarray[:5]array.blocks[..., 0]maps toarray[:, :, :10]array.blocks[1, 1, 1]maps toarray[5:10, 4:8, 10:20]array.blocks[:, 1:4, :]maps toarray[:, 4:16, :]
Why not just use dask array and method?
- We don't want to have
daskas a hard requirement in our library. - In many simple queries, vanilla zarr is much faster than converting to
dask.array+ scheduling overhead. - We don't want to install all dependencies of
daskfor just this functionality to keep our library lightweight.
If there are no objections I can start adding this in a new PR as soon as possible.
If we don't want this in zarr, I can keep this in our library as an extension to zarr.
Once the code below are implemented, all of the following evaluates to True as expected.
np.array_equal(array.blocks[0], array[:5])
np.array_equal(array.blocks[..., 0], array[:, :, :10])
np.array_equal(array.blocks[1, 1, 1], array[5:10, 4:8, 10:20])
np.array_equal(array.blocks[:, 1:4, :], array[:, 4:16, :])Tests and docs must be written, of course, I don't have that yet.
Implementation Details
Methods and attributes that would go into zarr.core.Array
class Array:
def __init__(...):
...
self._blocks = Blocks(self)
@property
def blocks(self):
return self._blocks
def get_block_selection(self, selection, out=None, fields=None):
if not self._cache_metadata:
self._load_metadata()
# check args
check_fields(fields, self._dtype)
# setup indexer
indexer = BlockIndexer(selection, self)
return self._get_selection(indexer=indexer, out=out, fields=fields)
def set_block_selection(self, selection, value, fields=None):
# guard conditions
if self._read_only:
raise ReadOnlyError()
# refresh metadata
if not self._cache_metadata:
self._load_metadata_nosync()
# setup indexer
indexer = BlockIndexer(selection, self)
self._set_selection(indexer, value, fields=fields)BlockIndexer class (compare to zarr.indexing.OrthogonalIndexer)
class BlockIndexer:
def __init__(self, selection, array):
# handle ellipsis
selection = replace_ellipsis(selection, array._shape)
# setup per-dimension indexers
dim_indexers = []
for dim_sel, dim_len, dim_chunk_len in \
zip(selection, array._shape, array._chunks):
if is_integer(dim_sel):
block_start = dim_sel * dim_chunk_len
block_stop = block_start + dim_chunk_len
block_slice = slice(block_start, block_stop)
elif is_slice(dim_sel):
start, stop, _ = dim_sel.indices(dim_len // dim_chunk_len)
block_start = start * dim_chunk_len
block_stop = stop * dim_chunk_len
block_slice = slice(block_start, block_stop)
else:
raise IndexError('unsupported selection item for block indexing; '
'expected integer or slice, got {!r}'
.format(type(dim_sel)))
dim_indexer = SliceDimIndexer(block_slice, dim_len, dim_chunk_len)
dim_indexers.append(dim_indexer)
self.dim_indexers = dim_indexers
self.shape = tuple(s.nitems for s in self.dim_indexers)
self.drop_axes = None
def __iter__(self):
for dim_projections in itertools.product(*self.dim_indexers):
chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections)
chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections)
out_selection = tuple(p.dim_out_sel for p in dim_projections
if p.dim_out_sel is not None)
yield ChunkProjection(chunk_coords, chunk_selection, out_selection)Blocks property class for slicing (compare to zarr.indexing.VIndex or zarr.indexing.OIndex
class Blocks:
def __init__(self, array):
self.array = array
def __getitem__(self, selection):
fields, selection = pop_fields(selection)
selection = ensure_tuple(selection)
return self.array.get_block_selection(selection, fields=fields)
def __setitem__(self, selection, value):
fields, selection = pop_fields(selection)
selection = ensure_tuple(selection)
return self.array.set_block_selection(selection, value, fields=fields)