-
Notifications
You must be signed in to change notification settings - Fork 16
Closed
Labels
Description
Structured data types fail to serialize to Zarr v2
Here are:
- A reference to the Xarray bug "Encoding a fill value of a structured type and serializing to Zarr (v2)": pydata/xarray#10591
- Reproducer 1:
The following raises an exception at .venv/lib/python3.13/site-packages/xarray/backends/zarr.py:945, in `store`:
E - TypeError: Failed to decode variable 'headers': unhashable type: 'writeable void-scalar'
def test_repro_structured_xr_to_zar() -> None:
    """Reproducer for problems with the segy_to_mdio_v1 function.

    Builds an xarray Dataset holding a float32 ``traces`` variable and a
    structured-dtype ``headers`` variable, writes it to a Zarr v2 store, and
    then writes one region of it back into the same store.

    As reported, this fails inside xarray's Zarr backend (``store``) with
    ``TypeError: Failed to decode variable 'headers': unhashable type:
    'writeable void-scalar'``.

    Will be removed when the final PR is submitted.
    """
    shape = (4, 4, 2)
    dim_names = ["inline", "crossline", "depth"]
    chunks = (2, 2, 2)
    # Pretend that we created a pydantic model from a template
    structured_type = StructuredType(
        fields=[
            StructuredField(name="cdp_x", format=ScalarType.INT32),
            StructuredField(name="cdp_y", format=ScalarType.INT32),
            StructuredField(name="elevation", format=ScalarType.FLOAT16),
            StructuredField(name="some_scalar", format=ScalarType.FLOAT16),
        ]
    )
    xr_dataset = xr.Dataset()
    # Add traces to the dataset, shape = (4, 4, 2) of floats
    traces_zarr = zarr.zeros(shape=shape, dtype=np.float32, zarr_format=2)
    traces_xr = xr.DataArray(traces_zarr, dims=dim_names)
    traces_xr.encoding = {
        "_FillValue": np.nan,
        "chunks": chunks,
        "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(),
        "compressor": numcodecs.Blosc(cname="zstd", clevel=5, shuffle=1, blocksize=0),
    }
    xr_dataset["traces"] = traces_xr
    # Add headers to the dataset, shape = (4, 4) of structured type
    data_type = to_numpy_dtype(structured_type)
    # Validate the conversion from the structured type to a NumPy dtype
    assert data_type == np.dtype(
        [("cdp_x", "<i4"), ("cdp_y", "<i4"), ("elevation", "<f2"), ("some_scalar", "<f2")]
    )
    # The fill value is a zero-dimensional array of the structured dtype;
    # it is passed as the "_FillValue" encoding of the headers below.
    fill_value = np.zeros((), dtype=data_type)
    headers_zarr = zarr.zeros(shape=shape[:-1], dtype=data_type, zarr_format=2)
    headers_xr = xr.DataArray(headers_zarr, dims=dim_names[:-1])
    headers_xr.encoding = {
        "_FillValue": fill_value,
        "chunks": chunks[:-1],
        "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(),
        "compressor": numcodecs.Blosc(cname="zstd", clevel=5, shuffle=1, blocksize=0),
    }
    xr_dataset["headers"] = headers_xr
    # See _populate_dims_coords_and_write_to_zarr()
    # compute=True because we would also write the coord values to Zarr here
    xr_dataset.to_zarr(
        store="/tmp/reproducer_xr.zarr",  # noqa: S108
        mode="w",
        write_empty_chunks=False,
        zarr_format=2,
        compute=True,
    )
    # In _populate_trace_mask_and_write_to_zarr
    # we do another write of "trace_mask" to the same Zarr store and remove it
    # from the dataset
    # ----------------------------------------------
    # Now we do the parallel write of the data and the headers,
    # see blocked_io.to_zarr -> trace_worker
    # Boolean mask over (inline, crossline): only the diagonal entries are set
    not_null = np.array(
        [
            [True, False, False, False],
            [False, True, False, False],
            [False, False, True, False],
            [False, False, False, True],
        ]
    )
    # One header record, repeated once per True entry of the mask
    hdr = (11, 22, -33.0, 44.0)
    headers = np.array([hdr, hdr, hdr, hdr], dtype=data_type)
    # One two-sample trace per True entry of the mask
    trace = np.array(
        [[100.0, 200.0], [300.0, 400.0], [500.0, 600.0], [700.0, 800.0]], dtype=np.float32
    )
    # Here is one iteration of it:
    ds_to_write = xr_dataset[["traces", "headers"]]
    # We do not have any coords to reset
    # ds_to_write = ds_to_write.reset_coords()
    # Fill the masked positions with data and zero the remaining headers
    ds_to_write["headers"].data[not_null] = headers
    ds_to_write["headers"].data[~not_null] = 0
    ds_to_write["traces"].data[not_null] = trace
    # The region spans indices 0..1 on every dimension, i.e. one block of
    # the (2, 2, 2) chunk size declared above
    region = {
        "inline": slice(0, 2, None),
        "crossline": slice(0, 2, None),
        "depth": slice(0, 2, None),
    }
    sub_dataset = ds_to_write.isel(region)
    # Write only the selected region into the store created above
    sub_dataset.to_zarr(
        store="/tmp/reproducer_xr.zarr",  # noqa: S108
        region=region,
        mode="r+",
        write_empty_chunks=False,
        zarr_format=2,
    )

- Reproducer 2
The following raises an exception at .venv/lib/python3.13/site-packages/dask/array/wrap.py:225:
E TypeError: An error occurred while calling the wrap_func_shape_as_first_arg method registered to the numpy backend.
E Original Message: Cannot cast array data from dtype([('inline', '<i4'), ('cdp_x', '<f8')]) to dtype('bool') according to the rule 'unsafe'
E Raised while encoding variable 'myattr' with value <xarray.Variable (dim_0: 36)> Size: 432B
E dask.array<zeros_like, shape=(36,), dtype=[('inline', '<i4'), ('cdp_x', '<f8')], chunksize=(36,), chunktype=numpy.ndarray>
def test_to_zarr_dask(tmp_path: Path) -> None:
    """Test writing an xarray DataArray backed by a dask array to Zarr v2.

    The array uses a structured dtype, and a zero-valued scalar of that dtype
    is supplied as the ``_FillValue`` encoding. As reported, encoding the
    variable ``myattr`` fails with a ``TypeError`` about casting the
    structured dtype to ``bool``.

    Args:
        tmp_path: Base directory handed to ``output_path`` to build the
            store location.
    """
    # Create a data type and the fill value
    dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
    dtype_fill_value = np_zeros((), dtype=dtype)
    # Use '_FillValue' instead of 'fill_value'
    # 'fill_value' is not a valid encoding key in Zarr v2
    my_attr_encoding = {
        "_FillValue": dtype_fill_value,
        "chunk_key_encoding": {"name": "v2", "separator": "/"},
    }
    # Create a dask array using the data type
    # Do not specify encoding as the array attribute
    data = dask_array.zeros((36,), dtype=dtype, chunks=(36,))
    aa = xr_DataArray(name="myattr", data=data)
    # Specify encoding per array, keyed by the variable name
    encoding = {"myattr": my_attr_encoding}
    # NOTE(review): output_path is a helper not shown here; presumably it
    # returns a location under tmp_path for the store - confirm
    file_path = output_path(tmp_path, "to_zarr/zarr_dask", debugging=False)
    # compute=False is passed, so the write of the dask data is not requested
    # to run eagerly in this call
    aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False)

Reactions are currently unavailable