Skip to content

Writing XArray Dataset with Structured data types fail to serialize to Zarr v2 if _FillValue is used #582

@dmitriyrepin

Description

@dmitriyrepin

Structured data types fail to serialize to Zarr v2
Here are two reproducers:

def test_repro_structured_xr_to_zar() -> None:
    """Reproducer for problems with the segy_to_mdio_v1 function.

    Builds an xarray Dataset with two variables -- float32 "traces" of shape
    (4, 4, 2) and structured-dtype "headers" of shape (4, 4) -- each carrying
    a ``_FillValue`` in its encoding, writes the dataset to a Zarr v2 store,
    and then writes a sub-region of the same variables back into that store.

    Will be removed when the final PR is submitted.

    NOTE(review): the issue title ties the failure to ``_FillValue`` on the
    structured variable; which of the two ``to_zarr`` calls raises is not
    stated here -- confirm when running.
    """
    shape = (4, 4, 2)
    dim_names = ["inline", "crossline", "depth"]
    chunks = (2, 2, 2)
    # Pretend that we created a pydantic model from a template
    structured_type = StructuredType(
        fields=[
            StructuredField(name="cdp_x", format=ScalarType.INT32),
            StructuredField(name="cdp_y", format=ScalarType.INT32),
            StructuredField(name="elevation", format=ScalarType.FLOAT16),
            StructuredField(name="some_scalar", format=ScalarType.FLOAT16),
        ]
    )

    xr_dataset = xr.Dataset()

    # Add traces to the dataset, shape = (4, 4, 2) of floats
    traces_zarr = zarr.zeros(shape=shape, dtype=np.float32, zarr_format=2)
    traces_xr = xr.DataArray(traces_zarr, dims=dim_names)
    # Per-variable encoding: NaN fill value, (2, 2, 2) chunks, "/" chunk-key
    # separator and a Blosc/zstd compressor.
    traces_xr.encoding = {
        "_FillValue": np.nan,
        "chunks": chunks,
        "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(),
        "compressor": numcodecs.Blosc(cname="zstd", clevel=5, shuffle=1, blocksize=0),
    }
    xr_dataset["traces"] = traces_xr

    # Add headers to the dataset, shape = (4, 4) of structured type
    data_type = to_numpy_dtype(structured_type)

    # Validate the conversion
    assert data_type == np.dtype(
        [("cdp_x", "<i4"), ("cdp_y", "<i4"), ("elevation", "<f2"), ("some_scalar", "<f2")]
    )
    # The fill value is a zero-dimensional array of the structured dtype.
    fill_value = np.zeros((), dtype=data_type)
    # Headers drop the last ("depth") dimension of the traces.
    headers_zarr = zarr.zeros(shape=shape[:-1], dtype=data_type, zarr_format=2)
    headers_xr = xr.DataArray(headers_zarr, dims=dim_names[:-1])
    # Same encoding as the traces, but with the structured fill value and
    # two-dimensional chunks.
    headers_xr.encoding = {
        "_FillValue": fill_value,
        "chunks": chunks[:-1],
        "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(),
        "compressor": numcodecs.Blosc(cname="zstd", clevel=5, shuffle=1, blocksize=0),
    }
    xr_dataset["headers"] = headers_xr

    # See _populate_dims_coords_and_write_to_zarr()
    # compute=True because we would also write the coord values to Zarr here
    xr_dataset.to_zarr(
        store="/tmp/reproducer_xr.zarr",  # noqa: S108
        mode="w",
        write_empty_chunks=False,
        zarr_format=2,
        compute=True,
    )

    # In _populate_trace_mask_and_write_to_zarr
    # We do another write of "trace_mask" to the same Zarr store and remove it
    # from the dataset

    # ----------------------------------------------
    # Now we will do the parallel write of the data and the headers
    # see blocked_io.to_zarr -> trace_worker

    # Boolean (4, 4) mask that is True only on the diagonal, i.e. four selected
    # (inline, crossline) positions.
    not_null = np.array(
        [
            [True, False, False, False],
            [False, True, False, False],
            [False, False, True, False],
            [False, False, False, True],
        ]
    )
    # One header record per selected position, all with the same values.
    hdr = (11, 22, -33.0, 44.0)
    headers = np.array([hdr, hdr, hdr, hdr], dtype=data_type)
    # One two-sample trace per selected position.
    trace = np.array(
        [[100.0, 200.0], [300.0, 400.0], [500.0, 600.0], [700.0, 800.0]], dtype=np.float32
    )

    # Here is one iteration of it:
    ds_to_write = xr_dataset[["traces", "headers"]]
    # We do not have any coords to reset
    # ds_to_write = ds_to_write.reset_coords()

    # Fill the masked positions with the sample values and zero the remaining
    # header positions, assigning through the variables' `.data`.
    ds_to_write["headers"].data[not_null] = headers
    ds_to_write["headers"].data[~not_null] = 0
    ds_to_write["traces"].data[not_null] = trace

    # The region covers the first two indices along each dimension.
    region = {
        "inline": slice(0, 2, None),
        "crossline": slice(0, 2, None),
        "depth": slice(0, 2, None),
    }

    # Write only that region into the store created above (mode="r+").
    sub_dataset = ds_to_write.isel(region)
    sub_dataset.to_zarr(
        store="/tmp/reproducer_xr.zarr",  # noqa: S108
        region=region,
        mode="r+",
        write_empty_chunks=False,
        zarr_format=2,
    )
  • Reproducer 2
    The following test raises an exception at .venv/lib/python3.13/site-packages/dask/array/wrap.py:225:
    E TypeError: An error occurred while calling the wrap_func_shape_as_first_arg method registered to the numpy backend.
    E Original Message: Cannot cast array data from dtype([('inline', '<i4'), ('cdp_x', '<f8')]) to dtype('bool') according to the rule 'unsafe'
    E Raised while encoding variable 'myattr' with value <xarray.Variable (dim_0: 36)> Size: 432B
    E dask.array<zeros_like, shape=(36,), dtype=[('inline', '<i4'), ('cdp_x', '<f8')], chunksize=(36,), chunktype=numpy.ndarray>
def test_to_zarr_dask(tmp_path: Path) -> None:
    """Write a dask-backed, structured-dtype DataArray to a Zarr v2 store.

    The variable "myattr" gets a structured ``_FillValue`` through the
    per-variable ``encoding`` argument of ``to_zarr``; no encoding is set on
    the array itself. The write is requested with ``compute=False``.
    """
    # Structured dtype and a zero-dimensional zero record used as fill value.
    record_dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")])
    zero_record = np_zeros((), dtype=record_dtype)

    # Dask array of 36 records held in a single chunk.
    lazy_records = dask_array.zeros((36,), dtype=record_dtype, chunks=(36,))
    variable = xr_DataArray(name="myattr", data=lazy_records)

    store_path = output_path(tmp_path, "to_zarr/zarr_dask", debugging=False)

    # '_FillValue' is used instead of 'fill_value', which the author notes is
    # not a valid encoding key in Zarr v2.
    variable.to_zarr(
        store_path,
        mode="w",
        zarr_format=2,
        encoding={
            "myattr": {
                "_FillValue": zero_record,
                "chunk_key_encoding": {"name": "v2", "separator": "/"},
            },
        },
        compute=False,
    )

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions