diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index b0eef1a..8f54e30 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -49,7 +49,7 @@ jobs:
 
       - name: "Install dependencies"
         run: |
-          conda install --yes numpy pytest pytest-mock iris xarray filelock requests
+          conda install --yes numpy pytest pytest-mock iris xarray filelock requests zarr aiohttp
 
       - name: "Install *latest* Iris"
         run: |
diff --git a/lib/ncdata/xarray.py b/lib/ncdata/xarray.py
index cf92ce7..e746a44 100644
--- a/lib/ncdata/xarray.py
+++ b/lib/ncdata/xarray.py
@@ -9,6 +9,7 @@
 # Hopefully a minimal amount.
 # The structure of an NcData object makes it fairly painless.
 #
+import warnings
 from pathlib import Path
 from typing import AnyStr, Union
 
@@ -21,6 +22,21 @@
 from . import NcAttribute, NcData, NcDimension, NcVariable
 
 
+def _raise_warning(var):
+    """
+    Warn that a variable holds fully realized (non-lazy) data.
+
+    ``var`` is interpolated into the message, so pass the variable *name*
+    rather than the variable object, whose repr would include the array.
+    """
+    warn_msg = (
+        f"Variable {var} has fully realized "
+        "data, if you need lazy data, then add "
+        "chunks={} as argument to Xarray open_dataset."
+    )
+    warnings.warn(warn_msg, UserWarning, stacklevel=2)
+
+
 class _XarrayNcDataStore(NetCDF4DataStore):
     """
     An adapter class presenting ncdata as an xarray datastore.
@@ -96,6 +112,22 @@ def store(
 
         # Install variables, creating dimensions as we go.
         for varname, var in new_variables.items():
+            if isinstance(var.data, np.ndarray):
+                # Realized (non-lazy) data : warn, unless the variable is
+                # recognised as a coordinate by one of the checks below.
+                if "axis" in var.attrs:
+                    is_coord = var.attrs["axis"] in ("X", "Y", "Z", "T")
+                else:
+                    # No 'axis' : look for a standard coordinate name.
+                    # Only string values are tested : attributes may hold
+                    # unhashable values (e.g. arrays), which break set().
+                    std_axes = ("latitude", "longitude", "time")
+                    is_coord = any(
+                        isinstance(value, str) and value in std_axes
+                        for value in var.attrs.values()
+                    )
+                if not is_coord:
+                    _raise_warning(varname)
             if varname in self.ncdata.variables:
                 raise ValueError(f'duplicate variable : "{varname}"')
 
diff --git a/tests/integration/test_zarr_to_iris.py b/tests/integration/test_zarr_to_iris.py
new file mode 100644
index 0000000..3580ce6
--- /dev/null
+++ b/tests/integration/test_zarr_to_iris.py
@@ -0,0 +1,120 @@
+"""Test conversion of remote and local Zarr store to iris Cube."""
+import re
+from importlib.resources import files as importlib_files
+from pathlib import Path
+
+import iris
+import pytest
+import xarray as xr
+import ncdata
+import ncdata.iris_xarray
+import zarr
+
+
+def _return_kwargs():
+    """Return the xarray ``open_dataset`` keywords shared by all tests."""
+    time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
+    xr_kwargs = {
+        "consolidated": True,
+        "decode_times": time_coder,
+        "engine": "zarr",
+        "chunks": {},
+        "backend_kwargs": {},
+    }
+
+    return xr_kwargs
+
+
+def _run_checks(cube):
+    """Run some standard checks."""
+    assert cube.var_name == "q"
+    assert cube.standard_name == "specific_humidity"
+    assert cube.long_name is None
+    coords = cube.coords()
+    coord_names = [coord.standard_name for coord in coords]
+    assert "longitude" in coord_names
+    assert "latitude" in coord_names
+
+
+def test_load_zarr2_local():
+    """Test loading a Zarr2 store from local FS."""
+    zarr_path = (
+        Path(importlib_files("tests"))
+        / "zarr-sample-data"
+        / "example_field_0.zarr2"
+    )
+
+    xr_kwargs = _return_kwargs()
+    zarr_xr = xr.open_dataset(zarr_path, **xr_kwargs)
+    zarr_xr = zarr_xr.unify_chunks()
+
+    conversion_func = ncdata.iris_xarray.cubes_from_xarray
+    cubes = conversion_func(zarr_xr)
+
+    assert len(cubes) == 1
+    cube = cubes[0]
+    _run_checks(cube)
+
+
+def test_load_zarr3_local():
+    """Test loading a Zarr3 store from local FS."""
+    zarr_path = (
+        Path(importlib_files("tests"))
+        / "zarr-sample-data"
+        / "example_field_0.zarr3"
+    )
+
+    xr_kwargs = _return_kwargs()
+    zarr_xr = xr.open_dataset(zarr_path, **xr_kwargs)
+    zarr_xr = zarr_xr.unify_chunks()
+
+    conversion_func = ncdata.iris_xarray.cubes_from_xarray
+    cubes = conversion_func(zarr_xr)
+
+    assert len(cubes) == 1
+    cube = cubes[0]
+    _run_checks(cube)
+
+
+def test_load_remote_zarr():
+    """Test loading a remote Zarr store.
+
+    This is a ~250MB compressed Zarr in an S3 bucket.
+    Conversion is done fully lazily, by passing chunks={}
+    to Xarray loader. Test takes ~3-4s and needs ~400MB res mem.
+    """
+    zarr_path = (
+        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
+        "esmvaltool-zarr/pr_Amon_CNRM-ESM2-1_02Kpd-11_r1i1p2f2_gr_200601-220112.zarr3"
+    )
+
+    xr_kwargs = _return_kwargs()
+    zarr_xr = xr.open_dataset(zarr_path, **xr_kwargs)
+    zarr_xr = zarr_xr.unify_chunks()
+
+    conversion_func = ncdata.iris_xarray.cubes_from_xarray
+    cubes = conversion_func(zarr_xr)
+
+    assert isinstance(cubes, iris.cube.CubeList)
+    assert len(cubes) == 1
+    assert cubes[0].has_lazy_data()
+
+
+def test_load_remote_zarr_realized_data():
+    """Test with the same remote Zarr store but chunks=None."""
+    zarr_path = (
+        "https://uor-aces-o.s3-ext.jc.rl.ac.uk/"
+        "esmvaltool-zarr/pr_Amon_CNRM-ESM2-1_02Kpd-11_r1i1p2f2_gr_200601-220112.zarr3"
+    )
+
+    xr_kwargs = _return_kwargs()
+    xr_kwargs["chunks"] = None
+    zarr_xr = xr.open_dataset(zarr_path, **xr_kwargs)
+
+    conversion_func = ncdata.iris_xarray.cubes_from_xarray
+    msg = (
+        "has fully realized data, if you need lazy data, "
+        "then add chunks={} as argument to Xarray open_dataset."
+    )
+    with pytest.warns(UserWarning, match=re.escape(msg)):
+        conversion_func(zarr_xr)
diff --git a/tests/zarr-sample-data/example_field_0.zarr2/.zattrs b/tests/zarr-sample-data/example_field_0.zarr2/.zattrs
new file mode 100644
index 0000000..bb815de
--- /dev/null
+++ b/tests/zarr-sample-data/example_field_0.zarr2/.zattrs
@@ -0,0 +1,3 @@
+{
+    "Conventions": "CF-1.12"
+}
diff --git a/tests/zarr-sample-data/example_field_0.zarr2/.zgroup b/tests/zarr-sample-data/example_field_0.zarr2/.zgroup
new file mode 100644
index 0000000..3f3fad2
--- /dev/null
+++ b/tests/zarr-sample-data/example_field_0.zarr2/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
diff --git a/tests/zarr-sample-data/example_field_0.zarr2/.zmetadata b/tests/zarr-sample-data/example_field_0.zarr2/.zmetadata
new file mode 100644
index 0000000..ab417b3
--- /dev/null
+++ b/tests/zarr-sample-data/example_field_0.zarr2/.zmetadata
@@ -0,0 +1,171 @@
+{
+    "metadata": {
+        ".zattrs": {
+            "Conventions": "CF-1.12"
+        },
+        ".zgroup": {
+            "zarr_format": 2
+        },
+        "lat/.zarray": {
+            "chunks": [
+                5
+            ],
+            "compressor": {
+                "blocksize": 0,
+                "clevel": 5,
+                "cname": "lz4",
+                "id": "blosc",
+                "shuffle": 1
+            },
+            "dtype": "