From b4838bbe7b8648eb0fa4c4c6b40264fde79ccb07 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 5 Dec 2022 16:21:45 +0000 Subject: [PATCH 1/6] First working copy-free xr-iris-bridge. --- lib/iris/experimental/ncxr.py | 576 +++++++++++++++++++++++++++++++++ lib/iris/fileformats/netcdf.py | 63 ++-- 2 files changed, 620 insertions(+), 19 deletions(-) create mode 100644 lib/iris/experimental/ncxr.py diff --git a/lib/iris/experimental/ncxr.py b/lib/iris/experimental/ncxr.py new file mode 100644 index 0000000000..7b6c2cdaa8 --- /dev/null +++ b/lib/iris/experimental/ncxr.py @@ -0,0 +1,576 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Temporary code layer supporting interoperation between Iris and Xarray. + +TODO: replace this with various changes: + * move Iris-agnostic code outside Iris + - into its own repo (where it can be better tested) + - leaving **only** the 'to_xarray' and 'from_xarray' functions. + * add consistency checking + * add "direct" netcdf interfacing, i.e. NcDataset.to_nc/from_nc + +""" +from functools import wraps +from pathlib import Path # noqa +from typing import AnyStr, Dict, Optional, Tuple, Union + +import dask.array as da +import netCDF4 +import numpy as np +import xarray as xr + +import iris +from iris.cube import CubeList +import iris.fileformats.netcdf +import iris.tests as itsts + +# +# A totally basic and naive representation of netCDF data. +# The structure supports groups, variables, attributes. +# The sole limitation here is that data and attributes appear as numpy-compatible +# array-like values (though this may include dask.array.Array), and hence their types +# are modelled as np.dtype's. 
+# + + +class NcGroup: + def __init__( + self, + name: Optional[str] = None, + dimensions: Dict[str, "NcDimension"] = None, + variables: Dict[str, "NcVariable"] = None, + attributes: Dict[str, "NcAttribute"] = None, + groups: Dict[str, "NcGroup"] = None, + ): + self.name: str = name + self.dimensions: Dict[str, "NcDimension"] = dimensions or {} + self.variables: Dict[str, "NcVariable"] = variables or {} + self.attributes: Dict[str, "NcAttribute"] = attributes or {} + self.groups: Dict[str, "NcGroup"] = groups or {} + + +class NcDimension: + def __init__(self, name: str, size: int = 0): + self.name: str = name + self.size: int = size # N.B. we retain the 'zero size means unlimited' + + +class NcVariable: + def __init__( + self, + name: str, + dimensions: Tuple[str] = None, + data: np.ndarray = None, + dtype: np.dtype = None, + attributes: Dict[str, "NcAttribute"] = None, + group: "NcGroup" = None, + ): + self.name = name + self.dimensions = tuple(dimensions or ()) + if data is not None: + if not hasattr(data, "dtype"): + data = np.asanyarray(data) + dtype = data.dtype + self.dtype = dtype + self.data = data # Supports lazy, and normally provides a dtype + self.attributes = attributes or {} + self.group = group + + # # Provide some array-like readonly properties reflected from the data. + # @property + # def dtype(self): + # return self.data.dtype + # + # @property + # def shape(self): + # return self.data.shape + + +class NcAttribute: + def __init__(self, name: str, value): + self.name: str = name + # Attribute values are arraylike, have dtype + # TODO: may need to regularise string representations? 
+ if not hasattr(value, "dtype"): + value = np.asanyarray(value) + self.value: np.ndarray = value + + def _as_python_value(self): + result = self.value + if result.dtype.kind in ("U", "S"): + result = str(result) + if isinstance(result, bytes): + result = result.decode() + return result + + +class NcDataset(NcGroup): + # An interface class providing an NcGroup which can be converted to/from an + # xr.Dataset. This is basically done by adding a small API enabling it to function + # as an Xarray "AbstractDataStore". + # This implies some embedded knowledge of Xarray, but it is very small. + # + # This code pinched from @TomekTrzeciak + # see https://gist.github.com/TomekTrzeciak/b00ff6c9dc301ed6f684990e400d1435 + + def load(self): + variables = {} + for k, v in self.variables.items(): + attrs = { + name: attr._as_python_value() + for name, attr in v.attributes.items() + } + xr_var = xr.Variable( + v.dimensions, v.data, attrs, getattr(v, "encoding", {}) + ) + # TODO: ?possibly? need to apply usual Xarray "encodings" to convert raw + # cf-encoded data into 'normal', interpreted xr.Variables. + if k == "time": + t_bdg = 0 + xr_var = xr.conventions.decode_cf_variable(k, xr_var) + variables[k] = xr_var + attributes = { + name: attr._as_python_value() + for name, attr in self.attributes.items() + } + return variables, attributes + + def store( + self, + variables, + attributes, + check_encoding_set=frozenset(), + writer=None, + unlimited_dims=None, + ): + for k, v in attributes.items(): + if k in self.attributes: # and self.attributes[k] != v: + msg = ( + f're-setting of attribute "{k}" : ' + f"was={self.attributes[k]}, now={v}" + ) + raise ValueError(msg) + else: + self.attributes[k] = NcAttribute(k, v) + for k, v in variables.items(): + if hasattr(v, "ncattrs"): + # An actual netCDF.Variable (?PP, not sure?) 
+ data, dtype, dims, attrs, enc = ( + v[:], + v.datatype, + v.dimensions, + v.ncattrs(), + getattr(v, "encoding", {}), + ) + else: + # An xr.Variable (?PP, not sure?) + # remove all the possible Xarray encodings + # These are all the ones potentially used by + # :func:`xr.conventions.decode_cf_variable`, in the order in which they + # would be applied. + v = xr.conventions.encode_cf_variable( + v, name=k, needs_copy=False + ) + data, dtype, dims, attrs, enc = ( + v.data, + v.dtype, + v.dims, + v.attrs, + v.encoding, + ) + + for dim_name, size in zip(dims, v.shape): + if dim_name in self.dimensions: + if self.dimensions[dim_name].size != size: + raise ValueError( + f"size mismatch for dimension {dim_name!r}: " + f"{self.dimensions[dim_name]} != {size}" + ) + else: + self.dimensions[dim_name] = NcDimension( + dim_name, size=size + ) + + if k in self.variables: + raise ValueError(f'duplicate variable : "{k}"') + attrs = { + name: NcAttribute(name, value) for name, value in attrs.items() + } + nc_var = NcVariable( + name=k, + dimensions=dims, + attributes=attrs, + data=v.data, + group=self, + ) + self.variables[k] = nc_var + + def close(self): + pass + + # + # This interface supports conversion to+from an xarray "Dataset". + # N.B. using the "AbstractDataStore" interface preserves variable contents, being + # either real or lazy arrays. + # + @classmethod + def from_xarray( + cls, dataset_or_file: Union[xr.Dataset, AnyStr, Path], **xr_load_kwargs + ): + if not isinstance(dataset_or_file, xr.Dataset): + # It's a "file" (or pathstring, or Path ?). + dataset_or_file = xr.load_dataset( + dataset_or_file, **xr_load_kwargs + ) + nc_data = cls() + dataset_or_file.dump_to_store(nc_data, **xr_load_kwargs) + return nc_data + + def to_xarray(self, **xr_save_kwargs) -> xr.Dataset: + ds = xr.Dataset.load_store(self, **xr_save_kwargs) + return ds + + +# +# Classes containing NcDataset and NcVariables, but emulating the access APIs of a +# netCDF4.Dataset. 
+# Notes: +# (1) only supports what is required for Iris load/save capability +# (2) we are proposing that this remains private, for now? -- due to (1) +# +class _Nc4DatalikeWithNcattrs: + # A mixin, shared by _Nc4DatasetLike and _Nc4VariableLike, which adds netcdf-like + # attribute operations'ncattrs / setncattr / getncattr', *AND* extends the local + # objects attribute to those things also + # N.B. "self._ncdata" is the underlying NcData object : either an NcDataset or + # NcVariable object. + def ncattrs(self): + return list(self._ncdata.attributes.keys()) + + def getncattr(self, attr): + attrs = self._ncdata.attributes + if attr in attrs: + result = attrs[attr]._as_python_value() + else: + # Don't allow it to issue a KeyError, as this upsets 'getattr' usage. + # Raise an AttributeError instead. + raise AttributeError(attr) + return result + + def setncattr(self, attr, value): + # TODO: are we sure we need this translation ?? + if isinstance(value, bytes): + value = value.decode("utf-8") + # N.B. using the NcAttribute class for storage also ensures/requires that all + # attributes are cast as numpy arrays (so have shape, dtype etc). + self._ncdata.attributes[attr] = NcAttribute(attr, value) + + def __getattr__(self, attr): + # Extend local object attribute access to the ncattrs of the stored data item + # (Yuck, but I think the Iris load code requires it). + return self.getncattr(attr) + + def __setattr__(self, attr, value): + if attr in self._local_instance_props: + # N.B. use _local_instance_props to define standard instance attributes, to avoid a + # possible endless loop here. 
+ super().__setattr__(attr, value) + else: + # # if not hasattr(self, '_allsetattrs'): + # # self._allsetattrs = set() + # self._allsetattrs.add(attr) + self.setncattr(attr, value) + + +class _Nc4DatasetLike(_Nc4DatalikeWithNcattrs): + _local_instance_props = ("_ncdata", "variables") + + def __init__(self, ncdata: NcDataset = None): + if ncdata is None: + ncdata = NcDataset() # an empty dataset + self._ncdata = ncdata + # N.B. we need to create + store our OWN variables, as they are wrappers for + # the underlying NcVariable objects, with different properties. + self.variables = { + name: _Nc4VariableLike._from_ncvariable(ncvar, group=self) + for name, ncvar in self._ncdata.variables.items() + } + + @property + def dimensions(self): + return { + name: dim.size for name, dim in self._ncdata.dimensions.items() + } + + # @property + # def attributes(self): + # return { + # name: attr.value + # for name, attr in self.ncdata.attributes.items() + # } + + @property + def groups(self): + return None # not supported + + # def ncattrs(self): + # return self.attributes + # + # def getncattr(self, attr_name): + # if attr_name in self.attributes: + # return self.attributes[attr_name] + # raise AttributeError(attr_name) + # + # def setncattr(self, attr_name, value): + # if isinstance(value, bytes): + # value = value.decode("utf-8") + # self.ncdata.attributes[attr_name] = NcAttribute(attr_name, value) + # + # Attributes other than the instance-defining "slots" translate to netcdf + # attributes of the underlying ncdata varable + # + def createDimension(self, dimname, size): + if dimname in self.dimensions: + msg = f'creating duplicate dimension "{dimname}".' 
+ raise ValueError(msg) + # if self.dimensions[name] != size: + # raise ValueError(f"size mismatch for dimension {name!r}: " + # f"{self.dimensions[name]} != {size}") + else: + self._ncdata.dimensions[dimname] = NcDimension(dimname, size) + return size + + def createVariable(self, varname, datatype, dimensions=(), **encoding): + if varname in self.variables: + msg = f'creating duplicate variable "{varname}".' + raise ValueError(msg) + # Add a variable into the underlying NcDataset object. + ncvar = NcVariable( + name=varname, + dimensions=dimensions, + group=self._ncdata, + ) + # Note: initially has no data (or attributes), since this is how netCDF4 expects + # to do it. + self._ncdata.variables[varname] = ncvar + # Create a netCDF4-like "wrapper" variable + install that here. + nc4var = _Nc4VariableLike._from_ncvariable( + ncvar, group=self, dtype=datatype + ) + self.variables[varname] = nc4var + return nc4var + + def sync(self): + pass + # for k, v in self.variables.items(): + # if not hasattr(v, 'data'): + # # coordinate system variables are created but not initialized with data by Iris! + # v.data = np.empty(v.shape, dtype=v.datatype) + # v.data[...] = netCDF4.default_fillvals.get(np.dtype(v.datatype).str[1:]) + + def close(self): + self.sync() + + def filepath(self): + # + # Note: for now, let's just not care about this. + # we *might* need this to be an optinoal defined item on an NcDataset ?? + # .. or, we ight need to store an xarray "encoding" somewhere ? + # + # return self.ncdata.encoding.get("source", "") + return "" + + +class _Nc4VariableLike(_Nc4DatalikeWithNcattrs): + _local_instance_props = ("_ncdata", "name", "datatype", "_raw_array") + + def __init__(self, ncvar: NcVariable, datatype: np.dtype): + self._ncdata = ncvar + self.name = ncvar.name + # Note: datatype must be known at creation, which may be before an actual data + # array is assigned on the ncvar. 
+ self.datatype = np.dtype(datatype) + if ncvar.data is None: + # temporary empty data (to support never-written scalar values) + ncvar.data = np.zeros(self.shape, self.datatype) + self[:] = ncvar.data + + @classmethod + def _from_ncvariable( + cls, ncvar: NcVariable, group: NcGroup, dtype: np.dtype = None + ): + if dtype is None: + dtype = ncvar.dtype + self = cls( + ncvar=ncvar, + datatype=dtype, + ) + return self + + # Label this as an 'emulated' netCDF4.Variable, containing an actual (possibly + # lazy) array, which can be directly read/written. + @property + def _raw_array(self): + return self._ncdata.data + + @_raw_array.setter + def _raw_array(self, data): + self._ncdata.data = data + self.datatype = data.dtype + + @property + def group(self): + return self._ncdata.group + + @property + def dimensions(self): + return self._ncdata.dimensions + + # + # "Normal" data access is via indexing. + # + def __getitem__(self, keys): + if keys != slice(None): + raise IndexError(keys) + if self.ndim == 0: + return self._ncdata.data + return self._ncdata.data[keys] + + def __setitem__(self, keys, data): + if keys != slice(None): + raise IndexError(keys) + if not hasattr(data, "dtype"): + raise ValueError(f"nonarray assigned as data : {data}") + if not data.shape == self.shape: + msg = ( + f"assigned data has wrong shape : " + f"{data.shape} instead of {self.shape}" + ) + raise ValueError(msg) + self._ncdata.data = data + self.datatype = data.dtype + # if not self.dimensions and data.ndim != 0: + # # Iris assigns 1-D single element array to 0-D var! + # self.data = np.asarray(data.item()) + # else: + # shape = tuple(self.group.dimensions[d] for d in self.dimensions) + # if data.shape != shape: + # # Iris passes bounds arrays of wrong shape! 
+ # self.data = data.reshape(shape) + # else: + # self.data = data + + @property + def dtype(self): + return self.datatype + + @property + def dims(self): + return self.dimensions + + @property + def ndim(self): + return len(self.dimensions) + + @property + def shape(self): + dims = self.group.dimensions + return tuple(dims[n].size for n in self.dimensions) + + @property + def size(self): + return np.prod(self.shape) + + def chunking(self): + return None + + +def cubes_from_xrds(xrds: xr.Dataset, **xr_load_kwargs): + ncdata = NcDataset.from_xarray(xrds, **xr_load_kwargs) + dslike = _Nc4DatasetLike(ncdata) + cubes = CubeList(iris.fileformats.netcdf.load_cubes(dslike)) + return cubes + + +def cubes_to_xrds(cubes, iris_save_kwargs=None, xr_save_kwargs=None): + iris_save_kwargs = iris_save_kwargs or {} + xr_save_kwargs = xr_save_kwargs or {} + nc4like = _Nc4DatasetLike() + iris.save( + cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs + ) + xrds = nc4like._ncdata.to_xarray(**xr_save_kwargs) + return xrds + + +def example_from_xr(): + iris.FUTURE.datum_support = True + filepath = itsts.get_data_path( + ["NetCDF", "stereographic", "toa_brightness_temperature.nc"] + ) + xrds = xr.open_dataset(filepath, chunks="auto") + print("\nOriginal Xarray dataset:\n", xrds) + cubes = cubes_from_xrds(xrds) + print("\nxrds['time']:\n", xrds["time"]) + print("\n\n") + print("============ CONVERT xr.Dataset TO cubes ... 
=========\n") + print("Cubes:") + print(cubes) + cube = cubes[0] + print("\nCube:") + print(cube) + data = cube.core_data() + print("\ncube.core_data():") + print(data) + # match = data is xrds['data'].data + # print('\ncube.core_data() is xrds["data"].data:') + # print(match) + co_auxlons = cube.coord("longitude") + print('\ncube.coord("longitude"):') + print(co_auxlons) + points = co_auxlons.core_points() + print('\ncube.coord("longitude").core_points():') + print(points) + print('\ncube.coord("longitude").points:') + print(points.compute()) + + print("\n") + print("============ CONVERT cubes TO xr.Dataset ... =========") + print("") + xrds2 = cubes_to_xrds(cubes) + print("\nxrds2:\n", xrds2) + print("\ntime:\n", xrds2["time"]) + + print("\n") + print("============ Array identity checks ... =========") + print( + "xrds2['data'].data is cube.core_data() : ", + bool(xrds2["data"].data is cube.core_data()), + ) + print( + "xrds2['lon'].data is cube.coord('longitude').core_points() : ", + bool(xrds2["lon"].data is cube.coord("longitude").core_points()), + ) + print( + "xrds2['x'].data is cube.coord('projection_x_coordinate').core_points() : ", + bool( + xrds2["x"].data + is cube.coord("projection_x_coordinate").core_points() + ), + ) + print( + "np.all(xrds2['x'].data == cube.coord('projection_x_coordinate').points) : ", + bool( + np.all( + xrds2["x"].data == cube.coord("projection_x_coordinate").points + ) + ), + ) + + +if __name__ == "__main__": + example_from_xr() diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 4efed43db9..490816ce68 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -618,22 +618,32 @@ def _get_cf_var_data(cf_var, filename): # Get lazy chunked data out of a cf variable. 
dtype = _get_actual_dtype(cf_var) - # Create cube with deferred data, but no metadata - fill_value = getattr( - cf_var.cf_data, - "_FillValue", - netCDF4.default_fillvals[cf_var.dtype.str[1:]], - ) - proxy = NetCDFDataProxy( - cf_var.shape, dtype, filename, cf_var.cf_name, fill_value - ) - # Get the chunking specified for the variable : this is either a shape, or - # maybe the string "contiguous". - chunks = cf_var.cf_data.chunking() - # In the "contiguous" case, pass chunks=None to 'as_lazy_data'. - if chunks == "contiguous": - chunks = None - return as_lazy_data(proxy, chunks=chunks) + # Shortcut for 'emulated' netcdf data loading + if hasattr(cf_var.cf_data, "_raw_array"): + # This is a emulated variable, which simply stores an array (possibly lazy) + result = cf_var.cf_data._raw_array + else: + # A 'real' netCDF4.Variable : create a lazy proxy + # Create cube with deferred data, but no metadata + fill_value = getattr( + cf_var.cf_data, + "_FillValue", + netCDF4.default_fillvals[cf_var.dtype.str[1:]], + ) + proxy = NetCDFDataProxy( + cf_var.shape, dtype, filename, cf_var.cf_name, fill_value + ) + + # Get the chunking specified for the variable : this is either a shape, or + # maybe the string "contiguous". + chunks = cf_var.cf_data.chunking() + # In the "contiguous" case, pass chunks=None to 'as_lazy_data'. + if chunks == "contiguous": + chunks = None + + result = as_lazy_data(proxy, chunks=chunks) + + return result class _OrderedAddableList(list): @@ -3015,8 +3025,22 @@ def _lazy_stream_data(data, fill_value, fill_warn, cf_var): # contains just 1 row, so the cf_var is 1D. data = data.squeeze(axis=0) - if is_lazy_data(data): - + if hasattr(cf_var, "_raw_array"): + # The target is not an actual netCDF4.Variable in a file, but an emulation + # object which can store an arraylike (including lazy) directly. + # - transfer the array without any copying (or realisation). + def store(data, cf_var, fill_value): + # Store the data directly on the Variable-like object. 
+ cf_var._raw_array = data + # TODO: for now, just ignore any possible masking issues here, because + # it is tricky, at least for lazy data. In future, we should deal + # with this properly. + is_masked, contains_fill_value = False, False + return is_masked, contains_fill_value + + elif is_lazy_data(data): + # Storing lazy data to an actual netCDF4.Variable in a file. + # - use streaming. def store(data, cf_var, fill_value): # Store lazy data and check whether it is masked and contains # the fill value @@ -3025,7 +3049,8 @@ def store(data, cf_var, fill_value): return target.is_masked, target.contains_value else: - + # Storing real data to an actual netCDF4.Variable in a file. + # - just write the data. def store(data, cf_var, fill_value): cf_var[:] = data is_masked = np.ma.is_masked(data) From de6924be161af4a48d75e516269fe83d1e7d50bb Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 5 Dec 2022 17:46:35 +0000 Subject: [PATCH 2/6] Reorganised into separate sourcefiles. --- lib/iris/experimental/ncxr.py | 576 ------------------ .../experimental/xarray_bridge/__init__.py | 45 ++ .../xarray_bridge/_ncdata_exercise.py | 84 +++ lib/iris/experimental/xarray_bridge/ncdata.py | 207 +++++++ .../xarray_bridge/ncdata_netcdf4_adaptor.py | 236 +++++++ .../experimental/xarray_dataset_wrapper.py | 415 ------------- 6 files changed, 572 insertions(+), 991 deletions(-) delete mode 100644 lib/iris/experimental/ncxr.py create mode 100644 lib/iris/experimental/xarray_bridge/__init__.py create mode 100644 lib/iris/experimental/xarray_bridge/_ncdata_exercise.py create mode 100644 lib/iris/experimental/xarray_bridge/ncdata.py create mode 100644 lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py delete mode 100644 lib/iris/experimental/xarray_dataset_wrapper.py diff --git a/lib/iris/experimental/ncxr.py b/lib/iris/experimental/ncxr.py deleted file mode 100644 index 7b6c2cdaa8..0000000000 --- a/lib/iris/experimental/ncxr.py +++ /dev/null @@ -1,576 +0,0 @@ -# Copyright Iris 
contributors -# -# This file is part of Iris and is released under the LGPL license. -# See COPYING and COPYING.LESSER in the root of the repository for full -# licensing details. -""" -Temporary code layer supporting interoperation between Iris and Xarray. - -TODO: replace this with various changes: - * move Iris-agnostic code outside Iris - - into its own repo (where it can be better tested) - - leaving **only** the 'to_xarray' and 'from_xarray' functions. - * add consistency checking - * add "direct" netcdf interfacing, i.e. NcDataset.to_nc/from_nc - -""" -from functools import wraps -from pathlib import Path # noqa -from typing import AnyStr, Dict, Optional, Tuple, Union - -import dask.array as da -import netCDF4 -import numpy as np -import xarray as xr - -import iris -from iris.cube import CubeList -import iris.fileformats.netcdf -import iris.tests as itsts - -# -# A totally basic and naive representation of netCDF data. -# The structure supports groups, variables, attributes. -# The sole limitation here is that data and attributes appear as numpy-compatible -# array-like values (though this may include dask.array.Array), and hence their types -# are modelled as np.dtype's. -# - - -class NcGroup: - def __init__( - self, - name: Optional[str] = None, - dimensions: Dict[str, "NcDimension"] = None, - variables: Dict[str, "NcVariable"] = None, - attributes: Dict[str, "NcAttribute"] = None, - groups: Dict[str, "NcGroup"] = None, - ): - self.name: str = name - self.dimensions: Dict[str, "NcDimension"] = dimensions or {} - self.variables: Dict[str, "NcVariable"] = variables or {} - self.attributes: Dict[str, "NcAttribute"] = attributes or {} - self.groups: Dict[str, "NcGroup"] = groups or {} - - -class NcDimension: - def __init__(self, name: str, size: int = 0): - self.name: str = name - self.size: int = size # N.B. 
we retain the 'zero size means unlimited' - - -class NcVariable: - def __init__( - self, - name: str, - dimensions: Tuple[str] = None, - data: np.ndarray = None, - dtype: np.dtype = None, - attributes: Dict[str, "NcAttribute"] = None, - group: "NcGroup" = None, - ): - self.name = name - self.dimensions = tuple(dimensions or ()) - if data is not None: - if not hasattr(data, "dtype"): - data = np.asanyarray(data) - dtype = data.dtype - self.dtype = dtype - self.data = data # Supports lazy, and normally provides a dtype - self.attributes = attributes or {} - self.group = group - - # # Provide some array-like readonly properties reflected from the data. - # @property - # def dtype(self): - # return self.data.dtype - # - # @property - # def shape(self): - # return self.data.shape - - -class NcAttribute: - def __init__(self, name: str, value): - self.name: str = name - # Attribute values are arraylike, have dtype - # TODO: may need to regularise string representations? - if not hasattr(value, "dtype"): - value = np.asanyarray(value) - self.value: np.ndarray = value - - def _as_python_value(self): - result = self.value - if result.dtype.kind in ("U", "S"): - result = str(result) - if isinstance(result, bytes): - result = result.decode() - return result - - -class NcDataset(NcGroup): - # An interface class providing an NcGroup which can be converted to/from an - # xr.Dataset. This is basically done by adding a small API enabling it to function - # as an Xarray "AbstractDataStore". - # This implies some embedded knowledge of Xarray, but it is very small. - # - # This code pinched from @TomekTrzeciak - # see https://gist.github.com/TomekTrzeciak/b00ff6c9dc301ed6f684990e400d1435 - - def load(self): - variables = {} - for k, v in self.variables.items(): - attrs = { - name: attr._as_python_value() - for name, attr in v.attributes.items() - } - xr_var = xr.Variable( - v.dimensions, v.data, attrs, getattr(v, "encoding", {}) - ) - # TODO: ?possibly? 
need to apply usual Xarray "encodings" to convert raw - # cf-encoded data into 'normal', interpreted xr.Variables. - if k == "time": - t_bdg = 0 - xr_var = xr.conventions.decode_cf_variable(k, xr_var) - variables[k] = xr_var - attributes = { - name: attr._as_python_value() - for name, attr in self.attributes.items() - } - return variables, attributes - - def store( - self, - variables, - attributes, - check_encoding_set=frozenset(), - writer=None, - unlimited_dims=None, - ): - for k, v in attributes.items(): - if k in self.attributes: # and self.attributes[k] != v: - msg = ( - f're-setting of attribute "{k}" : ' - f"was={self.attributes[k]}, now={v}" - ) - raise ValueError(msg) - else: - self.attributes[k] = NcAttribute(k, v) - for k, v in variables.items(): - if hasattr(v, "ncattrs"): - # An actual netCDF.Variable (?PP, not sure?) - data, dtype, dims, attrs, enc = ( - v[:], - v.datatype, - v.dimensions, - v.ncattrs(), - getattr(v, "encoding", {}), - ) - else: - # An xr.Variable (?PP, not sure?) - # remove all the possible Xarray encodings - # These are all the ones potentially used by - # :func:`xr.conventions.decode_cf_variable`, in the order in which they - # would be applied. 
- v = xr.conventions.encode_cf_variable( - v, name=k, needs_copy=False - ) - data, dtype, dims, attrs, enc = ( - v.data, - v.dtype, - v.dims, - v.attrs, - v.encoding, - ) - - for dim_name, size in zip(dims, v.shape): - if dim_name in self.dimensions: - if self.dimensions[dim_name].size != size: - raise ValueError( - f"size mismatch for dimension {dim_name!r}: " - f"{self.dimensions[dim_name]} != {size}" - ) - else: - self.dimensions[dim_name] = NcDimension( - dim_name, size=size - ) - - if k in self.variables: - raise ValueError(f'duplicate variable : "{k}"') - attrs = { - name: NcAttribute(name, value) for name, value in attrs.items() - } - nc_var = NcVariable( - name=k, - dimensions=dims, - attributes=attrs, - data=v.data, - group=self, - ) - self.variables[k] = nc_var - - def close(self): - pass - - # - # This interface supports conversion to+from an xarray "Dataset". - # N.B. using the "AbstractDataStore" interface preserves variable contents, being - # either real or lazy arrays. - # - @classmethod - def from_xarray( - cls, dataset_or_file: Union[xr.Dataset, AnyStr, Path], **xr_load_kwargs - ): - if not isinstance(dataset_or_file, xr.Dataset): - # It's a "file" (or pathstring, or Path ?). - dataset_or_file = xr.load_dataset( - dataset_or_file, **xr_load_kwargs - ) - nc_data = cls() - dataset_or_file.dump_to_store(nc_data, **xr_load_kwargs) - return nc_data - - def to_xarray(self, **xr_save_kwargs) -> xr.Dataset: - ds = xr.Dataset.load_store(self, **xr_save_kwargs) - return ds - - -# -# Classes containing NcDataset and NcVariables, but emulating the access APIs of a -# netCDF4.Dataset. -# Notes: -# (1) only supports what is required for Iris load/save capability -# (2) we are proposing that this remains private, for now? 
-- due to (1) -# -class _Nc4DatalikeWithNcattrs: - # A mixin, shared by _Nc4DatasetLike and _Nc4VariableLike, which adds netcdf-like - # attribute operations'ncattrs / setncattr / getncattr', *AND* extends the local - # objects attribute to those things also - # N.B. "self._ncdata" is the underlying NcData object : either an NcDataset or - # NcVariable object. - def ncattrs(self): - return list(self._ncdata.attributes.keys()) - - def getncattr(self, attr): - attrs = self._ncdata.attributes - if attr in attrs: - result = attrs[attr]._as_python_value() - else: - # Don't allow it to issue a KeyError, as this upsets 'getattr' usage. - # Raise an AttributeError instead. - raise AttributeError(attr) - return result - - def setncattr(self, attr, value): - # TODO: are we sure we need this translation ?? - if isinstance(value, bytes): - value = value.decode("utf-8") - # N.B. using the NcAttribute class for storage also ensures/requires that all - # attributes are cast as numpy arrays (so have shape, dtype etc). - self._ncdata.attributes[attr] = NcAttribute(attr, value) - - def __getattr__(self, attr): - # Extend local object attribute access to the ncattrs of the stored data item - # (Yuck, but I think the Iris load code requires it). - return self.getncattr(attr) - - def __setattr__(self, attr, value): - if attr in self._local_instance_props: - # N.B. use _local_instance_props to define standard instance attributes, to avoid a - # possible endless loop here. - super().__setattr__(attr, value) - else: - # # if not hasattr(self, '_allsetattrs'): - # # self._allsetattrs = set() - # self._allsetattrs.add(attr) - self.setncattr(attr, value) - - -class _Nc4DatasetLike(_Nc4DatalikeWithNcattrs): - _local_instance_props = ("_ncdata", "variables") - - def __init__(self, ncdata: NcDataset = None): - if ncdata is None: - ncdata = NcDataset() # an empty dataset - self._ncdata = ncdata - # N.B. 
we need to create + store our OWN variables, as they are wrappers for - # the underlying NcVariable objects, with different properties. - self.variables = { - name: _Nc4VariableLike._from_ncvariable(ncvar, group=self) - for name, ncvar in self._ncdata.variables.items() - } - - @property - def dimensions(self): - return { - name: dim.size for name, dim in self._ncdata.dimensions.items() - } - - # @property - # def attributes(self): - # return { - # name: attr.value - # for name, attr in self.ncdata.attributes.items() - # } - - @property - def groups(self): - return None # not supported - - # def ncattrs(self): - # return self.attributes - # - # def getncattr(self, attr_name): - # if attr_name in self.attributes: - # return self.attributes[attr_name] - # raise AttributeError(attr_name) - # - # def setncattr(self, attr_name, value): - # if isinstance(value, bytes): - # value = value.decode("utf-8") - # self.ncdata.attributes[attr_name] = NcAttribute(attr_name, value) - # - # Attributes other than the instance-defining "slots" translate to netcdf - # attributes of the underlying ncdata varable - # - def createDimension(self, dimname, size): - if dimname in self.dimensions: - msg = f'creating duplicate dimension "{dimname}".' - raise ValueError(msg) - # if self.dimensions[name] != size: - # raise ValueError(f"size mismatch for dimension {name!r}: " - # f"{self.dimensions[name]} != {size}") - else: - self._ncdata.dimensions[dimname] = NcDimension(dimname, size) - return size - - def createVariable(self, varname, datatype, dimensions=(), **encoding): - if varname in self.variables: - msg = f'creating duplicate variable "{varname}".' - raise ValueError(msg) - # Add a variable into the underlying NcDataset object. - ncvar = NcVariable( - name=varname, - dimensions=dimensions, - group=self._ncdata, - ) - # Note: initially has no data (or attributes), since this is how netCDF4 expects - # to do it. 
- self._ncdata.variables[varname] = ncvar - # Create a netCDF4-like "wrapper" variable + install that here. - nc4var = _Nc4VariableLike._from_ncvariable( - ncvar, group=self, dtype=datatype - ) - self.variables[varname] = nc4var - return nc4var - - def sync(self): - pass - # for k, v in self.variables.items(): - # if not hasattr(v, 'data'): - # # coordinate system variables are created but not initialized with data by Iris! - # v.data = np.empty(v.shape, dtype=v.datatype) - # v.data[...] = netCDF4.default_fillvals.get(np.dtype(v.datatype).str[1:]) - - def close(self): - self.sync() - - def filepath(self): - # - # Note: for now, let's just not care about this. - # we *might* need this to be an optinoal defined item on an NcDataset ?? - # .. or, we ight need to store an xarray "encoding" somewhere ? - # - # return self.ncdata.encoding.get("source", "") - return "" - - -class _Nc4VariableLike(_Nc4DatalikeWithNcattrs): - _local_instance_props = ("_ncdata", "name", "datatype", "_raw_array") - - def __init__(self, ncvar: NcVariable, datatype: np.dtype): - self._ncdata = ncvar - self.name = ncvar.name - # Note: datatype must be known at creation, which may be before an actual data - # array is assigned on the ncvar. - self.datatype = np.dtype(datatype) - if ncvar.data is None: - # temporary empty data (to support never-written scalar values) - ncvar.data = np.zeros(self.shape, self.datatype) - self[:] = ncvar.data - - @classmethod - def _from_ncvariable( - cls, ncvar: NcVariable, group: NcGroup, dtype: np.dtype = None - ): - if dtype is None: - dtype = ncvar.dtype - self = cls( - ncvar=ncvar, - datatype=dtype, - ) - return self - - # Label this as an 'emulated' netCDF4.Variable, containing an actual (possibly - # lazy) array, which can be directly read/written. 
- @property - def _raw_array(self): - return self._ncdata.data - - @_raw_array.setter - def _raw_array(self, data): - self._ncdata.data = data - self.datatype = data.dtype - - @property - def group(self): - return self._ncdata.group - - @property - def dimensions(self): - return self._ncdata.dimensions - - # - # "Normal" data access is via indexing. - # - def __getitem__(self, keys): - if keys != slice(None): - raise IndexError(keys) - if self.ndim == 0: - return self._ncdata.data - return self._ncdata.data[keys] - - def __setitem__(self, keys, data): - if keys != slice(None): - raise IndexError(keys) - if not hasattr(data, "dtype"): - raise ValueError(f"nonarray assigned as data : {data}") - if not data.shape == self.shape: - msg = ( - f"assigned data has wrong shape : " - f"{data.shape} instead of {self.shape}" - ) - raise ValueError(msg) - self._ncdata.data = data - self.datatype = data.dtype - # if not self.dimensions and data.ndim != 0: - # # Iris assigns 1-D single element array to 0-D var! - # self.data = np.asarray(data.item()) - # else: - # shape = tuple(self.group.dimensions[d] for d in self.dimensions) - # if data.shape != shape: - # # Iris passes bounds arrays of wrong shape! 
- # self.data = data.reshape(shape) - # else: - # self.data = data - - @property - def dtype(self): - return self.datatype - - @property - def dims(self): - return self.dimensions - - @property - def ndim(self): - return len(self.dimensions) - - @property - def shape(self): - dims = self.group.dimensions - return tuple(dims[n].size for n in self.dimensions) - - @property - def size(self): - return np.prod(self.shape) - - def chunking(self): - return None - - -def cubes_from_xrds(xrds: xr.Dataset, **xr_load_kwargs): - ncdata = NcDataset.from_xarray(xrds, **xr_load_kwargs) - dslike = _Nc4DatasetLike(ncdata) - cubes = CubeList(iris.fileformats.netcdf.load_cubes(dslike)) - return cubes - - -def cubes_to_xrds(cubes, iris_save_kwargs=None, xr_save_kwargs=None): - iris_save_kwargs = iris_save_kwargs or {} - xr_save_kwargs = xr_save_kwargs or {} - nc4like = _Nc4DatasetLike() - iris.save( - cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs - ) - xrds = nc4like._ncdata.to_xarray(**xr_save_kwargs) - return xrds - - -def example_from_xr(): - iris.FUTURE.datum_support = True - filepath = itsts.get_data_path( - ["NetCDF", "stereographic", "toa_brightness_temperature.nc"] - ) - xrds = xr.open_dataset(filepath, chunks="auto") - print("\nOriginal Xarray dataset:\n", xrds) - cubes = cubes_from_xrds(xrds) - print("\nxrds['time']:\n", xrds["time"]) - print("\n\n") - print("============ CONVERT xr.Dataset TO cubes ... 
=========\n") - print("Cubes:") - print(cubes) - cube = cubes[0] - print("\nCube:") - print(cube) - data = cube.core_data() - print("\ncube.core_data():") - print(data) - # match = data is xrds['data'].data - # print('\ncube.core_data() is xrds["data"].data:') - # print(match) - co_auxlons = cube.coord("longitude") - print('\ncube.coord("longitude"):') - print(co_auxlons) - points = co_auxlons.core_points() - print('\ncube.coord("longitude").core_points():') - print(points) - print('\ncube.coord("longitude").points:') - print(points.compute()) - - print("\n") - print("============ CONVERT cubes TO xr.Dataset ... =========") - print("") - xrds2 = cubes_to_xrds(cubes) - print("\nxrds2:\n", xrds2) - print("\ntime:\n", xrds2["time"]) - - print("\n") - print("============ Array identity checks ... =========") - print( - "xrds2['data'].data is cube.core_data() : ", - bool(xrds2["data"].data is cube.core_data()), - ) - print( - "xrds2['lon'].data is cube.coord('longitude').core_points() : ", - bool(xrds2["lon"].data is cube.coord("longitude").core_points()), - ) - print( - "xrds2['x'].data is cube.coord('projection_x_coordinate').core_points() : ", - bool( - xrds2["x"].data - is cube.coord("projection_x_coordinate").core_points() - ), - ) - print( - "np.all(xrds2['x'].data == cube.coord('projection_x_coordinate').points) : ", - bool( - np.all( - xrds2["x"].data == cube.coord("projection_x_coordinate").points - ) - ), - ) - - -if __name__ == "__main__": - example_from_xr() diff --git a/lib/iris/experimental/xarray_bridge/__init__.py b/lib/iris/experimental/xarray_bridge/__init__.py new file mode 100644 index 0000000000..1400cf0e17 --- /dev/null +++ b/lib/iris/experimental/xarray_bridge/__init__.py @@ -0,0 +1,45 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Experimental code fror interchanging data with Xarray . 
+ + +TODO: replace this with various changes: + * move Iris-agnostic code outside Iris + - into its own repo (where it can be better tested) + - leaving **only** the 'to_xarray' and 'from_xarray' functions. + * add consistency checking + * add "direct" netcdf interfacing, i.e. NcDataset.to_nc/from_nc + +""" +import iris +from iris.cube import CubeList +import iris.fileformats.netcdf as ifn + +from .ncdata import NcDataset +from .ncdata_netcdf4_adaptor import _Nc4DatasetLike + +# +# The primary conversion interfaces +# + + +def cubes_from_xarray(xrds: "xarray.Dataset", **xr_load_kwargs): # noqa + ncdata = NcDataset.from_xarray(xrds, **xr_load_kwargs) + dslike = _Nc4DatasetLike(ncdata) + cubes = CubeList(ifn.load_cubes(dslike)) + return cubes + + +def cubes_to_xarray(cubes, iris_save_kwargs=None, xr_save_kwargs=None): + iris_save_kwargs = iris_save_kwargs or {} + xr_save_kwargs = xr_save_kwargs or {} + nc4like = _Nc4DatasetLike() + iris.save( + cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs + ) + xrds = nc4like._ncdata.to_xarray(**xr_save_kwargs) + return xrds diff --git a/lib/iris/experimental/xarray_bridge/_ncdata_exercise.py b/lib/iris/experimental/xarray_bridge/_ncdata_exercise.py new file mode 100644 index 0000000000..c301f154d6 --- /dev/null +++ b/lib/iris/experimental/xarray_bridge/_ncdata_exercise.py @@ -0,0 +1,84 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. 
+""" +A temporary proof-of-concept test workflow + +""" +import numpy as np +import xarray as xr + +import iris +from iris.experimental.xarray_bridge import cubes_from_xarray, cubes_to_xarray +import iris.tests as itsts + + +def example_from_xr(): + iris.FUTURE.datum_support = True + filepath = itsts.get_data_path( + ["NetCDF", "stereographic", "toa_brightness_temperature.nc"] + ) + xrds = xr.open_dataset(filepath, chunks="auto") + print("\nOriginal Xarray dataset:\n", xrds) + cubes = cubes_from_xarray(xrds) + print("\nxrds['time']:\n", xrds["time"]) + print("\n\n") + print("============ CONVERT xr.Dataset TO cubes ... =========\n") + print("Cubes:") + print(cubes) + cube = cubes[0] + print("\nCube:") + print(cube) + data = cube.core_data() + print("\ncube.core_data():") + print(data) + # match = data is xrds['data'].data + # print('\ncube.core_data() is xrds["data"].data:') + # print(match) + co_auxlons = cube.coord("longitude") + print('\ncube.coord("longitude"):') + print(co_auxlons) + points = co_auxlons.core_points() + print('\ncube.coord("longitude").core_points():') + print(points) + print('\ncube.coord("longitude").points:') + print(points.compute()) + + print("\n") + print("============ CONVERT cubes TO xr.Dataset ... =========") + print("") + xrds2 = cubes_to_xarray(cubes) + print("\nxrds2:\n", xrds2) + print("\ntime:\n", xrds2["time"]) + + print("\n") + print("============ Array identity checks ... 
=========") + print( + "xrds2['data'].data is cube.core_data() : ", + bool(xrds2["data"].data is cube.core_data()), + ) + print( + "xrds2['lon'].data is cube.coord('longitude').core_points() : ", + bool(xrds2["lon"].data is cube.coord("longitude").core_points()), + ) + print( + "xrds2['x'].data is cube.coord('projection_x_coordinate').core_points() : ", + bool( + xrds2["x"].data + is cube.coord("projection_x_coordinate").core_points() + ), + ) + print( + "np.all(xrds2['x'].data == cube.coord('projection_x_coordinate').points) : ", + bool( + np.all( + xrds2["x"].data == cube.coord("projection_x_coordinate").points + ) + ), + ) + + +if __name__ == "__main__": + example_from_xr() diff --git a/lib/iris/experimental/xarray_bridge/ncdata.py b/lib/iris/experimental/xarray_bridge/ncdata.py new file mode 100644 index 0000000000..2e2b2edaeb --- /dev/null +++ b/lib/iris/experimental/xarray_bridge/ncdata.py @@ -0,0 +1,207 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +An abstract representation of Netcdf data with groups, variables + attributes + +This is also provided with a read/write conversion interface to Xarray. + +TODO: add direct netcdf file interface (easy, but not yet). + +""" +from pathlib import Path +from typing import AnyStr, Dict, Optional, Tuple, Union + +import numpy as np +import xarray as xr + +# +# A totally basic and naive representation of netCDF data. +# The structure supports groups, variables, attributes. +# The sole limitation here is that data and attributes appear as numpy-compatible +# array-like values (though this may include dask.array.Array), and hence their types +# are modelled as np.dtype's. 
+# + + +class NcGroup: + def __init__( + self, + name: Optional[str] = None, + dimensions: Dict[str, "NcDimension"] = None, + variables: Dict[str, "NcVariable"] = None, + attributes: Dict[str, "NcAttribute"] = None, + groups: Dict[str, "NcGroup"] = None, + ): + self.name: str = name + self.dimensions: Dict[str, "NcDimension"] = dimensions or {} + self.variables: Dict[str, "NcVariable"] = variables or {} + self.attributes: Dict[str, "NcAttribute"] = attributes or {} + self.groups: Dict[str, "NcGroup"] = groups or {} + + +class NcDimension: + def __init__(self, name: str, size: int = 0): + self.name: str = name + self.size: int = size # N.B. we retain the 'zero size means unlimited' + + +class NcVariable: + def __init__( + self, + name: str, + dimensions: Tuple[str] = None, + data: np.ndarray = None, + dtype: np.dtype = None, + attributes: Dict[str, "NcAttribute"] = None, + group: "NcGroup" = None, + ): + self.name = name + self.dimensions = tuple(dimensions or ()) + if data is not None: + if not hasattr(data, "dtype"): + data = np.asanyarray(data) + dtype = data.dtype + self.dtype = dtype + self.data = data # Supports lazy, and normally provides a dtype + self.attributes = attributes or {} + self.group = group + + # # Provide some array-like readonly properties reflected from the data. + # @property + # def dtype(self): + # return self.data.dtype + # + # @property + # def shape(self): + # return self.data.shape + + +class NcAttribute: + def __init__(self, name: str, value): + self.name: str = name + # Attribute values are arraylike, have dtype + # TODO: may need to regularise string representations? 
+ if not hasattr(value, "dtype"): + value = np.asanyarray(value) + self.value: np.ndarray = value + + def _as_python_value(self): + result = self.value + if result.dtype.kind in ("U", "S"): + result = str(result) + if isinstance(result, bytes): + result = result.decode() + return result + + +class NcDataset(NcGroup): + # An interface class providing an NcGroup which can be converted to/from an + # xr.Dataset. This is basically done by adding a small API enabling it to function + # as an Xarray "AbstractDataStore". + # This implies some embedded knowledge of Xarray, but it is very small. + # + # This code pinched from @TomekTrzeciak + # see https://gist.github.com/TomekTrzeciak/b00ff6c9dc301ed6f684990e400d1435 + + def load(self): + variables = {} + for k, v in self.variables.items(): + attrs = { + name: attr._as_python_value() + for name, attr in v.attributes.items() + } + xr_var = xr.Variable( + v.dimensions, v.data, attrs, getattr(v, "encoding", {}) + ) + # TODO: ?possibly? need to apply usual Xarray "encodings" to convert raw + # cf-encoded data into 'normal', interpreted xr.Variables. 
+ xr_var = xr.conventions.decode_cf_variable(k, xr_var) + variables[k] = xr_var + attributes = { + name: attr._as_python_value() + for name, attr in self.attributes.items() + } + return variables, attributes + + def store( + self, + variables, + attributes, + check_encoding_set=frozenset(), + writer=None, + unlimited_dims=None, + ): + for attrname, v in attributes.items(): + if attrname in self.attributes: # and self.attributes[k] != v: + msg = ( + f're-setting of attribute "{attrname}" : ' + f"was={self.attributes[attrname]}, now={v}" + ) + raise ValueError(msg) + else: + self.attributes[attrname] = NcAttribute(attrname, v) + + for varname, var in variables.items(): + if varname in self.variables: + raise ValueError(f'duplicate variable : "{varname}"') + + # An xr.Variable : remove all the possible Xarray encodings + # These are all the ones potentially used by + # :func:`xr.conventions.decode_cf_variable`, in the order in which they + # would be applied. + var = xr.conventions.encode_cf_variable( + var, name=varname, needs_copy=False + ) + + for dim_name, size in zip(var.dims, var.shape): + if dim_name in self.dimensions: + if self.dimensions[dim_name].size != size: + raise ValueError( + f"size mismatch for dimension {dim_name!r}: " + f"{self.dimensions[dim_name]} != {size}" + ) + else: + self.dimensions[dim_name] = NcDimension( + dim_name, size=size + ) + + attrs = { + name: NcAttribute(name, value) + for name, value in var.attrs.items() + } + nc_var = NcVariable( + name=varname, + dimensions=var.dims, + attributes=attrs, + data=var.data, + group=self, + ) + self.variables[varname] = nc_var + + def close(self): + pass + + # + # This interface supports conversion to+from an xarray "Dataset". + # N.B. using the "AbstractDataStore" interface preserves variable contents, being + # either real or lazy arrays. 
+ # + @classmethod + def from_xarray( + cls, dataset_or_file: Union[xr.Dataset, AnyStr, Path], **xr_load_kwargs + ): + if not isinstance(dataset_or_file, xr.Dataset): + # It's a "file" (or pathstring, or Path ?). + dataset_or_file = xr.load_dataset( + dataset_or_file, **xr_load_kwargs + ) + nc_data = cls() + dataset_or_file.dump_to_store(nc_data, **xr_load_kwargs) + return nc_data + + def to_xarray(self, **xr_save_kwargs) -> xr.Dataset: + ds = xr.Dataset.load_store(self, **xr_save_kwargs) + return ds diff --git a/lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py b/lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py new file mode 100644 index 0000000000..ac1d878126 --- /dev/null +++ b/lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py @@ -0,0 +1,236 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +An adaptor layer allowing an NcDataset to masquerade as a netCDF4.Dataset + +This is provided primarily to support a re-use of the iris.fileformats.netcdf file +format load + save, to convert cubes to+from iris.experimental.xarray_bridde.ncdata +objects, and hence bridge to xarray.Dataset. + +These classes contain NcDataset and NcVariables, but emulating the access APIs of a +netCDF4.Dataset. + +Notes: + (1) currently only supports what is required for Iris load/save capability + (2) we are proposing that this remains private, for now? -- due to (1) + +""" +import numpy as np + +from .ncdata import NcAttribute, NcDataset, NcDimension, NcVariable + + +class _Nc4DatalikeWithNcattrs: + # A mixin, shared by _Nc4DatasetLike and _Nc4VariableLike, which adds netcdf-like + # attribute operations'ncattrs / setncattr / getncattr', *AND* extends the local + # objects attribute to those things also + # N.B. 
"self._ncdata" is the underlying NcData object : either an NcDataset or + # NcVariable object. + def ncattrs(self): + return list(self._ncdata.attributes.keys()) + + def getncattr(self, attr): + attrs = self._ncdata.attributes + if attr in attrs: + result = attrs[attr]._as_python_value() + else: + # Don't allow it to issue a KeyError, as this upsets 'getattr' usage. + # Raise an AttributeError instead. + raise AttributeError(attr) + return result + + def setncattr(self, attr, value): + # TODO: are we sure we need this translation ?? + if isinstance(value, bytes): + value = value.decode("utf-8") + # N.B. using the NcAttribute class for storage also ensures/requires that all + # attributes are cast as numpy arrays (so have shape, dtype etc). + self._ncdata.attributes[attr] = NcAttribute(attr, value) + + def __getattr__(self, attr): + # Extend local object attribute access to the ncattrs of the stored data item + # (Yuck, but I think the Iris load code requires it). + return self.getncattr(attr) + + def __setattr__(self, attr, value): + if attr in self._local_instance_props: + # N.B. use _local_instance_props to define standard instance attributes, to avoid a + # possible endless loop here. + super().__setattr__(attr, value) + else: + # # if not hasattr(self, '_allsetattrs'): + # # self._allsetattrs = set() + # self._allsetattrs.add(attr) + self.setncattr(attr, value) + + +class _Nc4DatasetLike(_Nc4DatalikeWithNcattrs): + _local_instance_props = ("_ncdata", "variables") + + def __init__(self, ncdata: NcDataset = None): + if ncdata is None: + ncdata = NcDataset() # an empty dataset + self._ncdata = ncdata + # N.B. we need to create + store our OWN variables, as they are wrappers for + # the underlying NcVariable objects, with different properties. 
+ self.variables = { + name: _Nc4VariableLike._from_ncvariable(ncvar) + for name, ncvar in self._ncdata.variables.items() + } + + @property + def dimensions(self): + return { + name: dim.size for name, dim in self._ncdata.dimensions.items() + } + + @property + def groups(self): + return None # not supported + + def createDimension(self, dimname, size): + if dimname in self.dimensions: + msg = f'creating duplicate dimension "{dimname}".' + raise ValueError(msg) + # if self.dimensions[name] != size: + # raise ValueError(f"size mismatch for dimension {name!r}: " + # f"{self.dimensions[name]} != {size}") + else: + self._ncdata.dimensions[dimname] = NcDimension(dimname, size) + return size + + def createVariable(self, varname, datatype, dimensions=(), **encoding): + if varname in self.variables: + msg = f'creating duplicate variable "{varname}".' + raise ValueError(msg) + # Add a variable into the underlying NcDataset object. + ncvar = NcVariable( + name=varname, + dimensions=dimensions, + group=self._ncdata, + ) + # Note: initially has no data (or attributes), since this is how netCDF4 expects + # to do it. + self._ncdata.variables[varname] = ncvar + # Create a netCDF4-like "wrapper" variable + install that here. + nc4var = _Nc4VariableLike._from_ncvariable(ncvar, dtype=datatype) + self.variables[varname] = nc4var + return nc4var + + def sync(self): + pass + + def close(self): + self.sync() + + @staticmethod + def filepath(): + # + # Note: for now, let's just not care about this. + # we *might* need this to be an optinoal defined item on an NcDataset ?? + # .. or, we ight need to store an xarray "encoding" somewhere ? + # TODO: more thought here ? 
+ # return self.ncdata.encoding.get("source", "") + return "" + + +class _Nc4VariableLike(_Nc4DatalikeWithNcattrs): + _local_instance_props = ("_ncdata", "name", "datatype", "_raw_array") + + def __init__(self, ncvar: NcVariable, datatype: np.dtype): + self._ncdata = ncvar + self.name = ncvar.name + # Note: datatype must be known at creation, which may be before an actual data + # array is assigned on the ncvar. + self.datatype = np.dtype(datatype) + if ncvar.data is None: + # temporary empty data (to support never-written scalar values) + # NOTE: significantly, does *not* allocate an actual full array in memory + array = np.zeros(self.shape, self.datatype) + ncvar.data = array + self._raw_array = ncvar.data + + @classmethod + def _from_ncvariable(cls, ncvar: NcVariable, dtype: np.dtype = None): + if dtype is None: + dtype = ncvar.dtype + self = cls( + ncvar=ncvar, + datatype=dtype, + ) + return self + + # Label this as an 'emulated' netCDF4.Variable, containing an actual (possibly + # lazy) array, which can be directly read/written. + @property + def _raw_array(self): + return self._ncdata.data + + @_raw_array.setter + def _raw_array(self, data): + self._ncdata.data = data + self.datatype = data.dtype + + @property + def group(self): + return self._ncdata.group + + @property + def dimensions(self): + return self._ncdata.dimensions + + # + # "Normal" data access is via indexing. + # N.B. we do still need to support this, e.g. for DimCoords ? + # + def __getitem__(self, keys): + if keys != slice(None): + raise IndexError(keys) + if self.ndim == 0: + return self._ncdata.data + return self._ncdata.data[keys] + + # The __setitem__ is not required for normal saving. + # The saver will assign ._raw_array instead + # TODO: might need to support this for future non-Iris usage ? 
+ # + # def __setitem__(self, keys, data): + # if keys != slice(None): + # raise IndexError(keys) + # if not hasattr(data, "dtype"): + # raise ValueError(f"nonarray assigned as data : {data}") + # if not data.shape == self.shape: + # msg = ( + # f"assigned data has wrong shape : " + # f"{data.shape} instead of {self.shape}" + # ) + # raise ValueError(msg) + # self._ncdata.data = data + # self.datatype = data.dtype + + @property + def dtype(self): + return self.datatype + + @property + def dims(self): + return self.dimensions + + @property + def ndim(self): + return len(self.dimensions) + + @property + def shape(self): + dims = self.group.dimensions + return tuple(dims[n].size for n in self.dimensions) + + @property + def size(self): + return np.prod(self.shape) + + def chunking(self): + return None diff --git a/lib/iris/experimental/xarray_dataset_wrapper.py b/lib/iris/experimental/xarray_dataset_wrapper.py deleted file mode 100644 index 455057afad..0000000000 --- a/lib/iris/experimental/xarray_dataset_wrapper.py +++ /dev/null @@ -1,415 +0,0 @@ -# Copyright Iris contributors -# -# This file is part of Iris and is released under the LGPL license. -# See COPYING and COPYING.LESSER in the root of the repository for full -# licensing details. -""" -A wrapper for an xarray.Dataset that simulates a netCDF4.Dataset. -This enables code to read/write xarray data as if it were a netcdf file. - -NOTE: readonly, for now. -TODO: add modify/save functions later. - -NOTE: this code is effectively independent of Iris, and does not really belong. -However, this is a convenient place to test, for now. - -""" -from collections import OrderedDict -from typing import Optional - -import netCDF4 as nc -import numpy as np -import xarray -import xarray as xr - - -class _XrMimic: - """ - An netcdf object "mimic" wrapped around an xarray object, which will be - either a dim, var or dataset. 
- - These (mostly) contain an underlying xarray object, and all potentially - have a name + group (though dataset name is unused). - N.B. name is provided separately, as xr types do not "know" their own names - - e.g. an xr.Variable has no 'name' property. - - We also support object equality checks. - - NOTE: a DimensionMimic, uniquely, does *NOT* in fact contain an xarray - object, so its self._xr == None. See DimensionMimic docstring. - - """ - - def __init__(self, xr, name=None, group=None): - """ - Create a mimic object wrapping a :class:`nco.Ncobj` component. - Note: not all the underlying objects have a name, so provide that - separately. - - """ - self._xr = xr - self._name = name - self._group = group - - @property - def name(self): - return self._name - - def group(self): - return self._group - - def __eq__(self, other): - return self._xr == other._xr - - def __ne__(self, other): - return not self == other - - -class DimensionMimic(_XrMimic): - """ - A Dimension object mimic wrapper. - - Dimension additional properties: length, unlimited - - NOTE: a DimensionMimic does *NOT* contain an xarray object representing the - dimension, because xarray doesn't have such objects. - So, in xarray, you can't rename or modify an existing Dataset dimension. - But you can re-order, add, and remove ones that no variable uses. - - """ - - def __init__(self, name, len, isunlimited=False, group=None): - # Note that there *is* no underlying xarray object. - # So we make up something, to support equality checks. - id_placeholder = (name, len, isunlimited) - super().__init__(xr=id_placeholder, name=name, group=group) - self._len = len # A private version, for now, in case needs change. 
- self._unlimited = isunlimited - - @property - def size(self): - return 0 if self.isunlimited() else self.len - - def __len__(self): - return self._len - - def isunlimited(self): - return self._unlimited - - -class _Nc4AttrsMimic(_XrMimic): - """ - A class mixin for a Mimic with attribute access. - - I.E. shared by variables and datasets. - - """ - - def ncattrs(self): - return self._xr.attrs.keys() # Probably do *not* need/expect a list ? - - def getncattr(self, attr_name): - if attr_name in self._xr.attrs: - result = self._xr.attrs[attr_name] - else: - raise AttributeError() - return result - - def __getattr__(self, attr_name): - return self.getncattr(attr_name) - - # - # writing - # - def setncattr(self, attr_name, value): - if isinstance(value, bytes): - value = value.decode() - self._xr.attrs[attr_name] = value - - # NOTE: not currently supporting ".my_attribute = value" type access. - # def __setattr__(self, attr_name, value): - # self.setncattr(attr_name, value) - - -class VariableMimic(_Nc4AttrsMimic): - """ - A Variable object mimic wrapper. - - Variable additional properties: - dimensions, dtype, data (+ attributes, parent-group) - shape, size, ndim - - """ - - @property - def dtype(self): - return self._xr.dtype - - def chunking(self): - return None - - @property - def datatype(self): - return self.dtype - - @property - def dimensions(self): - return self._xr.dims - - def __getitem__(self, keys): - if self.ndim == 0: - return self._xr.data - else: - return self._xr[keys].data - - @property - def shape(self): - return self._xr.shape - - @property - def ndim(self): - return self._xr.ndim - - @property - def size(self): - return self._xr.size - - # - # writing - # - def __setitem__(self, keys, data): - self._xr[keys] = data - - -class DatasetMimic(_Nc4AttrsMimic): - """ - An object mimicking an netCDF4.Dataset, wrapping an xarray.Dataset. 
- - """ - - def __init__(self, xrds=None): - """ - Create a Dataset mimic, which provides a bridge between the - :class:`netcdf.Dataset` access API and data in the form of an - :class:`xarray.Dataset`. - - Parameters - ---------- - xrds : :class:`xr.Dataset`, optional - If provided, create a DatasetMimic representing the xarray data. - If None, initialise empty. - In either case, the result can be read or written like a - :class:`netcdf.Dataset`. Or, an xarray equivalent can be - regenerated with the :meth:`to_xarray_dataset` method. - - Notes - ----- - Only a limited subset of the :mod:`netCDF4` APIs are currently - supported : just enough to allow Iris to read and write xarray datasets - in place of netcdf files. - - In addition to the netCDF4 read API, you can at any time obtain a - version of the contents in the form of a :class:`xarray.Dataset`, from - the :meth:`DatasetMimic.to_xarray_dataset` method. - """ - if xrds is None: - # Initialise empty dataset if not passed in. - xrds = xr.Dataset() - super().__init__(xrds) - - # Capture original filepath, if known. - self._sourcepath = self._xr.encoding.get("source", "") - - # Keep track of variables which were renamed on creation to prevent - # them being made into coords (which are not writable). 
- self._output_renames = {} - - # Capture existing dimensions in input - unlim_dims = self._xr.encoding.get("unlimited_dims", set()) - self.dimensions = OrderedDict() - for name, len in self._xr.dims.items(): - is_unlim = name in unlim_dims - dim = DimensionMimic(name, len, isunlimited=is_unlim) - self.dimensions[name] = dim - - # Capture existing variables in input - self.variables = OrderedDict() - for name, var in self._xr.variables.items(): - var_mimic = VariableMimic(var, name=name) - self.variables[name] = var_mimic - - def filepath(self) -> str: - return self._sourcepath - - def to_xarray_dataset(self) -> xr.Dataset: - """Get an xarray.Dataset representing the simulated netCDF4.Dataset.""" - ds = self._xr - # Drop the 'extra' coordinate variables which were required to make - # indexing constructions work. - ds = ds.drop_vars(self.dimensions.keys()) - # Rename original dimension coords back to their dimension name. - ds = ds.rename_vars(self._output_renames) - # Apply "nofill" encoding to all the output vars which did do not - # actually provide a '_FillVAlue' attribute. - # TODO: check that a provided fill-value behaves as expected - for varname, var in ds.variables.items(): - # if 'missing_value' in var.attrs: - # print(varname) - # del var.attrs['missing_value'] - if "_FillValue" not in var.attrs: - var.encoding["_FillValue"] = None - return ds - - def groups(self): - # Xarray does not support groups :-( - return None - - def sync(self): - pass - - def close(self): - pass - - @staticmethod - def _dimcoord_adjusted_name(dimname): - return f"_{dimname}_XRDS_RENAMED_" - - # - # modify/write support - # - def createDimension( - self, dimname, size=None, actual_length=0 - ) -> DimensionMimic: - """ - Simulate netCDF4 call. - - N.B. the extra 'actual_length' keyword can be used in conjunction with - size=0, to create an unlimited dimension of known 'current length'. 
- - """ - # NOTE: this does not work in-place, but forces us to replace the - # original dataset. Therefore caller can't use a ref to the original. - # This *could* also mean that DimensionMimics don't work, but in fact - # it is okay since xarray doesn't use dimension objects, and netCDF4 - # anyway requires us to create all the dims *first*. - # TODO: check that 'unlimited' works -- suspect that present code can't - # cope with setting the 'current length' ? - self._xr = self._xr.expand_dims({dimname: size}, -1) - size = size or 0 - is_unlim = size == 0 - actual_length = actual_length or size - if is_unlim: - unlim_dims = self._xr.encoding.setdefault( - "unlimited_dimensions", set() - ) - unlim_dims.add(dimname) - dim = DimensionMimic(dimname, actual_length, is_unlim) - self.dimensions[dimname] = dim - if actual_length > 0: - # NOTE: for now, we are adding an extra index variable on each - # dimension, since this avoids much problems with variables being - # automatically converted to IndexVariables. - # These extra coord variables do *NOT* appear in self.variables, - # and are absent from the dataset produced by 'to_xarray_dataset'. - data = np.arange(actual_length, dtype=int) - self._xr[dimname] = data - return dim - - # Expected default controls in createVariable call, - # from iris.fileformats.netcdf.Saver - _netcdf_saver_defaults = { - "zlib": False, - "complevel": 4, - "shuffle": True, - "fletcher32": False, - "contiguous": False, - "chunksizes": None, - "endian": "native", - "least_significant_digit": None, - "packing": None, - } - - def createVariable( - self, varname, datatype, dimensions=(), fill_value=None, **kwargs - ) -> VariableMimic: - # TODO: kwargs should probably translate into 'encoding' on ds or vars - # FOR NOW: simply check we have no "active" kwargs requesting - # non-default operation. Unfortunately, that involves some - # detailed knowledge of the netCDF4.createVariable interface. 
- for kwarg, val in kwargs.items(): - if kwarg not in self._netcdf_saver_defaults: - msg = ( - "Unrecognised netcdf saver control keyword : " - "{kwarg} = {val}." - ) - raise ValueError(msg) - if val != self._netcdf_saver_defaults[kwarg]: - msg = ( - "Non-default Netcdf saver control setting : " - "{kwarg} = {val}. These controls are not supported by " - "the DatasetMimic." - ) - raise ValueError(msg) - - datatype = np.dtype(datatype) - shape = tuple(self._xr.dims[dimname] for dimname in dimensions) - - # Note: initially create with all-missing data. This can subsequently - # be assigned different values, and even support partial writes. - # TODO: would really like to support Dask arrays here. - if fill_value is not None: - attrs = {"_FillValue": fill_value} - use_fill = fill_value - else: - attrs = {} - dt_code = f"{datatype.kind}{datatype.itemsize}" - use_fill = nc.default_fillvals[dt_code] - data = np.full(shape, fill_value=use_fill, dtype=datatype) - - xr_var = xr.Variable(dims=dimensions, data=data, attrs=attrs) - original_varname = varname - if varname in self._xr.dims: - # We need to avoid creating vars as coords, for which we currently - # use a nasty trick : Insert with a modified name, and rename back - # on output (see 'to_xarray_dataset'). - # TODO: see if xarray provides a cleaner way to get what we want. - alt_varname = f"XDRS_RENAMED_{varname}_" - self._output_renames[alt_varname] = varname - varname = alt_varname - - # Install the var, and immediately re-fetch it, since the internal - # object is *not* generally the same as the one we put in. - self._xr[varname] = xr_var - xr_var = self._xr.variables[varname] - # Create a mimic for interfacing to the xarray.Variable. - var_mimic = VariableMimic(xr_var, name=original_varname) - self.variables[varname] = var_mimic - return var_mimic - - -def fake_nc4python_dataset(xr_group: Optional[xr.Dataset] = None): - """ - Make a wrapper around an xarray Dataset which emulates a - :class:`netCDF4.Dataset`. 
- - The resulting :class:`DatasetMimic` supports essential properties of a - read-mode :class:`netCDF4.Dataset`, enabling an arbitrary netcdf data - structure in memory to be "read" as if it were a file - (i.e. without writing it to disk). - It likewise supports write operations, which translates netCDF4 writes - into operations on the internal xarray dataset. - It can also reproduce its content as a :class:`xarray.Dataset` from its - :meth:`DatasetMimic.to_xarray_dataset` method. - - Parameters - ---------- - xr_group : xarray.Dataset, optional - If given, return a DatasetMimic wrapped around this data. - If absent, return an *empty* (but writeable) DatasetMimic. - - Returns - ------- - dataset : DatasetMimic - - """ - return DatasetMimic(xr_group) From e4a079c58d3594b04fc07fae23e9a8ef7c8b4412 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 7 Dec 2022 17:39:56 +0000 Subject: [PATCH 3/6] Reorg into planned ncdata package structure. --- lib/iris/experimental/ncdata/__init__.py | 42 +++++ .../ncdata.py => ncdata/_core.py} | 12 +- .../dataset_like.py} | 25 +-- lib/iris/experimental/ncdata/xarray.py | 144 ++++++++++++++++++ .../experimental/xarray_bridge/__init__.py | 13 +- 5 files changed, 212 insertions(+), 24 deletions(-) create mode 100644 lib/iris/experimental/ncdata/__init__.py rename lib/iris/experimental/{xarray_bridge/ncdata.py => ncdata/_core.py} (96%) rename lib/iris/experimental/{xarray_bridge/ncdata_netcdf4_adaptor.py => ncdata/dataset_like.py} (89%) create mode 100644 lib/iris/experimental/ncdata/xarray.py diff --git a/lib/iris/experimental/ncdata/__init__.py b/lib/iris/experimental/ncdata/__init__.py new file mode 100644 index 0000000000..4e2d4e93da --- /dev/null +++ b/lib/iris/experimental/ncdata/__init__.py @@ -0,0 +1,42 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. 
+""" +An abstract representation of Netcdf structured data, according to the +"Common Data Model" : https://docs.unidata.ucar.edu/netcdf-java/5.3/userguide/common_data_model_overview.html + +TODO: + * add consistency checking + * add "direct" netcdf interfacing, i.e. to_nc4/from_nc4 + +""" +import iris +from iris.cube import CubeList +import iris.fileformats.netcdf as ifn + +from .dataset_like import Nc4DatasetLike +from .xarray import from_xarray, to_xarray + +# +# The primary conversion interfaces +# + + +def cubes_from_xarray(xrds: "xarray.Dataset", **xr_load_kwargs): # noqa + ncdata = from_xarray(xrds, **xr_load_kwargs) + dslike = Nc4DatasetLike(ncdata) + cubes = CubeList(ifn.load_cubes(dslike)) + return cubes + + +def cubes_to_xarray(cubes, iris_save_kwargs=None, xr_save_kwargs=None): + iris_save_kwargs = iris_save_kwargs or {} + xr_save_kwargs = xr_save_kwargs or {} + nc4like = Nc4DatasetLike() + iris.save( + cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs + ) + xrds = to_xarray(**xr_save_kwargs) + return xrds diff --git a/lib/iris/experimental/xarray_bridge/ncdata.py b/lib/iris/experimental/ncdata/_core.py similarity index 96% rename from lib/iris/experimental/xarray_bridge/ncdata.py rename to lib/iris/experimental/ncdata/_core.py index 2e2b2edaeb..2de791347d 100644 --- a/lib/iris/experimental/xarray_bridge/ncdata.py +++ b/lib/iris/experimental/ncdata/_core.py @@ -26,20 +26,20 @@ # -class NcGroup: +class NcData: def __init__( self, name: Optional[str] = None, dimensions: Dict[str, "NcDimension"] = None, variables: Dict[str, "NcVariable"] = None, attributes: Dict[str, "NcAttribute"] = None, - groups: Dict[str, "NcGroup"] = None, + groups: Dict[str, "NcData"] = None, ): self.name: str = name self.dimensions: Dict[str, "NcDimension"] = dimensions or {} self.variables: Dict[str, "NcVariable"] = variables or {} self.attributes: Dict[str, "NcAttribute"] = attributes or {} - self.groups: Dict[str, "NcGroup"] = groups or {} + self.groups: 
Dict[str, "NcData"] = groups or {} class NcDimension: @@ -56,7 +56,7 @@ def __init__( data: np.ndarray = None, dtype: np.dtype = None, attributes: Dict[str, "NcAttribute"] = None, - group: "NcGroup" = None, + group: "NcData" = None, ): self.name = name self.dimensions = tuple(dimensions or ()) @@ -97,8 +97,8 @@ def _as_python_value(self): return result -class NcDataset(NcGroup): - # An interface class providing an NcGroup which can be converted to/from an +class NcDataset(NcData): + # An interface class providing an NcData which can be converted to/from an # xr.Dataset. This is basically done by adding a small API enabling it to function # as an Xarray "AbstractDataStore". # This implies some embedded knowledge of Xarray, but it is very small. diff --git a/lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py b/lib/iris/experimental/ncdata/dataset_like.py similarity index 89% rename from lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py rename to lib/iris/experimental/ncdata/dataset_like.py index ac1d878126..523cd6ab7d 100644 --- a/lib/iris/experimental/xarray_bridge/ncdata_netcdf4_adaptor.py +++ b/lib/iris/experimental/ncdata/dataset_like.py @@ -4,27 +4,28 @@ # See COPYING and COPYING.LESSER in the root of the repository for full # licensing details. """ -An adaptor layer allowing an NcDataset to masquerade as a netCDF4.Dataset +An adaptor layer allowing an NcData to masquerade as a netCDF4.Dataset object. This is provided primarily to support a re-use of the iris.fileformats.netcdf file -format load + save, to convert cubes to+from iris.experimental.xarray_bridde.ncdata -objects, and hence bridge to xarray.Dataset. +format load + save, to convert cubes to+from ncdata objects, and hence convert Iris + cubes to+from an xarray.Dataset. These classes contain NcDataset and NcVariables, but emulating the access APIs of a netCDF4.Dataset. 
-Notes: - (1) currently only supports what is required for Iris load/save capability - (2) we are proposing that this remains private, for now? -- due to (1) +Note: currently only supports what is required for Iris load/save capability. +It could conceivably be used for data exchange by *other* code that reads or writes +netcdf files, but that may require API support to be extended, depending on what +additional methods might be used. """ import numpy as np -from .ncdata import NcAttribute, NcDataset, NcDimension, NcVariable +from ._core import NcAttribute, NcDataset, NcDimension, NcVariable class _Nc4DatalikeWithNcattrs: - # A mixin, shared by _Nc4DatasetLike and _Nc4VariableLike, which adds netcdf-like + # A mixin, shared by Nc4DatasetLike and Nc4VariableLike, which adds netcdf-like # attribute operations'ncattrs / setncattr / getncattr', *AND* extends the local # objects attribute to those things also # N.B. "self._ncdata" is the underlying NcData object : either an NcDataset or @@ -67,7 +68,7 @@ def __setattr__(self, attr, value): self.setncattr(attr, value) -class _Nc4DatasetLike(_Nc4DatalikeWithNcattrs): +class Nc4DatasetLike(_Nc4DatalikeWithNcattrs): _local_instance_props = ("_ncdata", "variables") def __init__(self, ncdata: NcDataset = None): @@ -77,7 +78,7 @@ def __init__(self, ncdata: NcDataset = None): # N.B. we need to create + store our OWN variables, as they are wrappers for # the underlying NcVariable objects, with different properties. self.variables = { - name: _Nc4VariableLike._from_ncvariable(ncvar) + name: Nc4VariableLike._from_ncvariable(ncvar) for name, ncvar in self._ncdata.variables.items() } @@ -116,7 +117,7 @@ def createVariable(self, varname, datatype, dimensions=(), **encoding): # to do it. self._ncdata.variables[varname] = ncvar # Create a netCDF4-like "wrapper" variable + install that here. 
- nc4var = _Nc4VariableLike._from_ncvariable(ncvar, dtype=datatype) + nc4var = Nc4VariableLike._from_ncvariable(ncvar, dtype=datatype) self.variables[varname] = nc4var return nc4var @@ -137,7 +138,7 @@ def filepath(): return "" -class _Nc4VariableLike(_Nc4DatalikeWithNcattrs): +class Nc4VariableLike(_Nc4DatalikeWithNcattrs): _local_instance_props = ("_ncdata", "name", "datatype", "_raw_array") def __init__(self, ncvar: NcVariable, datatype: np.dtype): diff --git a/lib/iris/experimental/ncdata/xarray.py b/lib/iris/experimental/ncdata/xarray.py new file mode 100644 index 0000000000..7ba83d46bf --- /dev/null +++ b/lib/iris/experimental/ncdata/xarray.py @@ -0,0 +1,144 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Wrapper classes containing NcData, which provide an interface to read/write from an +:class:`xarray.Dataset`. + +This embeds a certain amount of Xarray knowledge (and dependency), hopefully a minimal +amount. The structure of an NcData object makes it fairly painless. + +""" +from pathlib import Path +from typing import AnyStr, Union + +import xarray as xr + +from ._core import NcAttribute, NcData, NcDimension, NcVariable + + +class _XarrayNcDataStore: # (xr.backends.common.AbstractWritableDataStore) + # An interface class providing a subset of the + # :class:`xr.AbstractWritableDataStore` interface, and which converts to/from a + # contained ncdata.NcData. + # This requires some knowledge of Xarray, but it is very small. 
+ # + # This code pinched from @TomekTrzeciak + # see https://gist.github.com/TomekTrzeciak/b00ff6c9dc301ed6f684990e400d1435 + + def __init__(self, ncdata: NcData = None): + if ncdata is None: + ncdata = NcData() + self.ncdata = ncdata + + def load(self): + variables = {} + for k, v in self.ncdata.variables.items(): + attrs = { + name: attr._as_python_value() + for name, attr in v.attributes.items() + } + xr_var = xr.Variable( + v.dimensions, v.data, attrs, getattr(v, "encoding", {}) + ) + # TODO: ?possibly? need to apply usual Xarray "encodings" to convert raw + # cf-encoded data into 'normal', interpreted xr.Variables. + xr_var = xr.conventions.decode_cf_variable(k, xr_var) + variables[k] = xr_var + attributes = { + name: attr._as_python_value() + for name, attr in self.ncdata.attributes.items() + } + return variables, attributes + + def store( + self, + variables, + attributes, + check_encoding_set=frozenset(), + writer=None, + unlimited_dims=None, + ): + for attrname, v in attributes.items(): + if ( + attrname in self.ncdata.attributes + ): # and self.attributes[k] != v: + msg = ( + f're-setting of attribute "{attrname}" : ' + f"was={self.ncdata.attributes[attrname]}, now={v}" + ) + raise ValueError(msg) + else: + self.ncdata.attributes[attrname] = NcAttribute(attrname, v) + + for varname, var in variables.items(): + if varname in self.ncdata.variables: + raise ValueError(f'duplicate variable : "{varname}"') + + # An xr.Variable : remove all the possible Xarray encodings + # These are all the ones potentially used by + # :func:`xr.conventions.decode_cf_variable`, in the order in which they + # would be applied. 
+ var = xr.conventions.encode_cf_variable( + var, name=varname, needs_copy=False + ) + + for dim_name, size in zip(var.dims, var.shape): + if dim_name in self.ncdata.dimensions: + if self.ncdata.dimensions[dim_name].size != size: + raise ValueError( + f"size mismatch for dimension {dim_name!r}: " + f"{self.ncdata.dimensions[dim_name]} != {size}" + ) + else: + self.ncdata.dimensions[dim_name] = NcDimension( + dim_name, size=size + ) + + attrs = { + name: NcAttribute(name, value) + for name, value in var.attrs.items() + } + nc_var = NcVariable( + name=varname, + dimensions=var.dims, + attributes=attrs, + data=var.data, + group=self.ncdata, + ) + self.ncdata.variables[varname] = nc_var + + def close(self): + pass + + # + # This interface supports conversion to+from an xarray "Dataset". + # N.B. using the "AbstractDataStore" interface preserves variable contents, being + # either real or lazy arrays. + # + @classmethod + def from_xarray( + cls, dataset_or_file: Union[xr.Dataset, AnyStr, Path], **xr_load_kwargs + ): + if not isinstance(dataset_or_file, xr.Dataset): + # It's a "file" (or pathstring, or Path ?). 
+ dataset_or_file = xr.load_dataset( + dataset_or_file, **xr_load_kwargs + ) + nc_data = cls() + dataset_or_file.dump_to_store(nc_data, **xr_load_kwargs) + return nc_data + + def to_xarray(self, **xr_save_kwargs) -> xr.Dataset: + ds = xr.Dataset.load_store(self, **xr_save_kwargs) + return ds + + +def to_xarray(ncdata: NcData) -> xr.Dataset: + return _XarrayNcDataStore(ncdata).to_xarray() + + +def from_xarray(xrds: Union[xr.Dataset, Path, AnyStr]) -> NcData: + return _XarrayNcDataStore.from_xarray(xrds).ncdata diff --git a/lib/iris/experimental/xarray_bridge/__init__.py b/lib/iris/experimental/xarray_bridge/__init__.py index 1400cf0e17..55e59e4b0e 100644 --- a/lib/iris/experimental/xarray_bridge/__init__.py +++ b/lib/iris/experimental/xarray_bridge/__init__.py @@ -19,8 +19,9 @@ from iris.cube import CubeList import iris.fileformats.netcdf as ifn -from .ncdata import NcDataset -from .ncdata_netcdf4_adaptor import _Nc4DatasetLike +from ..ncdata.dataset_like import Nc4DatasetLike +from ..ncdata.xarray import from_xarray as ncdata_from_xarray +from ..ncdata.xarray import to_xarray as ncdata_to_xarray # # The primary conversion interfaces @@ -28,8 +29,8 @@ def cubes_from_xarray(xrds: "xarray.Dataset", **xr_load_kwargs): # noqa - ncdata = NcDataset.from_xarray(xrds, **xr_load_kwargs) - dslike = _Nc4DatasetLike(ncdata) + ncdata = ncdata_from_xarray(xrds, **xr_load_kwargs) + dslike = Nc4DatasetLike(ncdata) cubes = CubeList(ifn.load_cubes(dslike)) return cubes @@ -37,9 +38,9 @@ def cubes_from_xarray(xrds: "xarray.Dataset", **xr_load_kwargs): # noqa def cubes_to_xarray(cubes, iris_save_kwargs=None, xr_save_kwargs=None): iris_save_kwargs = iris_save_kwargs or {} xr_save_kwargs = xr_save_kwargs or {} - nc4like = _Nc4DatasetLike() + nc4like = Nc4DatasetLike() iris.save( cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs ) - xrds = nc4like._ncdata.to_xarray(**xr_save_kwargs) + xrds = ncdata_to_xarray(nc4like._ncdata, **xr_save_kwargs) return xrds From 
40d582b23a79e1bced420e7712d40b2f3d2fbf85 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 7 Dec 2022 18:41:57 +0000 Subject: [PATCH 4/6] Small tidy. --- lib/iris/experimental/ncdata/__init__.py | 29 +----- lib/iris/experimental/ncdata/_core.py | 114 +---------------------- 2 files changed, 3 insertions(+), 140 deletions(-) diff --git a/lib/iris/experimental/ncdata/__init__.py b/lib/iris/experimental/ncdata/__init__.py index 4e2d4e93da..dd8f344abe 100644 --- a/lib/iris/experimental/ncdata/__init__.py +++ b/lib/iris/experimental/ncdata/__init__.py @@ -12,31 +12,6 @@ * add "direct" netcdf interfacing, i.e. to_nc4/from_nc4 """ -import iris -from iris.cube import CubeList -import iris.fileformats.netcdf as ifn +from ._core import NcAttribute, NcData, NcDimension, NcVariable -from .dataset_like import Nc4DatasetLike -from .xarray import from_xarray, to_xarray - -# -# The primary conversion interfaces -# - - -def cubes_from_xarray(xrds: "xarray.Dataset", **xr_load_kwargs): # noqa - ncdata = from_xarray(xrds, **xr_load_kwargs) - dslike = Nc4DatasetLike(ncdata) - cubes = CubeList(ifn.load_cubes(dslike)) - return cubes - - -def cubes_to_xarray(cubes, iris_save_kwargs=None, xr_save_kwargs=None): - iris_save_kwargs = iris_save_kwargs or {} - xr_save_kwargs = xr_save_kwargs or {} - nc4like = Nc4DatasetLike() - iris.save( - cubes, nc4like, saver=iris.fileformats.netcdf.save, **iris_save_kwargs - ) - xrds = to_xarray(**xr_save_kwargs) - return xrds +__all__ = ["NcAttribute", "NcData", "NcDimension", "NcVariable"] diff --git a/lib/iris/experimental/ncdata/_core.py b/lib/iris/experimental/ncdata/_core.py index 2de791347d..b5672ae45e 100644 --- a/lib/iris/experimental/ncdata/_core.py +++ b/lib/iris/experimental/ncdata/_core.py @@ -11,11 +11,9 @@ TODO: add direct netcdf file interface (easy, but not yet). 
""" -from pathlib import Path -from typing import AnyStr, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple import numpy as np -import xarray as xr # # A totally basic and naive representation of netCDF data. @@ -95,113 +93,3 @@ def _as_python_value(self): if isinstance(result, bytes): result = result.decode() return result - - -class NcDataset(NcData): - # An interface class providing an NcData which can be converted to/from an - # xr.Dataset. This is basically done by adding a small API enabling it to function - # as an Xarray "AbstractDataStore". - # This implies some embedded knowledge of Xarray, but it is very small. - # - # This code pinched from @TomekTrzeciak - # see https://gist.github.com/TomekTrzeciak/b00ff6c9dc301ed6f684990e400d1435 - - def load(self): - variables = {} - for k, v in self.variables.items(): - attrs = { - name: attr._as_python_value() - for name, attr in v.attributes.items() - } - xr_var = xr.Variable( - v.dimensions, v.data, attrs, getattr(v, "encoding", {}) - ) - # TODO: ?possibly? need to apply usual Xarray "encodings" to convert raw - # cf-encoded data into 'normal', interpreted xr.Variables. 
- xr_var = xr.conventions.decode_cf_variable(k, xr_var) - variables[k] = xr_var - attributes = { - name: attr._as_python_value() - for name, attr in self.attributes.items() - } - return variables, attributes - - def store( - self, - variables, - attributes, - check_encoding_set=frozenset(), - writer=None, - unlimited_dims=None, - ): - for attrname, v in attributes.items(): - if attrname in self.attributes: # and self.attributes[k] != v: - msg = ( - f're-setting of attribute "{attrname}" : ' - f"was={self.attributes[attrname]}, now={v}" - ) - raise ValueError(msg) - else: - self.attributes[attrname] = NcAttribute(attrname, v) - - for varname, var in variables.items(): - if varname in self.variables: - raise ValueError(f'duplicate variable : "{varname}"') - - # An xr.Variable : remove all the possible Xarray encodings - # These are all the ones potentially used by - # :func:`xr.conventions.decode_cf_variable`, in the order in which they - # would be applied. - var = xr.conventions.encode_cf_variable( - var, name=varname, needs_copy=False - ) - - for dim_name, size in zip(var.dims, var.shape): - if dim_name in self.dimensions: - if self.dimensions[dim_name].size != size: - raise ValueError( - f"size mismatch for dimension {dim_name!r}: " - f"{self.dimensions[dim_name]} != {size}" - ) - else: - self.dimensions[dim_name] = NcDimension( - dim_name, size=size - ) - - attrs = { - name: NcAttribute(name, value) - for name, value in var.attrs.items() - } - nc_var = NcVariable( - name=varname, - dimensions=var.dims, - attributes=attrs, - data=var.data, - group=self, - ) - self.variables[varname] = nc_var - - def close(self): - pass - - # - # This interface supports conversion to+from an xarray "Dataset". - # N.B. using the "AbstractDataStore" interface preserves variable contents, being - # either real or lazy arrays. 
- # - @classmethod - def from_xarray( - cls, dataset_or_file: Union[xr.Dataset, AnyStr, Path], **xr_load_kwargs - ): - if not isinstance(dataset_or_file, xr.Dataset): - # It's a "file" (or pathstring, or Path ?). - dataset_or_file = xr.load_dataset( - dataset_or_file, **xr_load_kwargs - ) - nc_data = cls() - dataset_or_file.dump_to_store(nc_data, **xr_load_kwargs) - return nc_data - - def to_xarray(self, **xr_save_kwargs) -> xr.Dataset: - ds = xr.Dataset.load_store(self, **xr_save_kwargs) - return ds From 155c020158ea78e5b98c0f505068315dac17c712 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 7 Dec 2022 18:45:48 +0000 Subject: [PATCH 5/6] Added nc4 interface : N.B. no unlimited dims yet. --- .../ncdata/_nc4_interface_exercise.py | 24 ++++ lib/iris/experimental/ncdata/netcdf4.py | 134 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 lib/iris/experimental/ncdata/_nc4_interface_exercise.py create mode 100644 lib/iris/experimental/ncdata/netcdf4.py diff --git a/lib/iris/experimental/ncdata/_nc4_interface_exercise.py b/lib/iris/experimental/ncdata/_nc4_interface_exercise.py new file mode 100644 index 0000000000..3ed878a5c8 --- /dev/null +++ b/lib/iris/experimental/ncdata/_nc4_interface_exercise.py @@ -0,0 +1,24 @@ +from iris.experimental.ncdata.netcdf4 import from_nc4, to_nc4 +import iris.tests as itsts + + +def example_nc4_roundtrip(): + filepath = itsts.get_data_path( + ["NetCDF", "stereographic", "toa_brightness_temperature.nc"] + ) + ncdata = from_nc4(filepath) + filepath2 = "./temp_nc_output.nc" + to_nc4(ncdata, filepath2) + + # Convert to Iris + compare (a bit of a cheat, but OK for now?) + import iris + + cube1 = iris.load_cube(filepath) + cube2 = iris.load_cube(filepath2) + print("Round-tripped result, as iris cube:") + print(cube2) + print("\nold-file-cube == new-file-cube ? 
", cube1 == cube2) + + +if __name__ == "__main__": + example_nc4_roundtrip() diff --git a/lib/iris/experimental/ncdata/netcdf4.py b/lib/iris/experimental/ncdata/netcdf4.py new file mode 100644 index 0000000000..44553b7e6b --- /dev/null +++ b/lib/iris/experimental/ncdata/netcdf4.py @@ -0,0 +1,134 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the LGPL license. +# See COPYING and COPYING.LESSER in the root of the repository for full +# licensing details. +""" +Code to read/write between NcData and a :class:`netCDF4.Dataset`, or disk file. + +""" +from pathlib import Path +from typing import AnyStr, Union + +import dask.array as da +import netCDF4 as nc + +from iris._lazy_data import as_lazy_data +from iris.fileformats.netcdf import NetCDFDataProxy + +from ._core import NcAttribute, NcData, NcDimension, NcVariable + + +def to_nc4( + ncdata: NcData, nc4_dataset_or_file: Union[nc.Dataset, Path, AnyStr] +): + """ + Write an NcData to a provided (writeable) :class:`netCDF4.Dataset`, or filepath. 
+ """ + caller_owns_dataset = hasattr(nc4_dataset_or_file, "variables") + if caller_owns_dataset: + nc4ds = nc4_dataset_or_file + else: + nc4ds = nc.Dataset(nc4_dataset_or_file, "w") + + try: + for dimname, dim in ncdata.dimensions.items(): + nc4ds.createDimension(dimname, dim.size) + + for varname, var in ncdata.variables.items(): + fillattr = "_FillValue" + if fillattr in var.attributes: + fill_value = var.attributes[fillattr].value + else: + fill_value = None + + nc4var = nc4ds.createVariable( + varname=varname, + datatype=var.dtype, + dimensions=var.dimensions, + fill_value=fill_value + # TODO: needs **kwargs + ) + + data = var.data + if hasattr(data, "compute"): + da.store(data, nc4var) + else: + nc4var[:] = data + + for attrname, attr in var.attributes.items(): + if attrname != "_FillValue": + nc4var.setncattr(attrname, attr._as_python_value()) + + for attrname, attr in ncdata.attributes.items(): + nc4ds.setncattr(attrname, attr._as_python_value()) + + finally: + if not caller_owns_dataset: + nc4ds.close() + + +def from_nc4( + nc4_dataset_or_file: Union[nc.Dataset, nc.Group, Path, AnyStr] +) -> NcData: + """ + Read an NcData from a provided :class:`netCDF4.Dataset`, or filepath. + """ + ncdata = NcData() + caller_owns_dataset = hasattr(nc4_dataset_or_file, "variables") + if caller_owns_dataset: + nc4ds = nc4_dataset_or_file + else: + nc4ds = nc.Dataset(nc4_dataset_or_file) + + try: + for dimname, nc4dim in nc4ds.dimensions.items(): + ncdata.dimensions[dimname] = NcDimension(dimname, nc4dim.size) + + for varname, nc4var in nc4ds.variables.items(): + var = NcVariable( + name=varname, + dimensions=nc4var.dimensions, + dtype=nc4var.dtype, + group=ncdata, + ) + ncdata.variables[varname] = var + + # Assign a data object : for now, always LAZY. 
+ # code shamelessly stolen from iris.fileformats.netcdf + fill_value = getattr( + var, + "_FillValue", + nc.default_fillvals[var.dtype.str[1:]], + ) + shape = tuple( + ncdata.dimensions[dimname].size for dimname in var.dimensions + ) + proxy = NetCDFDataProxy( + shape=shape, + dtype=var.dtype, + path=nc4ds.filepath(), + variable_name=varname, + fill_value=fill_value, + ) + var.data = as_lazy_data(proxy) + + for attrname in nc4var.ncattrs(): + var.attributes[attrname] = NcAttribute( + attrname, nc4var.getncattr(attrname) + ) + + for attrname in nc4ds.ncattrs(): + ncdata.attributes[attrname] = NcAttribute( + attrname, nc4ds.getncattr(attrname) + ) + + # And finally, groups -- by the magic of recursion ... + for grpname, group in nc4ds.groups.items(): + ncdata.groups[grpname] = from_nc4(nc4ds.groups[grpname]) + + finally: + if not caller_owns_dataset: + nc4ds.close() + + return ncdata From a7c7b01e1e9a9f1c59903a7e0ae2c70982264624 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Thu, 8 Dec 2022 10:28:47 +0000 Subject: [PATCH 6/6] Fixes. --- lib/iris/experimental/ncdata/dataset_like.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/iris/experimental/ncdata/dataset_like.py b/lib/iris/experimental/ncdata/dataset_like.py index 523cd6ab7d..d067faa1ad 100644 --- a/lib/iris/experimental/ncdata/dataset_like.py +++ b/lib/iris/experimental/ncdata/dataset_like.py @@ -10,7 +10,7 @@ format load + save, to convert cubes to+from ncdata objects, and hence convert Iris cubes to+from an xarray.Dataset. -These classes contain NcDataset and NcVariables, but emulating the access APIs of a +These classes contain NcData and NcVariables, but emulating the access APIs of a netCDF4.Dataset. Note: currently only supports what is required for Iris load/save capability. 
@@ -21,14 +21,14 @@ """ import numpy as np -from ._core import NcAttribute, NcDataset, NcDimension, NcVariable +from ._core import NcAttribute, NcData, NcDimension, NcVariable class _Nc4DatalikeWithNcattrs: # A mixin, shared by Nc4DatasetLike and Nc4VariableLike, which adds netcdf-like - # attribute operations'ncattrs / setncattr / getncattr', *AND* extends the local + # attribute operations 'ncattrs / setncattr / getncattr', *AND* extends the local # objects attribute to those things also - # N.B. "self._ncdata" is the underlying NcData object : either an NcDataset or + # N.B. "self._ncdata" is the underlying NcData object : either an NcData or # NcVariable object. def ncattrs(self): return list(self._ncdata.attributes.keys()) @@ -71,9 +71,9 @@ def __setattr__(self, attr, value): class Nc4DatasetLike(_Nc4DatalikeWithNcattrs): _local_instance_props = ("_ncdata", "variables") - def __init__(self, ncdata: NcDataset = None): + def __init__(self, ncdata: NcData = None): if ncdata is None: - ncdata = NcDataset() # an empty dataset + ncdata = NcData() # an empty dataset self._ncdata = ncdata # N.B. we need to create + store our OWN variables, as they are wrappers for # the underlying NcVariable objects, with different properties. @@ -107,7 +107,7 @@ def createVariable(self, varname, datatype, dimensions=(), **encoding): if varname in self.variables: msg = f'creating duplicate variable "{varname}".' raise ValueError(msg) - # Add a variable into the underlying NcDataset object. + # Add a variable into the underlying NcData object. ncvar = NcVariable( name=varname, dimensions=dimensions, @@ -131,7 +131,7 @@ def close(self): def filepath(): # # Note: for now, let's just not care about this. - # we *might* need this to be an optinoal defined item on an NcDataset ?? + # we *might* need this to be an optional defined item on an NcData ?? # .. or, we ight need to store an xarray "encoding" somewhere ? # TODO: more thought here ? # return self.ncdata.encoding.get("source", "")