diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py new file mode 100644 index 0000000..1dd4138 --- /dev/null +++ b/lib/ncdata/utils/__init__.py @@ -0,0 +1,5 @@ +"""General user utility functions.""" + +from ._save_errors import save_errors + +__all__ = ["save_errors"] diff --git a/lib/ncdata/utils/_save_errors.py b/lib/ncdata/utils/_save_errors.py new file mode 100644 index 0000000..aa2e722 --- /dev/null +++ b/lib/ncdata/utils/_save_errors.py @@ -0,0 +1,217 @@ +"""User utility routines for ncdata.""" +from typing import Dict, List, Union + +import netCDF4 as nc +import numpy as np + +from ncdata import NcData, NcVariable + + +def _name_is_valid(name) -> bool: + result = True + if not isinstance(name, str) or not name: + # Catches non-string (e.g. None, 0, ..) and empty string + result = False + else: + # The name rules for netCDF are not fully clear, but seem *extremely* liberal. + # It seems that "/" is not allowed, and that's about it + # So *allow* whitespace, backslash, initial digit, initial underscore ... + if "/" in name: + result = False + return result + + +def _name_errors(element_container, id_string): + """Check that all elements in the container have valid and consistent names.""" + errors = [] + for name, element in element_container.items(): + if element.name != name: + errors.append( + f"{id_string} element {name!r} has a different element.name : " + f"{element.name!r}." 
+ ) + if not _name_is_valid(name): + errors.append( + f"{id_string}s has an element with an invalid netCDF name : " + f"{name!r}" + ) + return errors + + +_NETCDF_VALID_DTYPES = [np.dtype(key) for key in nc.default_fillvals.keys()] + + +def _valid_attr_dtype(dtype): + # For attributes, we currently accept any kind of string dtype + # We should probably rationalise this, but for now they are converted by netCDF4 + return dtype.kind in "SU" or dtype in _NETCDF_VALID_DTYPES + + +def _invalid_attr_errors( + element: Union[NcData, NcVariable], name_prefix: str +) -> List[str]: + errors = [] + for attr in element.attributes.values(): + dtype = attr.value.dtype + if not _valid_attr_dtype(dtype): + errors.append( + f"{name_prefix} attribute {attr.name!r} has a value which cannot be " + f"saved to netcdf : {attr.value!r} ::dtype={dtype}." + ) + return errors + + +def _variable_errors( + var: NcVariable, var_prefix: str, known_dimensions: Dict[str, int] +) -> List[str]: + errors = [] + if var.data is None: + errors.append(f"{var_prefix} has no data array.") + else: + if var.dtype not in _NETCDF_VALID_DTYPES: + errors.append( + f"{var_prefix} has a dtype which cannot be saved to netcdf : " + f"{var.dtype!r}." + ) + + unknown_dimensions = [ + dim for dim in var.dimensions if dim not in known_dimensions + ] + if unknown_dimensions: + errors.append( + f"{var_prefix} references dimensions which are not found in the " + f"enclosing dataset : {unknown_dimensions!r}" + ) + else: + dims_shape = tuple(known_dimensions[dim] for dim in var.dimensions) + if var.data.shape != dims_shape: + errors.append( + f"{var_prefix} data shape = {var.data.shape}, does not match that " + f"of its dimensions = {dims_shape}." 
+ ) + + # Warn about any unsaveable variable attributes + errors += _invalid_attr_errors(var, var_prefix) + return errors + + +def _save_errors_inner( + ncdata: NcData, + enclosing_dimensions: Dict[str, int] = None, + group_path: str = None, +) -> List[str]: + """ + Scan dataset, with context allowing operation over inner groups. + + Parameters + ---------- + ncdata + data to check + + enclosing_dimensions + A mapping {name:length} of dimensions existing in the enclosing dataset, + within which 'ncdata' is a group + + group_path + The group name or path of ncdata (including its name), when 'ncdata' is a + group within an enclosing dataset + + Returns + ------- + errors + A list of strings describing problems with the dataset + """ + # Construct a name prefix for naming dataset/group attributes + if group_path is None: + group_path = "" + ncdata_identity_prefix = "Dataset" + if ncdata.name: + ncdata_identity_prefix += f"({ncdata.name!r})" + else: + ncdata_identity_prefix = f"Group {group_path!r}" + + if enclosing_dimensions is None: + enclosing_dimensions = {} + + # Add local definitions to the map of available dimensions + # (N.B. inner name duplicates simply replace those from the caller). + known_dimensions = enclosing_dimensions.copy() # don't mutate the passed arg + known_dimensions.update( + {name: dimension.size for name, dimension in ncdata.dimensions.items()} + ) + + # Collect the various detected errors + errors = [] + + # Check that all named containers use only valid names + for component in ("dimension", "variable", "attribute", "group"): + errors += _name_errors( + getattr(ncdata, component + "s"), # N.B. 
pluralise here + id_string=f"{ncdata_identity_prefix} {component}", + ) + + # List all the variable errors + path_context = group_path + if path_context: + path_context += "/" + for var in ncdata.variables.values(): + var_prefix = f"Variable '{path_context}{var.name}'" + errors += _variable_errors(var, var_prefix, known_dimensions) + + # Warn about unsaveable dataset/group attributes + errors += _invalid_attr_errors(ncdata, ncdata_identity_prefix) + + # Recurse over inner groups + if ncdata.groups: + if not group_path: + # prefix inner group paths with the dataset name, if any + group_path = ncdata.name or "" + for group in ncdata.groups.values(): + errors.extend( + _save_errors_inner( + group, + enclosing_dimensions=known_dimensions, + group_path=group_path + f"/{group.name}", + ) + ) + + return errors + + +def save_errors(ncdata: NcData) -> List[str]: + """ + Scan a dataset for its consistency and completeness. + + Reports on anything that will make this fail to save. + If there are any such problems, then an attempt to save the ncdata to a netcdf file + will fail. If there are none, then a save should succeed. + + The checks made are roughly the following + + (1) check names in all components (dimensions, variables, attributes and groups): + + * all names are valid netcdf names + * all element names match their key in the component, + i.e. "component[key].name == key" + + (2) check that all attribute values have netcdf-compatible dtypes. + (E.G. no object or compound (recarray) dtypes). + + (3) check that, for all contained variables : + + * its dimensions are all present in the enclosing dataset + * it has an attached data array, of a netcdf-compatible dtype + * the shape of its data matches the lengths of its dimensions + + Parameters + ---------- + ncdata + data to check + + Returns + ------- + errors + A list of strings, error messages describing problems with the dataset. + If no errors, returns an empty list. 
+ """ + return _save_errors_inner(ncdata) diff --git a/tests/integration/test_xarray_load_and_save_equivalence.py b/tests/integration/test_xarray_load_and_save_equivalence.py index 9786761..1f996d4 100644 --- a/tests/integration/test_xarray_load_and_save_equivalence.py +++ b/tests/integration/test_xarray_load_and_save_equivalence.py @@ -9,6 +9,8 @@ import xarray from ncdata.netcdf4 import from_nc4, to_nc4 +from ncdata.threadlock_sharing import lockshare_context +from ncdata.xarray import from_xarray, to_xarray from tests._compare_nc_datasets import compare_nc_datasets from tests.data_testcase_schemas import ( BAD_LOADSAVE_TESTCASES, @@ -16,9 +18,6 @@ standard_testcase, ) -from ncdata.threadlock_sharing import lockshare_context -from ncdata.xarray import from_xarray, to_xarray - # Avoid complaints that imported fixtures are "unused" # TODO: declare fixtures in usual way in pytest config? standard_testcase, session_testdir @@ -37,9 +36,7 @@ def use_xarraylock(): yield -def test_load_direct_vs_viancdata( - standard_testcase, use_xarraylock, tmp_path -): +def test_load_direct_vs_viancdata(standard_testcase, use_xarraylock, tmp_path): source_filepath = standard_testcase.filepath ncdata = from_nc4(source_filepath) diff --git a/tests/unit/utils/__init__.py b/tests/unit/utils/__init__.py new file mode 100644 index 0000000..7e82563 --- /dev/null +++ b/tests/unit/utils/__init__.py @@ -0,0 +1 @@ +"""Unit tests for :mod:`ncdata.utils`.""" diff --git a/tests/unit/utils/test_save_errors.py b/tests/unit/utils/test_save_errors.py new file mode 100644 index 0000000..044c3e4 --- /dev/null +++ b/tests/unit/utils/test_save_errors.py @@ -0,0 +1,250 @@ +""" +Tests for :mod:`ncdata.utils.save_errors` +""" +import re + +import numpy as np +import pytest + +from ncdata import NcAttribute, NcData, NcDimension, NcVariable +from ncdata.utils import save_errors +from tests.unit.core.test_NcAttribute import attrvalue, datatype, structuretype + +_ = datatype, structuretype + + +def 
_basic_testdata(): + ncdata = NcData( + name="test_ds", + dimensions=[ + NcDimension( + "xxx", 2 + ), # unused first dim, so rename doesn't break variable + NcDimension("x", 3), + ], + variables=[ + NcVariable( + name="vx1", + dimensions=("x"), + data=[1, 2, 3], + attributes=[NcAttribute("xx", 1)], + ) + ], + groups=[NcData("inner")], + attributes=[NcAttribute("x", 1)], + ) + return ncdata + + +do_debug = True + + +# do_debug = False +def debug_errors(errors): + if do_debug and errors: + print("\n\nERROR RESULTS:") + for msg in errors: + print(" ", msg) + + +class TestSaveErrors_Okay: + def test_noerrors_empty(self): + ncdata = NcData() + assert save_errors(ncdata) == [] + + def test_noerrors_basic(self): + ncdata = _basic_testdata() + assert save_errors(ncdata) == [] + + +class TestSaveErrors_Names: + @pytest.fixture(params=["attribute", "dimension", "variable", "group"]) + def component(self, request): + return request.param + + @pytest.mark.parametrize( + "badnametype", ["None", "empty", "number", "object"] + ) + def test_bad_name_type(self, badnametype, component): + bad_name = { + "None": None, + "empty": "", + "number": 3, + "object": ("x", 2), + }[badnametype] + ncdata = _basic_testdata() + elements = getattr(ncdata, component + "s") + element = list(elements.values())[0] + name = element.name + elements.pop(name) + element.name = bad_name + elements.add(element) + + errors = save_errors(ncdata) + debug_errors(errors) + + assert len(errors) == 1 + assert re.search(f"{component}.* invalid netCDF name", errors[0]) + + def test_bad_name_string(self, component): + # Basically, only "/" is banned, at present + ncdata = _basic_testdata() + elements = getattr(ncdata, component + "s") + component_name = list(elements.keys())[0] + elements.rename(component_name, "qq/q") + + errors = save_errors(ncdata) + debug_errors(errors) + + assert len(errors) == 1 + assert re.search(f"{component}.* invalid netCDF name", errors[0]) + + def test_key_name_mismatch(self, component): 
+ ncdata = _basic_testdata() + elements = getattr(ncdata, component + "s") + key, element = list(elements.items())[0] + element.name = "qqq" + + errors = save_errors(ncdata) + debug_errors(errors) + + assert len(errors) == 1 + msg = ( + f"{component} element {key!r} has a different element.name : 'qqq'" + ) + assert re.search(msg, errors[0]) + + +class TestSaveErrors_Attributes: + def test_valid_datatypes(self, datatype, structuretype): + # Check that all expected types + structures of attribute are accepted + if "none" in datatype or "custom" in datatype: + # These produce "unsaveable datatype" errors. + pytest.skip("invalid dtype fails") + value = attrvalue(datatype, structuretype) + ncdata = NcData(attributes=[NcAttribute("x", value)]) + errors = save_errors(ncdata) + assert errors == [] + + @pytest.mark.parametrize( + "context", + [ + "named", + "unnamed", + "group_in_named", + "group_in_unnamed", + "group_of_group", + ], + ) + def test_bad_dataset_attribute(self, context): + # NOTE: using this to test all the Dataset/Group naming constructions + ncdata = _basic_testdata() + ncdata.attributes.add(NcAttribute("q", None)) + if "group" in context: + ncdata = NcData(name="top", groups=[ncdata]) + if context == "group_of_group": + ncdata.name = "middle" + ncdata = NcData(name="top", groups=[ncdata]) + if "unnamed" in context: + # Remove name of top-level dataset (only -- others parts must have names !) + ncdata.name = None + + errors = save_errors(ncdata) + debug_errors(errors) + + expected_id_str = { + "named": "Dataset('test_ds')", + "unnamed": "Dataset", + "group_in_named": "Group 'top/test_ds'", + "group_in_unnamed": "Group '/test_ds'", + "group_of_group": "Group 'top/middle/test_ds'", + }[context] + msg = ( + f"{expected_id_str} attribute 'q' has a value which cannot " + "be saved to netcdf : array(None, dtype=object) ::dtype=object." 
+ ) + assert errors == [msg] + + @pytest.mark.parametrize( + "context", + ["dataset", "group_in_named", "group_in_unnamed", "group_of_group"], + ) + def test_bad_variable_attribute(self, context): + ncdata = _basic_testdata() + ncdata.variables["vx1"].set_attrval("q", None) + if "group" in context: + ncdata = NcData(name="top", groups=[ncdata]) + if context == "group_of_group": + # push it down another level + ncdata.name = "middle" + ncdata = NcData("top", groups=[ncdata]) + if "unnamed" in context: + ncdata.name = None + print(ncdata) + expected_id_str = { + "dataset": "Variable 'vx1'", + "group_in_named": "Variable 'top/test_ds/vx1'", + "group_in_unnamed": "Variable '/test_ds/vx1'", + "group_of_group": "Variable 'top/middle/test_ds/vx1'", + }[context] + + errors = save_errors(ncdata) + debug_errors(errors) + + msg = ( + f"{expected_id_str} attribute 'q' has a value which cannot be saved " + "to netcdf : array(None, dtype=object) ::dtype=object." + ) + assert errors == [msg] + + +class TestSaveErrors_Variables: + def test_missing_data(self): + ncdata = _basic_testdata() + var = list(ncdata.variables.values())[0] + var.data = None + + errors = save_errors(ncdata) + + msg = "Variable 'vx1' has no data array." + assert errors == [msg] + + def test_invalid_dtype(self): + ncdata = _basic_testdata() + var = list(ncdata.variables.values())[0] + arr = np.array([None for _ in var.data]) + var.data = arr + var.dtype = arr.dtype + + errors = save_errors(ncdata) + + msg = "Variable 'vx1' has a dtype which cannot be saved to netcdf : dtype('O')." + assert errors == [msg] + + def test_shape_mismatch(self): + ncdata = _basic_testdata() + var = list(ncdata.variables.values())[0] + var.data = np.zeros((1, 2)) + + errors = save_errors(ncdata) + + msg = ( + "Variable 'vx1' data shape = (1, 2), " + "does not match that of its dimensions = (3,)." 
+ ) + assert errors == [msg] + + @pytest.mark.parametrize("extradims", ["v", "vw"]) + def test_missing_dims(self, extradims): + ncdata = _basic_testdata() + var = list(ncdata.variables.values())[0] + extradims = list(char for char in extradims) + var.dimensions = var.dimensions + tuple(extradims) + + errors = save_errors(ncdata) + + msg = ( + "Variable 'vx1' references dimensions which are not found in the " + f"enclosing dataset : {extradims!r}" + ) + assert errors == [msg] diff --git a/tests/unit/utils/tmp.nc b/tests/unit/utils/tmp.nc new file mode 100644 index 0000000..c3ace5c Binary files /dev/null and b/tests/unit/utils/tmp.nc differ