diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index afdaa2b..4455bf1 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -35,7 +35,7 @@ jobs: - name: "Install dependencies" run: | - conda install --yes pytest iris xarray filelock requests + conda install --yes pytest pytest-mock iris xarray filelock requests - name: "Install *latest* Iris" run: | diff --git a/docs/change_log.rst b/docs/change_log.rst index 8a64708..f44b29e 100644 --- a/docs/change_log.rst +++ b/docs/change_log.rst @@ -22,6 +22,9 @@ Unreleased ^^^^^^^^^^ TODO: highlights +* `@pp-mo`_ dataset comparison routines now a public utility. + (`PR#70 `_). + * `@pp-mo`_ initial Sphinx documentation (`PR#76 `_). diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py index 1dd4138..dd3dc8c 100644 --- a/lib/ncdata/utils/__init__.py +++ b/lib/ncdata/utils/__init__.py @@ -1,5 +1,10 @@ """General user utility functions.""" +from ._compare_nc_datasets import dataset_differences, variable_differences from ._save_errors import save_errors -__all__ = ["save_errors"] +__all__ = [ + "save_errors", + "dataset_differences", + "variable_differences", +] diff --git a/lib/ncdata/utils/_compare_nc_datasets.py b/lib/ncdata/utils/_compare_nc_datasets.py new file mode 100644 index 0000000..affd6c1 --- /dev/null +++ b/lib/ncdata/utils/_compare_nc_datasets.py @@ -0,0 +1,530 @@ +""" +Utility for comparing 2 netcdf datasets. + +Works with file-specs, netCDF4.Datasets *or* NcData. + +For purposes of testing ncdata.netcdf4 behaviour. +TODO: one day might be public ? 
+""" + +from pathlib import Path +from typing import AnyStr, List, Union +from warnings import warn + +import netCDF4 +import netCDF4 as nc +import numpy as np + +from ncdata import NcData, NcVariable + + +def dataset_differences( + dataset_or_path_1: Union[Path, AnyStr, nc.Dataset, NcData], + dataset_or_path_2: Union[Path, AnyStr, nc.Dataset, NcData], + check_names: bool = False, + check_dims_order: bool = True, + check_dims_unlimited: bool = True, + check_vars_order: bool = True, + check_attrs_order: bool = True, + check_groups_order: bool = True, + check_var_data: bool = True, + show_n_first_different: int = 2, + suppress_warnings: bool = False, +) -> List[str]: + r""" + Compare netcdf data objects. + + Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\\s or :class:`NcData` objects. + + Parameters + ---------- + dataset_or_path_1, dataset_or_path_2 : str or Path or netCDF4.Dataset or NcData + two datasets to compare, either NcData or netCDF4 + check_dims_order, check_vars_order, check_attrs_order, check_groups_order : bool, default True + If False, no error results from the same contents in a different order, + however unless `suppress_warnings` is True, the error string is issued as a warning. + check_names: bool, default False + Whether to warn if the names of the top-level datasets are different + check_dims_unlimited: bool, default True + Whether to compare the 'unlimited' status of dimensions + check_var_data : bool, default True + If True, all variable data is also checked for equality. + If False, only dtype and shape are compared. + NOTE: comparison of large arrays is done in-memory, so may be highly inefficient. + show_n_first_different: int, default 2 + Number of value differences to display. + suppress_warnings : bool, default False + When False (the default), report changes in content order as Warnings. + When True, ignore changes in ordering. 
+ + Returns + ------- + errs : list of str + A list of "error" strings, describing differences between the inputs. + If empty, no differences were found. + + """ + ds1_was_path = not hasattr(dataset_or_path_1, "variables") + ds2_was_path = not hasattr(dataset_or_path_2, "variables") + ds1, ds2 = None, None + try: + if ds1_was_path: + ds1 = nc.Dataset(dataset_or_path_1) + else: + ds1 = dataset_or_path_1 + + if ds2_was_path: + ds2 = nc.Dataset(dataset_or_path_2) + else: + ds2 = dataset_or_path_2 + + errs = _group_differences( + ds1, + ds2, + group_id_string="Dataset", + dims_order=check_dims_order, + vars_order=check_vars_order, + attrs_order=check_attrs_order, + groups_order=check_groups_order, + data_equality=check_var_data, + suppress_warnings=suppress_warnings, + check_names=check_names, + check_unlimited=check_dims_unlimited, + show_n_diffs=show_n_first_different, + ) + finally: + if ds1_was_path and ds1: + ds1.close() + if ds2_was_path and ds2: + ds2.close() + + return errs + + +def _namelist_differences( + l1, l2, elemname, order_strict=True, suppress_warnings=False +): + errs = [] + msg = f"{elemname} do not match: {list(l1)} != {list(l2)}" + ok = l1 == l2 + ok_except_order = ok + if not ok: + ok_except_order = sorted(l1) == sorted(l2) + + if not ok: + if not ok_except_order or order_strict: + errs.append(msg) + elif ok_except_order and not suppress_warnings: + warn("(Ignoring: " + msg + " )", category=UserWarning) + return errs + + +def _isncdata(obj): + """ + Distinguish NcData objects from similar netCDF4 ones. + + A crude test, used to support comparisons on either type of data. + """ + return hasattr(obj, "_print_content") + + +def _attribute_arrays_eq(a1, a2): + """ + Test equality of array values in attributes. + + Assumes values (attributes) are presented as numpy arrays (not lazy). + Matches any NaNs. + Does *NOT* handle masked data -- which does not occur in attributes. 
+ """ + result = True + result &= a1.shape == a2.shape + result &= a1.dtype == a2.dtype + if result: + if a1.dtype.kind in ("S", "U", "b"): + result = np.all(a1 == a2) + else: + # array_equal handles possible NaN cases + result = np.array_equal(a1, a2, equal_nan=True) + return result + + +def _attribute_differences( + obj1, + obj2, + elemname, + attrs_order=True, + suppress_warnings=False, + force_first_attrnames=None, +) -> List[str]: + """ + Compare attribute name lists. + + Does not return results, but appends error messages to 'errs'. + """ + attrnames, attrnames2 = [ + list(obj.attributes.keys()) if _isncdata(obj) else list(obj.ncattrs()) + for obj in (obj1, obj2) + ] + if attrs_order and force_first_attrnames: + # In order to ignore the order of appearance of *specific* attributes, move + # all those ones to the front in a known order. + def fix_orders(attrlist): + for name in force_first_attrnames[::-1]: + if name in attrlist: + attrlist = [name] + [n for n in attrlist if n != name] + return attrlist + + attrnames = fix_orders(attrnames) + attrnames2 = fix_orders(attrnames2) + + errs = _namelist_differences( + attrnames, + attrnames2, + f"{elemname} attribute lists", + order_strict=attrs_order, + suppress_warnings=suppress_warnings, + ) + + # Compare the attributes themselves (dtypes and values) + for attrname in attrnames: + if attrname not in attrnames2: + # Only compare attributes existing on both inputs. + continue + + attr, attr2 = [ + ( + obj.attributes[attrname].as_python_value() + if _isncdata(obj) + else obj.getncattr(attrname) + ) + for obj in (obj1, obj2) + ] + + # TODO: this still doesn't work well for strings : for those, we should ignore + # exact "type" (including length), and just compare the content. + # TODO: get a good testcase going to check this behaviour + dtype, dtype2 = [ + # Get x.dtype, or fallback on type(x) -- basically, for strings. 
+ getattr(attr, "dtype", type(attr)) + for attr in (attr, attr2) + ] + if all( + isinstance(dt, np.dtype) and dt.kind in "SUb" + for dt in (dtype, dtype2) + ): + dtype = dtype2 = "string" + if dtype != dtype2: + msg = ( + f'{elemname} "{attrname}" attribute datatypes differ : ' + f"{dtype!r} != {dtype2!r}" + ) + errs.append(msg) + else: + # If datatypes match (only then), compare values + # Cast attrs, which might be strings, to arrays for comparison + arr, arr2 = [np.asarray(attr) for attr in (attr, attr2)] + if not _attribute_arrays_eq(arr, arr2): + # N.B. special comparison to handle strings and NaNs + msg = ( + f'{elemname} "{attrname}" attribute values differ : ' + f"{attr!r} != {attr2!r}" + ) + errs.append(msg) + return errs + + +def variable_differences( + v1: NcVariable, + v2: NcVariable, + check_attrs_order: bool = True, + check_var_data: bool = True, + show_n_first_different: int = 2, + suppress_warnings: bool = False, + _group_id_string: str = None, +) -> List[str]: + r""" + Compare variables. + + Parameters + ---------- + v1, v2 : NcVariable + variables to compare + check_attrs_order : bool, default True + If False, no error results from the same contents in a different order, + however unless `suppress_warnings` is True, the error string is issued as a warning. + check_var_data : bool, default True + If True, all variable data is also checked for equality. + If False, only dtype and shape are compared. + NOTE: comparison of large arrays is done in-memory, so may be highly inefficient. + show_n_first_different: int, default 2 + Number of value differences to display. + suppress_warnings : bool, default False + When False (the default), report changes in content order as Warnings. + When True, ignore changes in ordering entirely. + _group_id_string : str + (internal use only) + + Returns + ------- + errs : list of str + A list of "error" strings, describing differences between the inputs. + If empty, no differences were found. 
+ + """ + errs = [] + + show_n_first_different = int(show_n_first_different) + if show_n_first_different < 1: + msg = f"'show_n_diffs' must be >=1 : got {show_n_first_different!r}." + raise ValueError(msg) + + if v1.name == v2.name: + varname = v1.name + else: + varname = f"{v1.name} / {v2.name}" + + if _group_id_string: + var_id_string = f'{_group_id_string} variable "{varname}"' + else: + var_id_string = f'Variable "{varname}"' + + if v1.name != v2.name: + msg = f"{var_id_string} names differ : {v1.name!r} != {v2.name!r}" + errs.append(msg) + + # dimensions + dims, dims2 = [v.dimensions for v in (v1, v2)] + if dims != dims2: + msg = f"{var_id_string} dimensions differ : {dims!r} != {dims2!r}" + errs.append(msg) + + # attributes + errs += _attribute_differences( + v1, + v2, + var_id_string, + attrs_order=check_attrs_order, + suppress_warnings=suppress_warnings, + force_first_attrnames=[ + "_FillValue" + ], # for some reason, this doesn't always list consistently + ) + + # dtypes + dtype, dtype2 = [v.dtype if _isncdata(v) else v.datatype for v in (v1, v2)] + if dtype != dtype2: + msg = f"{var_id_string} datatypes differ : {dtype!r} != {dtype2!r}" + errs.append(msg) + + # data values + is_str, is_str2 = (dt.kind in "SUb" for dt in (dtype, dtype2)) + # TODO: is this correct check to allow compare between different dtypes? + if check_var_data and dims == dims2 and is_str == is_str2: + # N.B. don't check shapes here: we already checked dimensions. + # NOTE: no attempt to use laziness here. Could be improved. + def getdata(var): + if _isncdata(var): + data = var.data + if hasattr(data, "compute"): + data = data.compute() + else: + # expect var to be an actual netCDF4.Variable + # (check for obscure property NOT provided by mimics) + assert hasattr(var, "use_nc_get_vars") + data = var[:] + # Return 0D as 1D, as this makes results simpler to interpret. 
+ if data.ndim == 0: + data = data.flatten() + assert data.shape == (1,) + return data + + data, data2 = (getdata(v) for v in (v1, v2)) + flatdata, flatdata2 = ( + np.asanyarray(arr).flatten() for arr in (data, data2) + ) + + # For simpler checking, use flat versions + flat_diff_inds = ( + [] + ) # NB *don't* make this an array, it causes problems + + # Work out whether string : N.B. array type does not ALWAYS match the + # variable type, because apparently the scalar content of a *masked* scalar + # string variable has a numeric type (!! yuck !!) + is_string_data = flatdata.dtype.kind in ("S", "U") + if is_string_data: + safe_fill_const = "" + else: + safe_fill_const = np.zeros((1,), dtype=flatdata.dtype)[0] + + # Where data is masked, count mask mismatches and skip those points + if any(np.ma.is_masked(arr) for arr in (data, data2)): + mask, mask2 = ( + np.ma.getmaskarray(array) for array in (flatdata, flatdata2) + ) + flat_diff_inds = list(np.where(mask != mask2)[0]) + # Replace all masked points to exclude them from unmasked-point checks. + either_masked = mask | mask2 + flatdata[either_masked] = safe_fill_const + flatdata2[either_masked] = safe_fill_const + + # Where data has NANs, count mismatches and skip (as for masked) + if not is_string_data: + isnans, isnans2 = (np.isnan(arr) for arr in (flatdata, flatdata2)) + if np.any(isnans) or np.any(isnans2): + nandiffs = np.where(isnans != isnans2)[0] + if nandiffs: + flat_diff_inds += list(nandiffs) + anynans = isnans | isnans2 + flatdata[anynans] = safe_fill_const + flatdata2[anynans] = safe_fill_const + + flat_diff_inds += list(np.where(flatdata != flatdata2)[0]) + # Order the nonmatching indices : We report just the first few ... + flat_diff_inds = sorted(flat_diff_inds) + n_diffs = len(flat_diff_inds) + if n_diffs: + msg = ( + f"{var_id_string} data contents differ, at {n_diffs} points: " + ) + ellps = ", ..." 
if n_diffs > show_n_first_different else ""
+                diffinds = flat_diff_inds[:show_n_first_different]
+                diffinds = [
+                    np.unravel_index(ind, shape=data.shape) for ind in diffinds
+                ]
+                diffinds_str = ", ".join(repr(tuple(x)) for x in diffinds)
+                inds_str = f"[{diffinds_str}{ellps}]"
+                points_lhs_str = ", ".join(repr(data[ind]) for ind in diffinds)
+                points_rhs_str = ", ".join(repr(data2[ind]) for ind in diffinds)
+                points_lhs_str = f"[{points_lhs_str}{ellps}]"
+                points_rhs_str = f"[{points_rhs_str}{ellps}]"
+                msg += (
+                    f"@INDICES{inds_str}"
+                    f" : LHS={points_lhs_str}, RHS={points_rhs_str}"
+                )
+                errs.append(msg)
+    return errs
+
+
+def _group_differences(
+    g1: Union[netCDF4.Dataset, netCDF4.Group],
+    g2: Union[netCDF4.Dataset, netCDF4.Group],
+    group_id_string: str,
+    dims_order: bool = True,
+    vars_order: bool = True,
+    attrs_order: bool = True,
+    groups_order: bool = True,
+    data_equality: bool = True,
+    suppress_warnings: bool = False,
+    check_names: bool = False,
+    check_unlimited: bool = True,
+    show_n_diffs: int = 2,
+) -> List[str]:
+    """
+    Inner routine to compare either whole datasets or subgroups.
+
+    Builds and returns a list of error strings, accumulating the lists
+    returned by recursive calls on any subgroups.
+    """
+    errs = []
+
+    if check_names:
+        if g1.name != g2.name:
+            errs.append(
+                f"Datasets have different names: {g1.name!r} != {g2.name!r}."
+ ) + # Compare lists of dimension names + dimnames, dimnames2 = [list(grp.dimensions.keys()) for grp in (g1, g2)] + errs += _namelist_differences( + dimnames, + dimnames2, + f"{group_id_string} dimension lists", + order_strict=dims_order, + suppress_warnings=suppress_warnings, + ) + + # Compare the dimensions themselves + for dimname in dimnames: + if dimname not in dimnames2: + continue + d1, d2 = [grp.dimensions[dimname] for grp in (g1, g2)] + dimlen, dimlen2 = [dim.size for dim in (d1, d2)] + if dimlen != dimlen2: + msg = ( + f'{group_id_string} "{dimname}" dimensions ' + f"have different sizes: {dimlen} != {dimlen2}" + ) + errs.append(msg) + + if check_unlimited: + unlim1, unlim2 = [ + dim.unlimited if _isncdata(dim) else dim.isunlimited() + for dim in (d1, d2) + ] + if unlim1 != unlim2: + msg = ( + f'{group_id_string} "{dimname}" dimension ' + f'has different "unlimited" status : {unlim1} != {unlim2}' + ) + errs.append(msg) + + # Compare file attributes + errs += _attribute_differences( + g1, + g2, + group_id_string, + attrs_order=attrs_order, + suppress_warnings=suppress_warnings, + ) + + # Compare lists of variables + varnames, varnames2 = [list(grp.variables.keys()) for grp in (g1, g2)] + errs += _namelist_differences( + varnames, + varnames2, + f"{group_id_string} variable lists", + order_strict=vars_order, + suppress_warnings=suppress_warnings, + ) + + # Compare the variables themselves + for varname in varnames: + if varname not in varnames2: + continue + v1, v2 = [grp.variables[varname] for grp in (g1, g2)] + errs += variable_differences( + v1, + v2, + check_attrs_order=attrs_order, + check_var_data=data_equality, + show_n_first_different=show_n_diffs, + suppress_warnings=suppress_warnings, + _group_id_string=group_id_string, + ) + + # Finally, recurse over groups + grpnames, grpnames2 = [list(grp.groups.keys()) for grp in (g1, g2)] + errs += _namelist_differences( + grpnames, + grpnames2, + f"{group_id_string} subgroup lists", + 
order_strict=groups_order,
+        suppress_warnings=suppress_warnings,
+    )
+    for grpname in grpnames:
+        if grpname not in grpnames2:
+            continue
+        grp1, grp2 = [grp.groups[grpname] for grp in (g1, g2)]
+        errs += _group_differences(
+            grp1, grp2,
+            group_id_string=f"{group_id_string}/{grpname}",
+            dims_order=dims_order,
+            vars_order=vars_order,
+            attrs_order=attrs_order,
+            groups_order=groups_order,
+            data_equality=data_equality,
+            # N.B. propagate suppression into subgroups: previously omitted,
+            suppress_warnings=suppress_warnings,
+            check_unlimited=check_unlimited,
+            show_n_diffs=show_n_diffs,
+        )
+    return errs
diff --git a/tests/_compare_nc_datasets.py b/tests/_compare_nc_datasets.py
deleted file mode 100644
index 21ea635..0000000
--- a/tests/_compare_nc_datasets.py
+++ /dev/null
@@ -1,459 +0,0 @@
-"""
-Utility for comparing 2 netcdf datasets.
-
-Works with file-specs, netCDF4.Datasets *or* NcData.
-
-For purposes of testing ncdata.netcdf4 behaviour.
-TODO: one day might be public ?
-"""
-
-from pathlib import Path
-from typing import AnyStr, List, Union
-from warnings import warn
-
-import netCDF4
-import netCDF4 as nc
-import numpy as np
-
-from ncdata import NcData
-
-
-def compare_nc_datasets(
-    dataset_or_path_1: Union[Path, AnyStr, nc.Dataset, NcData],
-    dataset_or_path_2: Union[Path, AnyStr, nc.Dataset, NcData],
-    check_dims_order: bool = True,
-    check_vars_order: bool = True,
-    check_attrs_order: bool = True,
-    check_groups_order: bool = True,
-    check_var_data: bool = True,
-    suppress_warnings: bool = False,
-) -> List[str]:
-    r"""
-    Compare netcdf data.
-
-    Accepts paths, pathstrings, open :class:`netCDF4.Dataset`\\s or :class:`NcData` objects.
-
-    Parameters
-    ----------
-    dataset_or_path_1, dataset_or_path_2 : str or Path or netCDF4.Dataset or NcData
-        two datasets to compare, either NcData or netCDF4
-    check_dims_order, check_vars_order, check_attrs_order, check_groups_order : bool, default True
-        If False, no error results from the same contents in a different order,
-        however unless `suppress_warnings` is True, the error string is issued as a warning.
- check_var_data : bool, default True - If True, all variable data is also checked for equality. - If False, only dtype and shape are compared. - suppress_warnings : bool, default False - When False (the default), report changes in content order as Warnings. - When True, ignore changes in ordering. - - Returns - ------- - errs : list of str - a list of error strings. - If empty, no differences were found. - - """ - ds1_was_path = not hasattr(dataset_or_path_1, "variables") - ds2_was_path = not hasattr(dataset_or_path_2, "variables") - ds1, ds2 = None, None - try: - if ds1_was_path: - ds1 = nc.Dataset(dataset_or_path_1) - else: - ds1 = dataset_or_path_1 - - if ds2_was_path: - ds2 = nc.Dataset(dataset_or_path_2) - else: - ds2 = dataset_or_path_2 - - errs = [] - _compare_nc_groups( - errs, - ds1, - ds2, - group_id_string="Dataset", - dims_order=check_dims_order, - vars_order=check_vars_order, - attrs_order=check_attrs_order, - groups_order=check_groups_order, - data_equality=check_var_data, - suppress_warnings=suppress_warnings, - ) - finally: - if ds1_was_path and ds1: - ds1.close() - if ds2_was_path and ds2: - ds2.close() - - return errs - - -def _compare_name_lists( - errslist, l1, l2, elemname, order_strict=True, suppress_warnings=False -): - msg = f"{elemname} do not match: {list(l1)} != {list(l2)}" - ok = l1 == l2 - ok_except_order = ok - if not ok: - ok_except_order = sorted(l1) == sorted(l2) - - if not ok: - if not ok_except_order or order_strict: - errslist.append(msg) - elif ok_except_order and not suppress_warnings: - warn("(Ignoring: " + msg + " )", category=UserWarning) - - -def _isncdata(obj): - """ - Distinguish NcData objects from similar netCDF4 ones. - - A crude test, used to support comparisons on either type of data. - """ - return hasattr(obj, "_print_content") - - -def _array_eq(a1, a2): - """ - Test equality of array values in attributes. - - Assumes values (attributes) are presented as numpy arrays (not lazy). - Matches any NaNs. 
- Does *NOT* handle masked data -- which does not occur in attributes. - """ - result = True - result &= a1.shape == a2.shape - result &= a1.dtype == a2.dtype - if result: - if a1.dtype.kind in ("S", "U", "b"): - result = np.all(a1 == a2) - else: - # array_equal handles possible NaN cases - result = np.array_equal(a1, a2, equal_nan=True) - return result - - -def _compare_attributes( - errs, - obj1, - obj2, - elemname, - attrs_order=True, - suppress_warnings=False, - force_first_attrnames=None, -): - """ - Compare attribute name lists. - - Does not return results, but appends error messages to 'errs'. - """ - attrnames, attrnames2 = [ - obj.attributes.keys() if _isncdata(obj) else obj.ncattrs() - for obj in (obj1, obj2) - ] - if attrs_order and force_first_attrnames: - - def fix_orders(attrlist): - for name in force_first_attrnames[::-1]: - if name in attrlist: - attrlist = [name] + [n for n in attrlist if n != name] - return attrlist - - attrnames = fix_orders(attrnames) - attrnames2 = fix_orders(attrnames2) - - _compare_name_lists( - errs, - attrnames, - attrnames2, - f"{elemname} attribute lists", - order_strict=attrs_order, - suppress_warnings=suppress_warnings, - ) - - # Compare the attributes themselves (dtypes and values) - for attrname in attrnames: - if attrname not in attrnames2: - # Only compare attributes existing on both inputs. - continue - - attr, attr2 = [ - ( - obj.attributes[attrname].as_python_value() - if _isncdata(obj) - else obj.getncattr(attrname) - ) - for obj in (obj1, obj2) - ] - - # TODO: this still doesn't work well for strings : for those, we should ignore - # exact "type" (including length), and just compare the content. - # TODO: get a good testcase going to check this behaviour - dtype, dtype2 = [ - # Get x.dtype, or fallback on type(x) -- basically, for strings. 
- getattr(attr, "dtype", type(attr)) - for attr in (attr, attr2) - ] - if all( - isinstance(dt, np.dtype) and dt.kind in "SUb" - for dt in (dtype, dtype2) - ): - dtype = dtype2 = "string" - if dtype != dtype2: - msg = ( - f'{elemname} "{attrname}" attribute datatypes differ : ' - f"{dtype!r} != {dtype2!r}" - ) - errs.append(msg) - else: - # If datatypes match (only then), compare values - # Cast attrs, which might be strings, to arrays for comparison - arr, arr2 = [np.asarray(attr) for attr in (attr, attr2)] - if not _array_eq(arr, arr2): - # N.B. special comparison to handle strings and NaNs - msg = ( - f'{elemname} "{attrname}" attribute values differ : ' - f"{attr!r} != {attr2!r}" - ) - errs.append(msg) - - -def _compare_nc_groups( - errs: List[str], - g1: Union[netCDF4.Dataset, netCDF4.Group], - g2: Union[netCDF4.Dataset, netCDF4.Group], - group_id_string: str, - dims_order: bool = True, - vars_order: bool = True, - attrs_order: bool = True, - groups_order: bool = True, - data_equality: bool = True, - suppress_warnings: bool = False, -): - """ - Inner routine to compare either whole datasets or subgroups. - - Note that, rather than returning a list of error strings, it appends them to the - passed arg `errs`. This just makes recursive calling easier. 
- """ - # Compare lists of dimension names - dimnames, dimnames2 = [list(grp.dimensions.keys()) for grp in (g1, g2)] - _compare_name_lists( - errs, - dimnames, - dimnames2, - f"{group_id_string} dimension lists", - order_strict=dims_order, - suppress_warnings=suppress_warnings, - ) - - # Compare the dimensions themselves - for dimname in dimnames: - if dimname not in dimnames2: - continue - d1, d2 = [grp.dimensions[dimname] for grp in (g1, g2)] - dimlen, dimlen2 = [dim.size for dim in (d1, d2)] - if dimlen != dimlen2: - msg = ( - f'{group_id_string} "{dimname}" dimensions ' - f"have different sizes: {dimlen} != {dimlen2}" - ) - errs.append(msg) - - # Compare file attributes - _compare_attributes( - errs, - g1, - g2, - group_id_string, - attrs_order=attrs_order, - suppress_warnings=suppress_warnings, - ) - - # Compare lists of variables - varnames, varnames2 = [list(grp.variables.keys()) for grp in (g1, g2)] - _compare_name_lists( - errs, - varnames, - varnames2, - f"{group_id_string} variable lists", - order_strict=dims_order, - suppress_warnings=suppress_warnings, - ) - - # Compare the variables themselves - for varname in varnames: - if varname not in varnames2: - continue - v1, v2 = [grp.variables[varname] for grp in (g1, g2)] - - var_id_string = f'{group_id_string} variable "{varname}"' - - # dimensions - dims, dims2 = [v.dimensions for v in (v1, v2)] - if dims != dims2: - msg = f"{var_id_string} dimensions differ : {dims!r} != {dims2!r}" - - # attributes - _compare_attributes( - errs, - v1, - v2, - var_id_string, - attrs_order=attrs_order, - suppress_warnings=suppress_warnings, - force_first_attrnames=[ - "_FillValue" - ], # for some reason, this doesn't always list consistently - ) - - # dtypes - dtype, dtype2 = [ - v.dtype if _isncdata(v) else v.datatype for v in (v1, v2) - ] - if dtype != dtype2: - msg = f"{var_id_string} datatypes differ : {dtype!r} != {dtype2!r}" - errs.append(msg) - - # data values - is_str, is_str2 = (dt.kind in "SUb" for dt in (dtype, 
dtype2)) - # TODO: is this correct check to allow compare between different dtypes? - if data_equality and dims == dims2 and is_str == is_str2: - # N.B. don't check shapes here: we already checked dimensions. - # NOTE: no attempt to use laziness here. Could be improved. - def getdata(var): - if _isncdata(var): - data = var.data - if hasattr(data, "compute"): - data = data.compute() - else: - # expect var to be an actual netCDF4.Variable - # (check for obscure property NOT provided by mimics) - assert hasattr(var, "use_nc_get_vars") - data = var[:] - # Return 0D as 1D, as this makes results simpler to interpret. - if data.ndim == 0: - data = data.flatten() - assert data.shape == (1,) - return data - - data, data2 = (getdata(v) for v in (v1, v2)) - flatdata, flatdata2 = ( - np.asanyarray(arr).flatten() for arr in (data, data2) - ) - - # For simpler checking, use flat versions - flat_diff_inds = ( - [] - ) # NB *don't* make this an array, it causes problems - - # Work out whether string : N.B. array type does not ALWAYS match the - # variable type, because apparently the scalar content of a *masked* scalar - # string variable has a numeric type (!! yuck !!) - is_string_data = flatdata.dtype.kind in ("S", "U") - if is_string_data: - safe_fill_const = "" - else: - safe_fill_const = np.zeros((1,), dtype=flatdata.dtype)[0] - - # Where data is masked, count mask mismatches and skip those points - if any(np.ma.is_masked(arr) for arr in (data, data2)): - mask, mask2 = ( - np.ma.getmaskarray(array) - for array in (flatdata, flatdata2) - ) - flat_diff_inds = list(np.where(mask != mask2)[0]) - # Replace all masked points to exclude them from unmasked-point checks. 
- either_masked = mask | mask2 - flatdata[either_masked] = safe_fill_const - flatdata2[either_masked] = safe_fill_const - - # Where data has NANs, count mismatches and skip (as for masked) - if not is_string_data: - isnans, isnans2 = ( - np.isnan(arr) for arr in (flatdata, flatdata2) - ) - if np.any(isnans) or np.any(isnans2): - nandiffs = np.where(isnans != isnans2)[0] - if nandiffs: - flat_diff_inds += list(nandiffs) - anynans = isnans | isnans2 - flatdata[anynans] = safe_fill_const - flatdata2[anynans] = safe_fill_const - - flat_diff_inds += list(np.where(flatdata != flatdata2)[0]) - # Order the nonmatching indices : We report just the first few ... - flat_diff_inds = sorted(flat_diff_inds) - n_diffs = len(flat_diff_inds) - if n_diffs: - msg = f"{var_id_string} data contents differ, at {n_diffs} points: " - ellps = ", ..." if n_diffs > 2 else "" - diffinds = flat_diff_inds[:2] - diffinds = [ - np.unravel_index(ind, shape=data.shape) for ind in diffinds - ] - diffinds_str = ", ".join(repr(tuple(x)) for x in diffinds) - inds_str = f"[{diffinds_str}{ellps}]" - points_lhs_str = ", ".join(repr(data[ind]) for ind in diffinds) - points_rhs_str = ", ".join( - repr(data2[ind]) for ind in diffinds - ) - points_lhs_str = f"[{points_lhs_str}{ellps}]" - points_rhs_str = f"[{points_rhs_str}{ellps}]" - msg += ( - f"@INDICES{inds_str}" - f" : LHS={points_lhs_str}, RHS={points_rhs_str}" - ) - errs.append(msg) - - # Finally, recurse over groups - grpnames, grpnames2 = [list(grp.groups.keys()) for grp in (g1, g2)] - _compare_name_lists( - errs, - grpnames, - grpnames2, - f"{group_id_string} subgroup lists", - order_strict=groups_order, - suppress_warnings=suppress_warnings, - ) - for grpname in grpnames: - if grpname not in grpnames2: - continue - grp1, grp2 = [grp.groups[grpname] for grp in (g1, g2)] - _compare_nc_groups( - errs, - grp1, - grp2, - group_id_string=f"{group_id_string}/{grpname}", - dims_order=dims_order, - vars_order=vars_order, - attrs_order=attrs_order, - 
groups_order=groups_order, - data_equality=data_equality, - ) - - -if __name__ == "__main__": - fps = [ - "/home/h05/itpp/tmp.nc", - "/home/h05/itpp/tmp2.nc", - "/home/h05/itpp/mask.nc", - "/home/h05/itpp/tmps.nc", - "/home/h05/itpp/tmps2.nc", - ] - fp1, fp2, fp3, fp4, fp5 = fps - pairs = [ - [fp1, fp1], - [fp1, fp2], - [fp1, fp3], - [fp4, fp5], - ] - for p1, p2 in pairs: - errs = compare_nc_datasets(p1, p2, check_attrs_order=False) - print("") - print(f"Compare {p1} with {p2} : {len(errs)} errors ") - for err in errs: - print(" ", err) - print("-ends-") diff --git a/tests/data_testcase_schemas.py b/tests/data_testcase_schemas.py index 26f8c7e..19d44a1 100644 --- a/tests/data_testcase_schemas.py +++ b/tests/data_testcase_schemas.py @@ -330,8 +330,8 @@ def _define_simple_testcases(): return testcases -ADD_IRIS_FILES = True -# ADD_IRIS_FILES = False +# ADD_IRIS_FILES = True +ADD_IRIS_FILES = False @standard_testcases_func diff --git a/tests/integration/example_scripts/ex_ncdata_netcdf_conversion.py b/tests/integration/example_scripts/ex_ncdata_netcdf_conversion.py index f1b2355..4468829 100644 --- a/tests/integration/example_scripts/ex_ncdata_netcdf_conversion.py +++ b/tests/integration/example_scripts/ex_ncdata_netcdf_conversion.py @@ -12,8 +12,8 @@ from ncdata import NcAttribute, NcData, NcDimension, NcVariable from ncdata.netcdf4 import from_nc4, to_nc4 +from ncdata.utils import dataset_differences from tests import testdata_dir -from tests._compare_nc_datasets import compare_nc_datasets def example_nc4_load_save_roundtrip(): # noqa: D103 @@ -28,7 +28,7 @@ def example_nc4_load_save_roundtrip(): # noqa: D103 filepath2 = tempdir_path / "temp_nc_output.nc" to_nc4(ncdata, filepath2) - result = compare_nc_datasets(filepath, filepath2) + result = dataset_differences(filepath, filepath2) equals_result = result == [] print("\nFiles compare? 
:", equals_result) assert equals_result diff --git a/tests/integration/test_iris_load_and_save_equivalence.py b/tests/integration/test_iris_load_and_save_equivalence.py index 042d8c9..93579c8 100644 --- a/tests/integration/test_iris_load_and_save_equivalence.py +++ b/tests/integration/test_iris_load_and_save_equivalence.py @@ -11,7 +11,7 @@ import pytest from ncdata.netcdf4 import from_nc4, to_nc4 -from tests._compare_nc_datasets import compare_nc_datasets +from ncdata.utils import dataset_differences from tests.data_testcase_schemas import session_testdir, standard_testcase from tests.integration.equivalence_testing_utils import ( adjust_chunks, @@ -93,7 +93,7 @@ def test_load_direct_vs_viancdata( if not result: # FOR NOW: compare with experimental ncdata comparison. # I know this is a bit circular, but it is useful for debugging, for now ... - result = compare_nc_datasets( + result = dataset_differences( from_iris(iris_cubes), from_iris(iris_ncdata_cubes) ) assert result == [] @@ -144,5 +144,5 @@ def test_save_direct_vs_viancdata(standard_testcase, tmp_path): print(txt) # Check equivalence - results = compare_nc_datasets(temp_iris_savepath, temp_ncdata_savepath) + results = dataset_differences(temp_iris_savepath, temp_ncdata_savepath) assert results == [] diff --git a/tests/integration/test_iris_xarray_roundtrips.py b/tests/integration/test_iris_xarray_roundtrips.py index 160860a..638cae5 100644 --- a/tests/integration/test_iris_xarray_roundtrips.py +++ b/tests/integration/test_iris_xarray_roundtrips.py @@ -19,8 +19,8 @@ from ncdata.iris_xarray import cubes_to_xarray from ncdata.netcdf4 import from_nc4 from ncdata.threadlock_sharing import lockshare_context +from ncdata.utils import dataset_differences from ncdata.xarray import from_xarray -from tests._compare_nc_datasets import compare_nc_datasets from tests.data_testcase_schemas import ( BAD_LOADSAVE_TESTCASES, session_testdir, @@ -172,7 +172,7 @@ def test_roundtrip_ixi(standard_testcase, use_irislock, 
adjust_chunks): if not result: # FOR NOW: compare with experimental ncdata comparison. # I know this is a bit circular, but it is useful for debugging, for now ... - result = compare_nc_datasets( + result = dataset_differences( from_iris(iris_cubes), from_iris(iris_xr_cubes) ) assert result == [] @@ -299,14 +299,14 @@ def test_roundtrip_xix( "calendar", "standard" ) - result = compare_nc_datasets( + result = dataset_differences( ncds_xr, ncds_xr_iris ) # , check_var_data=False) assert result == [] # TODO: check equivalence, in Xarray terms # xr_result = xrds_iris.equals(xrds) - # ncd_result = compare_nc_datasets( + # ncd_result = dataset_differences( # ncds_xr, ncds_xr_iris # ) # , check_var_data=False) # print("\nDATASET COMPARE RESULTS:\n" + "\n".join(ncd_result)) diff --git a/tests/integration/test_netcdf_roundtrips.py b/tests/integration/test_netcdf_roundtrips.py index 79e258d..6fe635d 100644 --- a/tests/integration/test_netcdf_roundtrips.py +++ b/tests/integration/test_netcdf_roundtrips.py @@ -4,7 +4,7 @@ from subprocess import check_output from ncdata.netcdf4 import from_nc4, to_nc4 -from tests._compare_nc_datasets import compare_nc_datasets +from ncdata.utils import dataset_differences from tests.data_testcase_schemas import session_testdir, standard_testcase # Avoid complaints that the imported fixtures are "unused" @@ -38,5 +38,5 @@ def test_basic(standard_testcase, tmp_path): print(txt) # Check that the re-saved file matches the original - results = compare_nc_datasets(source_filepath, intermediate_filepath) + results = dataset_differences(source_filepath, intermediate_filepath) assert results == [] diff --git a/tests/integration/test_xarray_load_and_save_equivalence.py b/tests/integration/test_xarray_load_and_save_equivalence.py index 1f996d4..d7fb316 100644 --- a/tests/integration/test_xarray_load_and_save_equivalence.py +++ b/tests/integration/test_xarray_load_and_save_equivalence.py @@ -10,8 +10,8 @@ from ncdata.netcdf4 import from_nc4, to_nc4 from 
ncdata.threadlock_sharing import lockshare_context +from ncdata.utils import dataset_differences from ncdata.xarray import from_xarray, to_xarray -from tests._compare_nc_datasets import compare_nc_datasets from tests.data_testcase_schemas import ( BAD_LOADSAVE_TESTCASES, session_testdir, @@ -74,10 +74,11 @@ def test_save_direct_vs_viancdata(standard_testcase, tmp_path): to_nc4(ncds_fromxr, temp_ncdata_savepath) # Check equivalence - results = compare_nc_datasets( + results = dataset_differences( temp_direct_savepath, temp_ncdata_savepath, check_dims_order=False, + check_dims_unlimited=False, # TODO: remove this when we fix it suppress_warnings=True, ) assert results == [] diff --git a/tests/unit/netcdf/test_from_nc4.py b/tests/unit/netcdf/test_from_nc4.py index 61c3c19..ea61291 100644 --- a/tests/unit/netcdf/test_from_nc4.py +++ b/tests/unit/netcdf/test_from_nc4.py @@ -16,7 +16,7 @@ from ncdata import NcData, NcDimension, NcVariable from ncdata.netcdf4 import from_nc4 -from tests._compare_nc_datasets import compare_nc_datasets +from ncdata.utils import dataset_differences from tests.data_testcase_schemas import make_testcase_dataset @@ -38,7 +38,7 @@ def test_target_types(sourcetype, tmp_path): """Check the various ways of specifying the input data.""" # This testcase is a rather complicated, but we need to test with groups, and we # may as well also test for variables which map dimensions from multiple levels. - # In effect, this is also exercising tricky bits of 'compare_nc_datasets' !! + # In effect, this is also exercising tricky bits of 'dataset_differences' !! 
test_spec = { "dims": [dict(name="xdim", size=3)], "vars": [ @@ -84,7 +84,7 @@ def test_target_types(sourcetype, tmp_path): variables=[ NcVariable( name="x", - dimensions=("xdim"), + dimensions=("xdim",), dtype=np.float32, data=[1.23, 2, 9], ) @@ -107,5 +107,5 @@ def test_target_types(sourcetype, tmp_path): if sourcetype == "group": ncdata_expected = ncdata_expected.groups["inner_group"] - diffs = compare_nc_datasets(ncdata, ncdata_expected) + diffs = dataset_differences(ncdata, ncdata_expected) assert diffs == [] diff --git a/tests/unit/netcdf/test_to_nc4.py b/tests/unit/netcdf/test_to_nc4.py index e72fd52..8f2934a 100644 --- a/tests/unit/netcdf/test_to_nc4.py +++ b/tests/unit/netcdf/test_to_nc4.py @@ -17,7 +17,7 @@ from ncdata import NcData from ncdata.netcdf4 import from_nc4, to_nc4 -from tests._compare_nc_datasets import compare_nc_datasets +from ncdata.utils import dataset_differences from tests.data_testcase_schemas import make_testcase_dataset @@ -61,7 +61,7 @@ def test_target_types(targettype, tmp_path): target.close() assert target_path.exists() - assert compare_nc_datasets(target_path, original_path) == [] + assert dataset_differences(target_path, original_path) == [] def fetch_nc_var(nc_file: nc.Dataset, var_path: str or List[str]): diff --git a/tests/unit/tests/unit/__init__.py b/tests/unit/tests/unit/__init__.py deleted file mode 100644 index a8038cb..0000000 --- a/tests/unit/tests/unit/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -""" -Unit tests for :mod:`tests.unit`. - -Yes I know, tests of tests. But it seems necessary. 
-""" diff --git a/tests/unit/utils/compare_nc_datasets/__init__.py b/tests/unit/utils/compare_nc_datasets/__init__.py new file mode 100644 index 0000000..7f699aa --- /dev/null +++ b/tests/unit/utils/compare_nc_datasets/__init__.py @@ -0,0 +1 @@ +"""Unit tests for :mod:`ncdata.utils._compare_nc_datasets`.""" diff --git a/tests/unit/tests/test_compare_nc_datasets.py b/tests/unit/utils/compare_nc_datasets/test_dataset_differences__additional.py similarity index 68% rename from tests/unit/tests/test_compare_nc_datasets.py rename to tests/unit/utils/compare_nc_datasets/test_dataset_differences__additional.py index e153db3..1c3e20e 100644 --- a/tests/unit/tests/test_compare_nc_datasets.py +++ b/tests/unit/utils/compare_nc_datasets/test_dataset_differences__additional.py @@ -1,67 +1,25 @@ """ Tests for :mod:`tests.unit.netcdf._compare_nc_files` - -Yes I know, tests of tests. But it seems necessary. +Split in two files ... + * HERE: "additional" tests cover subsidiary routines and the main + API usage modes. + * ( ALSO: "mainfunctions" (q.v.) cover the core functionality + -- which elements are compared and what errors this constructs. ) """ import shutil import warnings -from unittest import mock import netCDF4 as nc import numpy as np import pytest -from tests._compare_nc_datasets import ( - _compare_attributes, - _compare_name_lists, - compare_nc_datasets, +from ncdata.utils._compare_nc_datasets import ( + _attribute_differences, + _namelist_differences, + dataset_differences, ) from tests.test_samplecode_cdlgen_comparablecdl import ncgen_from_cdl -# CDL to create a reference file with "all" features included. 
-_base_cdl = """ -netcdf everything { -dimensions: - x = 2 ; - y = 3 ; - strlen = 5 ; -variables: - int x(x) ; - x:name = "var_x" ; - int var_2d(x, y) ; - uint var_u8(x) ; - float var_f4(x) ; - double var_f8(x) ; - char var_str(x, strlen) ; - int other(x) ; - other:attr_int = 1 ; - other:attr_float = 2.0f ; - other:attr_double = 2.0 ; - other:attr_string = "this" ; - int masked_int(y) ; - masked_int:_FillValue = -3 ; - int masked_float(y) ; - masked_float:_FillValue = -4.0 ; - -// global attributes: - :global_attr_1 = "one" ; - :global_attr_2 = 2 ; - -// groups: -group: grp_1 { - dimensions: - y = 7 ; - variables: - int parent_dim(x) ; - int own_dim(y) ; -} -group: grp_2 { - variables: - int grp2_x(x) ; -} -} -""" - _simple_cdl = """ netcdf test { dimensions: @@ -77,38 +35,32 @@ """ -class Test__compare_name_lists: +class Test_namelist_differences: # Test subsidiary routine for checking a list of names def test_empty(self): - errs = [] - _compare_name_lists(errs, [], [], "named-elements") + errs = _namelist_differences([], [], "named-elements") assert errs == [] def test_same(self): tst = ["a", "b"] - errs = [] - _compare_name_lists(errs, tst, tst, "named-elements") + errs = _namelist_differences(tst, tst, "named-elements") assert errs == [] def test_diff(self): - errs = [] - _compare_name_lists(errs, ["a"], [], "named-elements") + errs = _namelist_differences(["a"], [], "named-elements") assert errs == ["named-elements do not match: ['a'] != []"] def test_difforder(self): - errs = [] - _compare_name_lists(errs, ["a", "b"], ["b", "a"], "named-elements") + errs = _namelist_differences(["a", "b"], ["b", "a"], "named-elements") assert errs == [ "named-elements do not match: ['a', 'b'] != ['b', 'a']" ] def test_difforder_tolerant_warns(self): - errs = [] with pytest.warns( UserWarning, match="Ignoring: named-elements do not match" ): - _compare_name_lists( - errs, + errs = _namelist_differences( ["a", "b"], ["b", "a"], "named-elements", @@ -117,11 +69,9 @@ def 
test_difforder_tolerant_warns(self): assert errs == [] def test_difforder_tolerant_nowarn(self): - errs = [] with warnings.catch_warnings(): warnings.simplefilter("error") - _compare_name_lists( - errs, + errs = _namelist_differences( ["a", "b"], ["b", "a"], "named-elements", @@ -131,45 +81,39 @@ def test_difforder_tolerant_nowarn(self): assert errs == [] -class Test__compare_attributes: - def test_compare_attributes_namelists(self): - # Check that it calls the generic _compare_name_lists routine, passing all the +class Test_attribute_differences: + def test_compare_attributes_namelists(self, mocker): + # Check that it calls the generic _namelist_differences routine, passing all the # correct controls - # Mimic 2 objects with NO attributes. - attrs1 = mock.MagicMock() - attrs2 = mock.MagicMock() - # Make the test objects look like real files (not NcData), and ensure that - # obj.ncattrs() is iterable. - obj1 = mock.Mock( - spec="ncattrs", ncattrs=mock.Mock(return_value=attrs1) + # NB make the compared object mimic nc Variables, not NcData + attrnames_1 = ["a", "b"] + attrnames_2 = ["c", "d"] + obj1 = mocker.Mock( + spec=nc.Variable, ncattrs=mocker.Mock(return_value=attrnames_1) ) - obj2 = mock.Mock( - spec="ncattrs", ncattrs=mock.Mock(return_value=attrs2) + obj2 = mocker.Mock( + spec=nc.Variable, ncattrs=mocker.Mock(return_value=attrnames_2) ) - errs = mock.sentinel.errors_list elemname = "" - order = mock.sentinel.attrs_order - suppress = mock.sentinel.suppress_warnings - tgt = "tests._compare_nc_datasets._compare_name_lists" - with mock.patch(tgt) as patch_tgt: - _compare_attributes( - errs=errs, - obj1=obj1, - obj2=obj2, - elemname=elemname, - attrs_order=order, - suppress_warnings=suppress, - ) - assert patch_tgt.call_args_list == [ - mock.call( - errs, - attrs1, - attrs2, - " attribute lists", - order_strict=order, - suppress_warnings=suppress, - ) - ] + order = mocker.sentinel.attrs_order + suppress = mocker.sentinel.suppress_warnings + tgt = 
"ncdata.utils._compare_nc_datasets._namelist_differences" + patch_tgt = mocker.patch(tgt) + _attribute_differences( + obj1=obj1, + obj2=obj2, + elemname=elemname, + attrs_order=order, + suppress_warnings=suppress, + ) + (one_call,) = patch_tgt.call_args_list + assert one_call == mocker.call( + attrnames_1, + attrnames_2, + " attribute lists", + order_strict=order, + suppress_warnings=suppress, + ) class Nc4ObjectWithAttrsMimic: def __init__(self, **attrs): @@ -199,34 +143,37 @@ def test_compare_attributes_empty(self): # Test two objects with no attributes obj1 = self.Nc4ObjectWithAttrsMimic() obj2 = self.Nc4ObjectWithAttrsMimic() - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__allok(self): # Objects with matching attributes obj1 = self.Nc4ObjectWithAttrsMimic(a=1, b=2) obj2 = self.Nc4ObjectWithAttrsMimic(a=1, b=2) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") + assert errs == [] + + def test_compare_attributes_values__scalar_arrayof1(self): + # Objects with matching attributes + obj1 = self.Nc4ObjectWithAttrsMimic(a=1, b=2) + obj2 = self.Nc4ObjectWithAttrsMimic(a=1, b=[2]) + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__data_mismatch(self): # Attributes of different value (but matching dtype) obj1 = self.Nc4ObjectWithAttrsMimic(a=1, b=2, c=3) obj2 = self.Nc4ObjectWithAttrsMimic(a=1, b=-77, c=3) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ' "b" attribute values differ : 2 != -77' ] - def test_compare_attributes_values__dtype_mismatch(self): + def test_compare_attributes_values__dtype_mismatch__length(self): # Attributes of different dtypes, even though values == obj1 = self.Nc4ObjectWithAttrsMimic(a=np.float32(0)) obj2 = self.Nc4ObjectWithAttrsMimic(a=np.float64(0)) - errs 
= [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ( ' "a" attribute datatypes differ : ' @@ -234,12 +181,47 @@ def test_compare_attributes_values__dtype_mismatch(self): ) ] + def test_compare_attributes_values__dtype_mismatch__signed_unsigned(self): + # Attributes of different dtypes, even though values == + obj1 = self.Nc4ObjectWithAttrsMimic(a=np.uint32(0)) + obj2 = self.Nc4ObjectWithAttrsMimic(a=np.int32(0)) + errs = _attribute_differences(obj1, obj2, "") + assert errs == [ + ( + ' "a" attribute datatypes differ : ' + "dtype('uint32') != dtype('int32')" + ) + ] + + def test_compare_attributes_values__dtype_mismatch__float_int(self): + # Attributes of different dtypes, even though values == + obj1 = self.Nc4ObjectWithAttrsMimic(a=np.float32(0)) + obj2 = self.Nc4ObjectWithAttrsMimic(a=np.int32(0)) + errs = _attribute_differences(obj1, obj2, "") + assert errs == [ + ( + ' "a" attribute datatypes differ : ' + "dtype('float32') != dtype('int32')" + ) + ] + + def test_compare_attributes_values__dtype_mismatch__numeric_string(self): + # Attributes of different dtypes, even though values == + obj1 = self.Nc4ObjectWithAttrsMimic(a=np.float32(0)) + obj2 = self.Nc4ObjectWithAttrsMimic(a="this") + errs = _attribute_differences(obj1, obj2, "") + assert errs == [ + ( + ' "a" attribute datatypes differ : ' + "dtype('float32') != " + ) + ] + def test_compare_attributes_values__dtype_and_data_mismatch(self): # Attributes of different dtypes, but values != obj1 = self.Nc4ObjectWithAttrsMimic(a=np.float32(0)) obj2 = self.Nc4ObjectWithAttrsMimic(a=np.float64(1)) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ' "a" attribute datatypes differ : ' "dtype('float32') != dtype('float64')" @@ -250,8 +232,7 @@ def test_compare_attributes_values__data_arrays_match(self): array = np.arange(3.0) obj1 = self.Nc4ObjectWithAttrsMimic(a=array) obj2 = 
self.Nc4ObjectWithAttrsMimic(a=array) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__data_arrays_dtype_mismatch(self): @@ -259,8 +240,7 @@ def test_compare_attributes_values__data_arrays_dtype_mismatch(self): array = np.arange(3, dtype="f4") obj1 = self.Nc4ObjectWithAttrsMimic(a=array) obj2 = self.Nc4ObjectWithAttrsMimic(a=array.astype("f8")) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ( ' "a" attribute datatypes differ : ' @@ -273,8 +253,7 @@ def test_compare_attributes_values__data_arrays_shape_mismatch(self): array = np.arange(3) obj1 = self.Nc4ObjectWithAttrsMimic(a=array) obj2 = self.Nc4ObjectWithAttrsMimic(a=array[:-1]) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ( ' "a" attribute values differ : ' @@ -288,8 +267,7 @@ def test_compare_attributes_values__data_arrays_value_mismatch(self): array2 = np.array([1, 2, 777]) obj1 = self.Nc4ObjectWithAttrsMimic(a=array1) obj2 = self.Nc4ObjectWithAttrsMimic(a=array2) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ( ' "a" attribute values differ : ' @@ -302,8 +280,7 @@ def test_compare_attributes_values__data_arrays_nans_match(self): array = np.array([1, np.nan, 3]) obj1 = self.Nc4ObjectWithAttrsMimic(a=array) obj2 = self.Nc4ObjectWithAttrsMimic(a=array) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__data_arrays_nans_mismatch(self): @@ -312,8 +289,7 @@ def test_compare_attributes_values__data_arrays_nans_mismatch(self): array2 = np.array([1.0, np.nan, 3.0]) obj1 = self.Nc4ObjectWithAttrsMimic(a=array1) obj2 = self.Nc4ObjectWithAttrsMimic(a=array2) - errs = [] - 
_compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ( ' "a" attribute values differ : ' @@ -325,8 +301,7 @@ def test_compare_attributes_values__string_nonstring(self): # Attributes of string and non-string types, since we handle that differently obj1 = self.Nc4ObjectWithAttrsMimic(a=1) obj2 = self.Nc4ObjectWithAttrsMimic(a="1") - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ' "a" attribute datatypes differ : ' "dtype('int64') != " @@ -336,16 +311,14 @@ def test_compare_attributes_values__string_match(self): # Attributes of string type (since netCDF4 returns char attributes as string) obj1 = self.Nc4ObjectWithAttrsMimic(S="this") obj2 = self.Nc4ObjectWithAttrsMimic(S="this") - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__string_mismatch(self): # Attributes of string type (since netCDF4 returns char attributes as string) obj1 = self.Nc4ObjectWithAttrsMimic(S="this") obj2 = self.Nc4ObjectWithAttrsMimic(S="that") - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ " \"S\" attribute values differ : 'this' != 'that'" ] @@ -354,21 +327,28 @@ def test_compare_attributes_values__string_array_match(self): # Attributes of string type (since netCDF4 returns char attributes as string) obj1 = self.Nc4ObjectWithAttrsMimic(S=["a", "b"]) obj2 = self.Nc4ObjectWithAttrsMimic(S=["a", "b"]) - errs = [] - _compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [] def test_compare_attributes_values__string_array_mismatch(self): # Attributes of string type (since netCDF4 returns char attributes as string) obj1 = self.Nc4ObjectWithAttrsMimic(S=["a", "b"]) obj2 = self.Nc4ObjectWithAttrsMimic(S=["a", "c"]) - errs = [] - 
_compare_attributes(errs, obj1, obj2, "") + errs = _attribute_differences(obj1, obj2, "") assert errs == [ ' "S" attribute values differ : ' "['a', 'b'] != ['a', 'c']" ] + def test_compare_attributes__ncdata_string_scalar_array(self): + # Attributes of string type (since netCDF4 returns char attributes as string) + from ncdata import NcAttribute, NcData + + obj1 = NcData(attributes=[NcAttribute("x", ["string"])]) + obj2 = NcData(attributes=[NcAttribute("x", "string")]) + errs = _attribute_differences(obj1, obj2, "") + assert errs == [] + @pytest.fixture(autouse=True, scope="module") def temp_ncfiles_dir(tmp_path_factory): @@ -405,12 +385,12 @@ def samefiles_bothtypes(samefiles_filesonly, sourcetype): class Test_compare_nc_files__api: def test_identical(self, samefiles_bothtypes): source1, source2 = samefiles_bothtypes - result = compare_nc_datasets(source1, source2) + result = dataset_differences(source1, source2) assert result == [] def test_identical_stringpaths(self, samefiles_filesonly): source1, source2 = samefiles_filesonly - result = compare_nc_datasets(str(source1), str(source2)) + result = dataset_differences(str(source1), str(source2)) assert result == [] def test_identical_datasets(self, samefiles_filesonly, sourcetype): @@ -419,7 +399,7 @@ def test_identical_datasets(self, samefiles_filesonly, sourcetype): try: ds1 = nc.Dataset(source1) ds2 = nc.Dataset(source2) - result = compare_nc_datasets(ds1, ds2) + result = dataset_differences(ds1, ds2) assert result == [] finally: for ds in (ds1, ds2): @@ -442,7 +422,7 @@ def test_small_difference( # Source1/2 are NcData : just modify source2 source2.attributes["extra_global_attr"] = 1 - result = compare_nc_datasets(source1, source2) + result = dataset_differences(source1, source2) assert result == [ "Dataset attribute lists do not match: [] != ['extra_global_attr']" ] @@ -474,7 +454,7 @@ def test_vardata_difference( if ds is not None: ds.close() - result = compare_nc_datasets(source1, source2) + result = 
dataset_differences(source1, source2) # N.B. ncdata comparison bypasses the masked+scaled view of data, hence the # message differs. Could fix this? mask1 = "masked" if sourcetype == "InputsFile" else "9.96921e+36" diff --git a/tests/unit/utils/compare_nc_datasets/test_dataset_differences__mainfunctions.py b/tests/unit/utils/compare_nc_datasets/test_dataset_differences__mainfunctions.py new file mode 100644 index 0000000..1e03e9f --- /dev/null +++ b/tests/unit/utils/compare_nc_datasets/test_dataset_differences__mainfunctions.py @@ -0,0 +1,479 @@ +""" +Tests for :mod:`ncdata.utils._compare_nc_datasets` +Split in two files ... + * HERE: "mainfunctions" cover the core functionality + -- which elements are compared and what errors this constructs. + * ( ALSO: "additional" tests (q.v.) cover subsidiary routines and the + main API usage modes. ) +""" +import numpy as np +import pytest + +from ncdata import NcAttribute, NcData, NcDimension, NcVariable +from ncdata.utils import dataset_differences + +# from tests.data_testcase_schemas import _Datatype_Sample_Values, data_types +# data_types # avoid 'unused' warning + + +@pytest.fixture( + params=["in_named", "in_unnamed", "in_namedgroup", "in_unnamedgroup"] +) +def group_context(request): + """ + The different contexts of locations in a dataset + + In which an element (dimension, group or variable) might be found, and + which might appear different in the mismatch-error messages. 
+ """ + return request.param + + +@pytest.fixture(params=["on_group", "on_variable"]) +def attr_context(request): + """The different contexts for an attribute in a dataset.""" + return request.param + + +@pytest.fixture(params=["ordered", "unordered"]) +def order_checking(request): + """Whether to test with order checking or not.""" + return request.param + + +def decode_ordercheck(order_checking): + return {"ordered": True, "unordered": False}[order_checking] + + +def location_prefix(group_context, attr_context="on_group"): + prefix = "Dataset" + if "namedgroup" in group_context: + prefix += "/inner_group" + if "variable" in attr_context: + prefix += ' variable "vx"' + return prefix + + +def put_group_into_context(testdata, group_context): + if group_context == "in_named": + pass + elif group_context == "in_unnamed": + testdata.name = None + elif "group" in group_context: + testdata.name = "inner_group" + testdata = NcData(name="outer_dataset", groups=[testdata]) + if group_context == "in_namedgroup": + pass + elif group_context == "in_unnamedgroup": + testdata.name = None + else: + raise ValueError(f"unknown group_context: {group_context!r}") + else: + raise ValueError(f"unknown group_context: {group_context!r}") + + return testdata + + +_DEBUG_RESULTS = True +# _DEBUG_RESULTS = True + + +def check(results, expected): + if _DEBUG_RESULTS: + print("\nResult messages:") + for msg in results: + print(" ", msg) + assert results == expected + + +class TestCompareDatasets: + @pytest.mark.parametrize("namecheck", ["withnames", "withoutnames"]) + @pytest.mark.parametrize("altname", ["named_y", "named_none"]) + def test_names(self, namecheck, altname): + do_namecheck = namecheck == "withnames" + altname = {"named_y": "y", "named_none": None}[altname] + data1, data2 = NcData(name="x"), NcData(name=altname) + + # Use kwargs just to confirm that the default for name-checking is 'off' + kwargs = dict(check_names=True) if do_namecheck else {} + errs = 
dataset_differences(data1, data2, **kwargs) + + if do_namecheck: + expected = [f"Datasets have different names: 'x' != {altname!r}."] + else: + expected = [] + check(errs, expected) + + +class TestCompareDimensions: + def dimension_testdata(self, group_context): + testdata = NcData( + name="dataset_1", + dimensions=[ + NcDimension("x", 2, unlimited=True), + NcDimension("y", 3, unlimited=False), + ], + ) + testdata = put_group_into_context(testdata, group_context) + return testdata + + @pytest.fixture(autouse=True) + def _dims_data(self, group_context): + data1, data2 = [ + self.dimension_testdata(group_context) for _ in range(2) + ] + location = data2 + if "group" in group_context: + location = location.groups["inner_group"] + + self.data1 = data1 + self.data2 = data2 + self.location_string = location_prefix(group_context) + self.dims = location.dimensions + + def test_name(self): + self.dims.rename("x", "q") + errs = dataset_differences(self.data1, self.data2) + expected = [ + f"{self.location_string} dimension lists do not match: " + "['x', 'y'] != ['q', 'y']" + ] + check(errs, expected) + + def test_size(self): + self.dims["x"].size = 77 + + errs = dataset_differences(self.data1, self.data2) + + expected = [ + f'{self.location_string} "x" dimensions have different sizes: 2 != 77' + ] + check(errs, expected) + + @pytest.mark.parametrize( + "check_unlim", ["unlims_checked", "unlims_unchecked"] + ) + def test_unlimited(self, check_unlim): + self.dims["y"].unlimited = True + + do_check_unlims = {"unlims_checked": True, "unlims_unchecked": False}[ + check_unlim + ] + errs = dataset_differences( + self.data1, self.data2, check_dims_unlimited=do_check_unlims + ) + + if do_check_unlims: + expected = [ + f'{self.location_string} "y" dimension has different "unlimited" status : ' + "False != True" + ] + else: + expected = [] + + check(errs, expected) + + def test_ordering(self, order_checking): + all_dims = list(self.dims.values()) + self.dims.clear() + 
self.dims.addall(all_dims[::-1]) + + do_ordercheck = decode_ordercheck(order_checking) + errs = dataset_differences( + self.data1, self.data2, check_dims_order=do_ordercheck + ) + + if do_ordercheck: + expected = [ + f"{self.location_string} dimension lists do not match: " + "['x', 'y'] != ['y', 'x']" + ] + else: + expected = [] + + check(errs, expected) + + def test_extra_or_missing(self): + all_dims = list(self.dims.values()) + # Remove the last dimension, so data1 has a dim not present in data2 + self.dims.clear() + self.dims.addall(all_dims[:-1]) + + errs = dataset_differences(self.data1, self.data2) + + expected = [ + f"{self.location_string} dimension lists do not match: " + "['x', 'y'] != ['x']" + ] + check(errs, expected) + + +class TestCompareAttributes: + def attribute_testdata(self, group_context): + testdata = NcData( + name="dataset_1", + variables=[ + NcVariable( + "vx", + dimensions=[], + data=np.array(1.0), + attributes=[ + NcAttribute("att1", 1), + NcAttribute("att2", 2), + ], + ) + ], + attributes=[ + NcAttribute("att1", 11), + NcAttribute("att2", 12), + ], + ) + testdata = put_group_into_context(testdata, group_context) + return testdata + + @pytest.fixture(autouse=True) + def _attrs_data(self, group_context, attr_context): + data1, data2 = [ + self.attribute_testdata(group_context) for _ in range(2) + ] + location = data2 + if "group" in group_context: + location = location.groups["inner_group"] + is_on_var = {"on_group": False, "on_variable": True}[attr_context] + if is_on_var: + location = location.variables["vx"] + + self.data1 = data1 + self.data2 = data2 + self.location_string = location_prefix(group_context, attr_context) + self.attrs = location.attributes + + def test_name(self): + self.attrs.rename("att1", "changed") + + errs = dataset_differences(self.data1, self.data2) + + expected = [ + f"{self.location_string} attribute lists do not match: " + "['att1', 'att2'] != ['changed', 'att2']" + ] + check(errs, expected) + + def 
test_value(self, attr_context): + self.attrs["att1"].value = np.array(999) + + errs = dataset_differences(self.data1, self.data2) + + if "variable" in attr_context: + value_string = "1" + else: + value_string = "11" + expected = [ + f'{self.location_string} "att1" attribute values differ : ' + f"array({value_string}) != array(999)" + ] + check(errs, expected) + + def test_ordering(self, order_checking): + do_ordercheck = decode_ordercheck(order_checking) + all_attrs = list(self.attrs.values()) + self.attrs.clear() + self.attrs.addall(all_attrs[::-1]) + + errs = dataset_differences( + self.data1, self.data2, check_attrs_order=do_ordercheck + ) + + if do_ordercheck: + expected = [ + f"{self.location_string} attribute lists do not match: " + "['att1', 'att2'] != ['att2', 'att1']" + ] + else: + expected = [] + check(errs, expected) + + def test_extra_or_missing(self, order_checking): + do_ordercheck = decode_ordercheck(order_checking) + del self.attrs["att1"] + + errs = dataset_differences( + self.data1, self.data2, check_attrs_order=do_ordercheck + ) + + expected = [ + f"{self.location_string} attribute lists do not match: " + "['att1', 'att2'] != ['att2']" + ] + check(errs, expected) + + @pytest.mark.parametrize("attname", ["fillvalue", "generic"]) + def test_fillvalue_anyorder(self, attname): + """The order of "_FillValue" attributes is specially ignored.""" + name = {"fillvalue": "_FillValue", "generic": "anyold"}[attname] + # data1, data2 have attrs in the other order + attr_pair = [NcAttribute(name, 1), NcAttribute("x", 1)] + data1, data2 = [ + NcData( + variables=[ + NcVariable("vx", (), data=np.array(0.0), attributes=attrs) + ] + ) + for attrs in (attr_pair, attr_pair[::-1]) + ] + + errs = dataset_differences(data1, data2) + + if "generic" in attname: + expected = [ + 'Dataset variable "vx" attribute lists do not match: ' + "['anyold', 'x'] != ['x', 'anyold']" + ] + else: + expected = [] + check(errs, expected) + + +class TestCompareVariables: + """ + Test 
variable comparison. + + Mostly, this is about comparison of the variable contents of a dataset + or group, since variable-to-variable comparison is done by + variable_differences, which is tested independently elsewhere. + This includes testing the generation of the variable identity strings in + various contexts (by parametrising over group_context). + """ + + @staticmethod + def _vars_testdata(group_context): + def data(): + return np.zeros((2, 3)) + + testdata = NcData( + name="dataset_1", + dimensions=[NcDimension("y", 2), NcDimension("x", 3)], + variables=[ + NcVariable("v1", ("y", "x"), data=data()), + NcVariable("v2", ("y", "x"), data=data()), + ], + ) + testdata = put_group_into_context(testdata, group_context) + return testdata + + @pytest.fixture(autouse=True) + def _vars_data(self, group_context): + data1, data2 = [self._vars_testdata(group_context) for _ in range(2)] + location = data2 + if "group" in group_context: + location = location.groups["inner_group"] + + self.data1 = data1 + self.data2 = data2 + self.location_string = location_prefix(group_context) + self.vars = location.variables + + def test_var_names(self): + self.vars.rename("v2", "q") + + errs = dataset_differences(self.data1, self.data2) + + expected = [ + f"{self.location_string} variable lists do not match: " + "['v1', 'v2'] != ['v1', 'q']" + ] + check(errs, expected) + + def test_var_order(self, order_checking): + all_vars = list(self.vars.values()) + self.vars.clear() + self.vars.addall(all_vars[::-1]) + + do_ordercheck = decode_ordercheck(order_checking) + errs = dataset_differences( + self.data1, self.data2, check_vars_order=do_ordercheck + ) + + if do_ordercheck: + expected = [ + f"{self.location_string} variable lists do not match: " + "['v1', 'v2'] != ['v2', 'v1']" + ] + else: + expected = [] + check(errs, expected) + + def test_vars_extra_or_missing(self, order_checking): + del self.vars["v1"] + + do_ordercheck = decode_ordercheck(order_checking) + errs = dataset_differences( + 
self.data1, self.data2, check_vars_order=do_ordercheck + ) + + expected = [ + f"{self.location_string} variable lists do not match: " + "['v1', 'v2'] != ['v2']" + ] + check(errs, expected) + + +class TestCompareGroups: + @staticmethod + def _groups_testdata(): + testdata = NcData( + name="dataset_1", + groups=[ + NcData(name=name, attributes=[NcAttribute("attr_1", 1)]) + for name in ("g1", "g2") + ], + ) + return testdata + + @pytest.fixture(autouse=True) + def _groups_data(self): + self.data1, self.data2 = [self._groups_testdata() for _ in range(2)] + self.groups = self.data2.groups + + def test_group_names(self): + self.groups.rename("g2", "q") + + errs = dataset_differences(self.data1, self.data2) + + expected = [ + "Dataset subgroup lists do not match: ['g1', 'g2'] != ['g1', 'q']" + ] + check(errs, expected) + + def test_group_order(self, order_checking): + all_groups = list(self.groups.values()) + self.groups.clear() + self.groups.addall(all_groups[::-1]) + + do_ordercheck = decode_ordercheck(order_checking) + errs = dataset_differences( + self.data1, self.data2, check_groups_order=do_ordercheck + ) + + if do_ordercheck: + expected = [ + "Dataset subgroup lists do not match: " + "['g1', 'g2'] != ['g2', 'g1']" + ] + else: + expected = [] + check(errs, expected) + + def test_groups_extra_or_missing(self, order_checking): + del self.groups["g1"] + + do_ordercheck = decode_ordercheck(order_checking) + errs = dataset_differences( + self.data1, self.data2, check_groups_order=do_ordercheck + ) + + # NB since the sets are different, the ordering control has no effect + expected = [ + "Dataset subgroup lists do not match: ['g1', 'g2'] != ['g2']" + ] + check(errs, expected) diff --git a/tests/unit/utils/compare_nc_datasets/test_variable_differences.py b/tests/unit/utils/compare_nc_datasets/test_variable_differences.py new file mode 100644 index 0000000..986008b --- /dev/null +++ b/tests/unit/utils/compare_nc_datasets/test_variable_differences.py @@ -0,0 +1,305 @@ 
"""
Tests for :func:`ncdata.utils.variable_differences`.

Covers reporting of name, dimension, dtype and data differences between two
variables, including masked, NaN, scalar and lazy (dask) data arrays.
"""
import dask.array as da
import numpy as np
import pytest

from ncdata import NcVariable
from ncdata.utils import variable_differences

# Debug switch: when True, print all result messages to help diagnose test
# failures.  Keep False in committed code so test output stays clean.
_DEBUG_RESULTS = False
# _DEBUG_RESULTS = True


def check(results, expected):
    """Assert that 'results' == 'expected', optionally printing them first."""
    if _DEBUG_RESULTS:
        print("\nResult messages:")
        for msg in results:
            print(" ", msg)
    assert results == expected


class TestSimpleProperties:
    """Check reporting of variable name and dimension differences."""

    @pytest.fixture(autouse=True)
    def _vars_data(self):
        # Two initially-identical variables, selectively modified per-test.
        self.var1, self.var2 = [
            NcVariable("v1", ("y", "x"), data=np.zeros((2, 3)))
            for _ in range(2)
        ]

    def test_var_names(self):
        self.var2.name = "q"

        errs = variable_differences(self.var1, self.var2)
        expected = ['Variable "v1 / q" names differ : ' "'v1' != 'q'"]
        check(errs, expected)

    def test_var_dims__reorder(self):
        # N.B. dimension order within a variable is *always* significant :
        # the DIMENSIONS order control does not apply here.
        self.var2.dimensions = self.var2.dimensions[::-1]
        # N.B. the data shape doesn't now correspond, but that won't matter
        # as, with mismatched dimensions, the data won't be checked.

        errs = variable_differences(self.var1, self.var2)

        expected = [
            'Variable "v1" dimensions differ : ' "('y', 'x') != ('x', 'y')"
        ]
        check(errs, expected)

    def test_var_dims__extra_or_missing(self):
        # Remove a dimension : reported as a dimensions difference.
        self.var2.dimensions = self.var2.dimensions[:-1]
        # N.B. the data shape doesn't now correspond, but that won't matter
        # as, with mismatched dimensions, the data won't be checked.

        errs = variable_differences(self.var1, self.var2)

        expected = ["Variable \"v1\" dimensions differ : ('y', 'x') != ('y',)"]
        check(errs, expected)


class TestDtypes:
    """
    Check reporting of dtype differences.

    Where the dtypes differ but are both numeric, the data values are still
    compared, so those cases also check the combined message output.
    """

    @pytest.fixture(autouse=True)
    def _vars_data(self):
        self.var1, self.var2 = [
            NcVariable("v1", ("x",), data=np.zeros(3)) for _ in range(2)
        ]

    def test_numbers_v_strings(self):
        # Set a different dtype.
        # NB this disagrees with the actual data array, but that doesn't
        # matter, as it won't attempt to compare strings with numbers.
        self.var2.dtype = np.dtype("S5")

        # Test the comparison
        errs = variable_differences(self.var1, self.var2)
        expected = [
            'Variable "v1" datatypes differ : '
            "dtype('float64') != dtype('S5')"
        ]
        check(errs, expected)

    @pytest.mark.parametrize("equaldata", [False, True])
    def test_ints_v_floats(self, equaldata):
        # In this case, there is also a data comparison to check.
        var2 = self.var2
        new_dtype = np.dtype(np.int32)
        var2.data = var2.data.astype(new_dtype)
        if not equaldata:
            var2.data.flat[0] += 1
        var2.dtype = new_dtype

        # Test the comparison
        errs = variable_differences(self.var1, self.var2)

        expected = [
            'Variable "v1" datatypes differ : '
            "dtype('float64') != dtype('int32')"
        ]
        if not equaldata:
            expected.append(
                'Variable "v1" data contents differ, at 1 points: '
                "@INDICES[(0,)] : LHS=[0.0], RHS=[1]"
            )
        check(errs, expected)

    @pytest.mark.parametrize("equaldata", [False, True])
    def test_wordlengths(self, equaldata):
        # Test floats with wordlength difference -- assume ints are the same.
        # In this case, there is also a data comparison to check.
        var2 = self.var2
        new_dtype = np.dtype(np.float32)
        var2.data = var2.data.astype(new_dtype)
        if not equaldata:
            var2.data.flat[0] += 1
        var2.dtype = new_dtype

        # Test the comparison
        errs = variable_differences(self.var1, self.var2)

        expected = [
            'Variable "v1" datatypes differ : '
            "dtype('float64') != dtype('float32')"
        ]
        if not equaldata:
            expected.append(
                'Variable "v1" data contents differ, at 1 points: '
                "@INDICES[(0,)] : LHS=[0.0], RHS=[1.0]"
            )
        check(errs, expected)

    @pytest.mark.parametrize("equaldata", [False, True])
    def test_signed_unsigned(self, equaldata):
        # Test ints with a signed/unsigned difference.
        # In this case, there is also a data comparison to check.
        var1 = self.var1
        var1.data = var1.data.astype(np.dtype(np.int64))
        var1.dtype = np.dtype(np.int64)

        var2 = self.var2
        var2.data = var2.data.astype(np.dtype(np.uint64))
        if not equaldata:
            var2.data.flat[0] += 1
        var2.dtype = np.dtype(np.uint64)

        # Test the comparison
        errs = variable_differences(self.var1, self.var2)

        expected = [
            'Variable "v1" datatypes differ : '
            "dtype('int64') != dtype('uint64')"
        ]
        if not equaldata:
            expected.append(
                'Variable "v1" data contents differ, at 1 points: '
                "@INDICES[(0,)] : LHS=[0], RHS=[1]"
            )
        check(errs, expected)


class TestDataCheck__controls:
    """Check the keywords which control data-value checking and reporting."""

    @pytest.fixture(autouse=True)
    def _vars_data(self):
        # Two dimensions, matching the 2-D test data.
        self.var1, self.var2 = [
            NcVariable("v1", ("y", "x"), data=np.arange(6.0).reshape((2, 3)))
            for _ in range(2)
        ]

    def test_no_values_check(self):
        # With check_var_data=False, differing values produce no errors.
        self.var2.data += 1
        errs = variable_differences(self.var1, self.var2, check_var_data=False)
        check(errs, [])

    def test_print_bad_nprint(self):
        # An out-of-range 'show_n_first_different' is rejected.
        msg = "'show_n_diffs' must be >=1 : got 0."
        with pytest.raises(ValueError, match=msg):
            variable_differences(
                self.var1, self.var2, show_n_first_different=0
            )

    @pytest.mark.parametrize("ndiffs", [1, 2, 3])
    def test_ndiffs(self, ndiffs):
        # With the default show_n_first_different (=2), excess differences
        # are elided with "...".
        self.var2.data.flat[1 : ndiffs + 1] += 1
        errs = variable_differences(self.var1, self.var2)
        detail = {
            1: "[(0, 1)] : LHS=[1.0], RHS=[2.0]",
            2: "[(0, 1), (0, 2)] : LHS=[1.0, 2.0], RHS=[2.0, 3.0]",
            3: (
                "[(0, 1), (0, 2), ...] : "
                "LHS=[1.0, 2.0, ...], RHS=[2.0, 3.0, ...]"
            ),
        }[ndiffs]
        expected = [
            f'Variable "v1" data contents differ, at {ndiffs} points: '
            f"@INDICES{detail}"
        ]
        check(errs, expected)

    @pytest.mark.parametrize("nprint", [1, 2, 3])
    def test_show_n_first_different(self, nprint):
        # With a fixed 2 differences, vary how many are displayed.
        self.var2.data.flat[1:3] += 1
        errs = variable_differences(
            self.var1, self.var2, show_n_first_different=nprint
        )
        detail = {
            1: "[(0, 1), ...] : LHS=[1.0, ...], RHS=[2.0, ...]",
            2: "[(0, 1), (0, 2)] : LHS=[1.0, 2.0], RHS=[2.0, 3.0]",
            3: "[(0, 1), (0, 2)] : LHS=[1.0, 2.0], RHS=[2.0, 3.0]",
        }[nprint]
        expected = [
            f'Variable "v1" data contents differ, at 2 points: '
            f"@INDICES{detail}"
        ]
        check(errs, expected)


class TestDataCheck__difference_reports:
    """Check data-difference reporting for masked, NaN, scalar + lazy data."""

    @pytest.fixture(autouse=True)
    def _vars_data(self):
        self.var1, self.var2 = [
            NcVariable("v1", ("x",), data=np.arange(4.0)) for _ in range(2)
        ]

    @pytest.mark.parametrize("datavalues", ["same", "different"])
    @pytest.mark.parametrize("masks", ["onemasked", "bothmasked"])
    def test_masked(self, datavalues, masks):
        # Masked points compare equal only when masked in *both* variables.
        different = datavalues == "different"
        bothmasked = masks == "bothmasked"
        testvar = self.var2
        testvar.data = np.ma.masked_array(testvar.data)
        if different:
            testvar.data[1:2] += 1
        testvar.data[1:2] = np.ma.masked
        if bothmasked:
            self.var1.data = np.ma.masked_array(self.var1.data)
            self.var1.data[1:2] = np.ma.masked
        errs = variable_differences(self.var1, self.var2)
        if bothmasked:
            expected = []
        else:
            expected = [
                'Variable "v1" data contents differ, at 1 points: '
                "@INDICES[(1,)] : LHS=[1.0], RHS=[masked]"
            ]
        check(errs, expected)

    @pytest.mark.parametrize("nans", ["onenans", "bothnans"])
    def test_nans(self, nans):
        # NaN points compare equal only when NaN in *both* variables.
        bothnans = nans == "bothnans"
        self.var2.data[1:2] = np.nan
        if bothnans:
            self.var1.data[1:2] = np.nan
        errs = variable_differences(self.var1, self.var2)
        if bothnans:
            expected = []
        else:
            expected = [
                'Variable "v1" data contents differ, at 1 points: '
                "@INDICES[(1,)] : LHS=[1.0], RHS=[nan]"
            ]
        check(errs, expected)

    def test_scalar(self):
        # Check how a difference of scalar arrays is reported.
        for value, var in enumerate([self.var1, self.var2]):
            var.dimensions = ()
            var.data = np.array(value, dtype=var.dtype)
        errs = variable_differences(self.var1, self.var2)
        expected = [
            'Variable "v1" data contents differ, at 1 points: '
            "@INDICES[(0,)] : LHS=[0.0], RHS=[1.0]"
        ]
        check(errs, expected)

    @pytest.mark.parametrize(
        "argtypes", ["real_real", "real_lazy", "lazy_lazy"]
    )
    def test_real_and_lazy(self, argtypes):
        type1, type2 = argtypes.split("_")
        # fix the testvar to create a difference
        self.var2.data[1:2] += 1
        # setup vars with lazy/real data arrays
        for arraytype, var in zip([type1, type2], [self.var1, self.var2]):
            if arraytype == "lazy":
                var.data = da.from_array(var.data, chunks=-1)
        # compare + check results
        errs = variable_differences(self.var1, self.var2)
        # N.B. the result should be the same in all cases
        expected = [
            'Variable "v1" data contents differ, at 1 points: '
            "@INDICES[(1,)] : LHS=[1.0], RHS=[2.0]"
        ]
        check(errs, expected)