From 0113f75530e1993240c5f850f63fb2b2f134c0ac Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 18 Nov 2022 11:55:17 +0100 Subject: [PATCH 01/10] Arbitrary dims for multi-model-stats --- esmvalcore/preprocessor/_multimodel.py | 118 ++++++++++++------ .../multimodel_statistics/test_multimodel.py | 46 ++++--- .../_multimodel/test_multimodel.py | 101 +++++++++++++-- 3 files changed, 192 insertions(+), 73 deletions(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 9c9fa9662e..db1e8f215a 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -17,6 +17,7 @@ import iris import iris.coord_categorisation import numpy as np +from iris.cube import Cube, CubeList from iris.util import equalise_attributes from esmvalcore.iris_helpers import date2num @@ -139,8 +140,16 @@ def _unify_time_coordinates(cubes): # Update the cubes' time coordinate (both point values and the units!) cube.coord('time').points = date2num(dates, t_unit, coord.dtype) cube.coord('time').units = t_unit - cube.coord('time').bounds = None + _guess_time_bounds(cube) + + +def _guess_time_bounds(cube): + """Guess time bounds if possible.""" + cube.coord('time').bounds = None + try: cube.coord('time').guess_bounds() + except ValueError: # Time has length 1 --> guessing bounds not possible + return def _time_coords_are_aligned(cubes): @@ -197,14 +206,34 @@ def _map_to_new_time(cube, time_points): def _align(cubes, span): - """Expand or subset cubes so they share a common time span.""" - _unify_time_coordinates(cubes) + """Expand or subset cubes so they share a common time span. + + Note + ---- + Cubes with no time dimension are ignored here. If some cubes have a time + dimension and some do not, this will raise an error at a later stage. - if _time_coords_are_aligned(cubes): + """ + time_cubes = CubeList( + [cube for cube in cubes if cube.coords('time')] + ) + no_time_cubes = CubeList( + [cube for cube in cubes if not cube.coords('time')] + ) + + # If no cubes with time dimension are given, we are done here + if not time_cubes: return cubes - all_time_arrays = [cube.coord('time').points for cube in cubes] + # Unify time coordinates for cubes with time dimension; if all of them are + # equal afterwards, we are also done + _unify_time_coordinates(time_cubes) + if _time_coords_are_aligned(time_cubes): + return no_time_cubes + time_cubes + # If time dimensions have different lengths, equalize them according to the + # 'span' option + all_time_arrays = [cube.coord('time').points for cube in time_cubes] if span == 'overlap': new_time_points = reduce(np.intersect1d, all_time_arrays) elif span == 'full': @@ -212,15 +241,16 @@ def _align(cubes, span): else: raise ValueError(f"Invalid argument for span: {span!r}" "Must be one of 'overlap', 'full'.") + new_time_cubes = [ + _map_to_new_time(cube, new_time_points) for cube in time_cubes + ] - new_cubes = [_map_to_new_time(cube, new_time_points) for cube in cubes] - - for cube in new_cubes: - # Make sure bounds exist and are consistent - cube.coord('time').bounds = None - cube.coord('time').guess_bounds() + # Make sure time bounds exist and are consistent + for cube in new_time_cubes: + _guess_time_bounds(cube) - return new_cubes + # Return all cubes + return no_time_cubes + new_time_cubes def _equalise_cell_methods(cubes): @@ -288,7 +318,7 @@ def _combine(cubes): concat_dim = iris.coords.AuxCoord(i, var_name=CONCAT_DIM) cube.add_aux_coord(concat_dim) - cubes = iris.cube.CubeList(cubes) + cubes = CubeList(cubes) merged_cube = cubes.merge_cube() @@ -296,19 +326,33 @@ def _combine(cubes): def _compute_slices(cubes): - """Create cube slices resulting in a combined cube of about 1 GiB.""" + """Create cube slices along the first dimension of the cubes. + + This results in a combined cube of about 1 GiB. + + Note + ---- + For scalar cubes, simply return ``None``. + + """ + # Scalar cubes + if cubes[0].shape == (): + yield None + return + + # Non-scalar cubes gibibyte = 2**30 total_bytes = cubes[0].data.nbytes * len(cubes) n_slices = int(np.ceil(total_bytes / gibibyte)) - n_timesteps = cubes[0].shape[0] - slice_len = int(np.ceil(n_timesteps / n_slices)) + len_dim_0 = cubes[0].shape[0] + slice_len = int(np.ceil(len_dim_0 / n_slices)) for i in range(n_slices): start = i * slice_len end = (i + 1) * slice_len - if end >= n_timesteps: - yield slice(start, n_timesteps) + if end >= len_dim_0: + yield slice(start, len_dim_0) return yield slice(start, end) @@ -320,7 +364,10 @@ def _compute_eager(cubes: list, *, operator: iris.analysis.Aggregator, result_slices = [] for chunk in _compute_slices(cubes): - single_model_slices = [cube[chunk] for cube in cubes] + if chunk is None: + single_model_slices = cubes # scalar cubes + else: + single_model_slices = [cube[chunk] for cube in cubes] combined_slice = _combine(single_model_slices) with warnings.catch_warnings(): warnings.filterwarnings( @@ -342,14 +389,14 @@ def _compute_eager(cubes: list, *, operator: iris.analysis.Aggregator, result_slices.append(collapsed_slice) try: - result_cube = iris.cube.CubeList(result_slices).concatenate_cube() + result_cube = CubeList(result_slices).concatenate_cube() except Exception as excinfo: raise ValueError( - "Multi-model statistics failed to concatenate results into a" - f" single array. This happened for operator {operator}" - f" with computed statistics {result_slices}." - "This can happen e.g. if the calculation results in inconsistent" - f" dtypes. Encountered the following exception: {excinfo}") + f"Multi-model statistics failed to concatenate results into a " + f"single array. This happened for operator {operator} " + f"with computed statistics {result_slices}. " + f"This can happen e.g. if the calculation results in inconsistent " + f"dtypes") from excinfo result_cube.data = np.ma.array(result_cube.data) result_cube.remove_coord(CONCAT_DIM) @@ -439,9 +486,8 @@ def multi_model_statistics(products, workflow and provenance information, and this option should typically be ignored. - Apart from the time coordinate, cubes must have consistent shapes. There - are two options to combine time coordinates of different lengths, see the - ``span`` argument. + Cubes must have consistent shapes. There are two options to combine time + coordinates of different lengths, see the ``span`` argument. Uses the statistical operators in :py:mod:`iris.analysis`, including ``mean``, ``median``, ``min``, ``max``, and ``std``. Percentiles are also @@ -461,7 +507,8 @@ def multi_model_statistics(products, span: str Overlap or full; if overlap, statitstics are computed on common time- span; if full, statistics are computed on full time spans, ignoring - missing data. + missing data. This option is ignored if input cubes do not have time + dimensions. statistics: list Statistical metrics to be computed, e.g. [``mean``, ``max``]. Choose from the operators listed in the iris.analysis package. Percentiles can @@ -471,8 +518,8 @@ def multi_model_statistics(products, preprocessorfiles as values. If products are passed as input, the statistics cubes will be assigned to these output products. groupby: tuple - Group products by a given tag or attribute, e.g. - ('project', 'dataset', 'tag1'). + Group products by a given tag or attribute, e.g., ('project', + 'dataset', 'tag1'). This is ignored if ``products`` is a list of cubes. keep_input_datasets: bool If True, the output will include the input datasets. If False, only the computed statistics will be returned. @@ -489,7 +536,7 @@ def multi_model_statistics(products, If span is neither overlap nor full, or if input type is neither cubes nor products. """ - if all(isinstance(p, iris.cube.Cube) for p in products): + if all(isinstance(p, Cube) for p in products): return _multicube_statistics( cubes=products, statistics=statistics, @@ -514,9 +561,10 @@ def multi_model_statistics(products, return statistics_products raise ValueError( - "Input type for multi_model_statistics not understood. Expected " - "iris.cube.Cube or esmvalcore.preprocessor.PreprocessorFile, " - "got {}".format(products)) + f"Input type for multi_model_statistics not understood. Expected " + f"iris.cube.Cube or esmvalcore.preprocessor.PreprocessorFile, " + f"got {products}" + ) def ensemble_statistics(products, statistics, diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index 480169dc61..e93f935a8d 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -252,24 +252,6 @@ def test_multimodel_no_vertical_dimension(timeseries_cubes_month): multimodel_test(cubes, span=span, statistic='mean') -@pytest.mark.use_sample_data -@pytest.mark.xfail( - raises=iris.exceptions.MergeError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/956') -# @pytest.mark.xfail( -# raises=iris.exceptions.CoordinateNotFoundError, -# reason='https://github.com/ESMValGroup/ESMValCore/issues/891') -def test_multimodel_no_horizontal_dimension(timeseries_cubes_month): - """Test statistic without horizontal dimension using monthly data.""" - span = 'full' - cubes = timeseries_cubes_month - cubes = [cube[:, :, 0, 0] for cube in cubes] - # Coordinate not found error - # iris.exceptions.CoordinateNotFoundError: - # 'Expected to find exactly 1 depth coordinate, but found none.' - multimodel_test(cubes, span=span, statistic='mean') - - @pytest.mark.use_sample_data def test_multimodel_only_time_dimension(timeseries_cubes_month): """Test statistic without only the time dimension using monthly data.""" @@ -280,13 +262,27 @@ def test_multimodel_only_time_dimension(timeseries_cubes_month): @pytest.mark.use_sample_data -@pytest.mark.xfail( - raises=ValueError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/890') def test_multimodel_no_time_dimension(timeseries_cubes_month): - """Test statistic without time dimension using monthly data.""" + """Test statistic without time dimension using monthly data. + + Also remove air_pressure dimensions since this slightly differs across + cubes. See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ span = 'full' cubes = timeseries_cubes_month - cubes = [cube[0] for cube in cubes] - # ValueError: Cannot guess bounds for a coordinate of length 1. - multimodel_test(cubes, span=span, statistic='mean') + cubes = [cube[0, 0] for cube in cubes] + + result = multimodel_test(cubes, span=span, statistic='mean')['mean'] + assert result.shape == (3, 2) + + +@pytest.mark.use_sample_data +def test_multimodel_scalar_cubes(timeseries_cubes_month): + """Test statistic with scalar cubes.""" + span = 'full' + cubes = timeseries_cubes_month + cubes = [cube[0, 0, 0, 0] for cube in cubes] + + result = multimodel_test(cubes, span=span, statistic='mean')['mean'] + assert result.shape == () diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index 7d1f6bc2bc..de1f9a8855 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -10,8 +10,8 @@ import numpy as np import pytest from cf_units import Unit -from iris.coords import AuxCoord -from iris.cube import Cube +from iris.coords import AuxCoord, DimCoord +from iris.cube import Cube, CubeList import esmvalcore.preprocessor._multimodel as mm from esmvalcore.iris_helpers import date2num @@ -34,12 +34,32 @@ def assert_array_allclose(this, other): np.testing.assert_allclose(this, other) +@pytest.fixture +def cubes_with_arbitrary_dimensions(): + """Create cubes with non-standard dimensions.""" + a_coord = DimCoord([1, 2, 3], var_name='a') + b_coord = DimCoord([1], var_name='b') + s_coord = AuxCoord(0, var_name='s') + cube_kwargs = { + 'var_name': 'x', + 'dim_coords_and_dims': [(a_coord, 0), (b_coord, 1)], + 'aux_coords_and_dims': [(s_coord, ())], + } + + cubes = CubeList([ + Cube([[0.0], [0.0], [0.0]], **cube_kwargs), + Cube([[0.0], [2.0], [1.0]], **cube_kwargs), + Cube([[0.0], [4.0], [2.0]], **cube_kwargs), + ]) + + return cubes + + def timecoord(frequency, calendar='gregorian', offset='days since 1850-01-01', num=3): """Return a time coordinate with the given time points and calendar.""" - time_points = range(1, num + 1) if frequency == 'hourly': @@ -53,7 +73,7 @@ def timecoord(frequency, unit = Unit(offset, calendar=calendar) points = date2num(dates, unit) - return iris.coords.DimCoord(points, standard_name='time', units=unit) + return DimCoord(points, standard_name='time', units=unit) def generate_cube_from_dates( @@ -90,9 +110,9 @@ def generate_cube_from_dates( else: len_data = len(dates) unit = Unit(offset, calendar=calendar) - time = iris.coords.DimCoord(date2num(dates, unit), - standard_name='time', - units=unit) + time = DimCoord(date2num(dates, unit), + standard_name='time', + units=unit) data = np.array((fill_val, ) * len_data, dtype=np.float32) @@ -104,7 +124,6 @@ def generate_cube_from_dates( def get_cubes_for_validation_test(frequency, lazy=False): """Set up cubes used for testing multimodel statistics.""" - # Simple 1d cube with standard time cord cube1 = generate_cube_from_dates(frequency, lazy=lazy) @@ -127,7 +146,7 @@ def get_cubes_for_validation_test(frequency, lazy=False): def get_cube_for_equal_coords_test(num_cubes): - """Setup cubes with equal auxiliary coordinates.""" + """Set up cubes with equal auxiliary coordinates.""" cubes = [] for num in range(num_cubes): @@ -309,7 +328,6 @@ def test_lazy_data_inconsistent_times(span): This hits `_align`, which adds additional computations which must be lazy. """ - cubes = ( generate_cube_from_dates( [datetime(1850, i, 15, 0, 0, 0) for i in range(1, 10)], lazy=True), @@ -376,10 +394,8 @@ def test_get_consistent_time_unit(calendar1, calendar2, expected): @pytest.mark.parametrize('span', SPAN_OPTIONS) def test_align(span): """Test _align function.""" - # TODO --> check that if a cube is extended, # the extended points are masked (not NaN!) - len_data = 3 cubes = [] @@ -859,7 +875,7 @@ def test_remove_fx_variables(): def test_no_warn_model_dim_non_contiguous(recwarn): """Test that now warning is raised that model dim is non-contiguous.""" - coord = iris.coords.DimCoord( + coord = DimCoord( [0.5, 1.5], bounds=[[0, 1.], [1., 2.]], standard_name='time', @@ -921,3 +937,62 @@ def test_preserve_equal_coordinates(): assert stat_cube.coord('x').standard_name is None assert stat_cube.coord('x').long_name is None assert stat_cube.coord('x').attributes == {} + + +def test_arbitrary_dims_2d(cubes_with_arbitrary_dimensions): + """Test ``multi_model_statistics`` with arbitrary dimensions.""" + stat_cubes = multi_model_statistics( + cubes_with_arbitrary_dimensions, + span='overlap', + statistics=['sum'], + ) + assert len(stat_cubes) == 1 + assert 'sum' in stat_cubes + stat_cube = stat_cubes['sum'] + assert stat_cube.shape == (3, 1) + assert_array_allclose(stat_cube.data, np.ma.array([[0.0], [6.0], [3.0]])) + + +def test_arbitrary_dims_1d_1(cubes_with_arbitrary_dimensions): + """Test ``multi_model_statistics`` with arbitrary dimensions.""" + cubes = [cube[0] for cube in cubes_with_arbitrary_dimensions] + stat_cubes = multi_model_statistics( + cubes, + span='overlap', + statistics=['sum'], + ) + assert len(stat_cubes) == 1 + assert 'sum' in stat_cubes + stat_cube = stat_cubes['sum'] + assert stat_cube.shape == (1,) + assert_array_allclose(stat_cube.data, np.ma.array([0.0])) + + +def test_arbitrary_dims_1d_3(cubes_with_arbitrary_dimensions): + """Test ``multi_model_statistics`` with arbitrary dimensions.""" + cubes = [cube[:, 0] for cube in cubes_with_arbitrary_dimensions] + stat_cubes = multi_model_statistics( + cubes, + span='overlap', + statistics=['sum'], + ) + assert len(stat_cubes) == 1 + assert 'sum' in stat_cubes + stat_cube = stat_cubes['sum'] + assert stat_cube.shape == (3,) + assert_array_allclose(stat_cube.data, np.ma.array([0.0, 6.0, 3.0])) + + +def test_arbitrary_dims_0d(cubes_with_arbitrary_dimensions): + """Test ``multi_model_statistics`` with arbitrary dimensions.""" + cubes = [cube[0, 0] for cube in cubes_with_arbitrary_dimensions] + stat_cubes = multi_model_statistics( + cubes, + span='overlap', + statistics=['sum'], + ) + assert len(stat_cubes) == 1 + assert 'sum' in stat_cubes + stat_cube = stat_cubes['sum'] + assert stat_cube.shape == () + assert_array_allclose(stat_cube.data, np.ma.array(0.0)) From 7872f25b0c7106f774d175b8fcfb3c0e09ec8b97 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 18 Nov 2022 12:13:04 +0100 Subject: [PATCH 02/10] Re-added test for expected MergeError --- .../multimodel_statistics/test_multimodel.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index e93f935a8d..dd2e3bf739 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -9,6 +9,7 @@ import iris import numpy as np import pytest +from iris.exceptions import MergeError from esmvalcore.preprocessor import extract_time from esmvalcore.preprocessor._multimodel import multi_model_statistics @@ -28,7 +29,7 @@ pytest.param( 'proleptic_gregorian', marks=pytest.mark.xfail( - raises=iris.exceptions.MergeError, + raises=MergeError, reason='https://github.com/ESMValGroup/ESMValCore/issues/956')), pytest.param( 'julian', @@ -214,7 +215,7 @@ def multimodel_regression_test(cubes, span, name): @pytest.mark.xfail( - raises=iris.exceptions.MergeError, + raises=MergeError, reason='https://github.com/ESMValGroup/ESMValCore/issues/956') @pytest.mark.use_sample_data @pytest.mark.parametrize('span', SPAN_PARAMS) @@ -252,6 +253,19 @@ def test_multimodel_no_vertical_dimension(timeseries_cubes_month): multimodel_test(cubes, span=span, statistic='mean') +@pytest.mark.use_sample_data +def test_multimodel_merge_error(timeseries_cubes_month): + """Test statistic with slightly different vertical coordinates. + + See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ + span = 'full' + cubes = timeseries_cubes_month + with pytest.raises(MergeError): + multimodel_test(cubes, span=span, statistic='mean') + + @pytest.mark.use_sample_data def test_multimodel_only_time_dimension(timeseries_cubes_month): """Test statistic without only the time dimension using monthly data.""" From 99b0589dec7a47cdea35604b78b8a5b8174dcde6 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 18 Nov 2022 12:32:12 +0100 Subject: [PATCH 03/10] Updated doc --- doc/recipe/preprocessor.rst | 25 +++++------ esmvalcore/preprocessor/_multimodel.py | 5 ++- .../_multimodel/test_multimodel.py | 41 +++++++++++++++++++ 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index 6b372b9143..1abb54f145 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -1007,14 +1007,13 @@ of argument parameters passed to ``multi_model_statistics``. Percentiles can be specified like ``p1.5`` or ``p95``. The decimal point will be replaced by a dash in the output file. -Restrictive computation is also available by excluding any set of models that +Restrictive computation is also available by excluding any set of models that the user will not want to include in the statistics (by setting ``exclude: -[excluded models list]`` argument). The implementation has a few restrictions -that apply to the input data: model datasets must have consistent shapes, apart -from the time dimension; and cubes with more than four dimensions (time, -vertical axis, two horizontal axes) are not supported. +[excluded models list]`` argument). -Input datasets may have different time coordinates. Statistics can be computed +Input datasets may have different time coordinates. +Apart from that, all dimensions must match. +Statistics can be computed across overlapping times only (``span: overlap``) or across the full time span of the combined models (``span: full``). The preprocessor sets a common time coordinate on all datasets. As the number of days in a year may vary between @@ -1038,10 +1037,6 @@ parameter ``keep_input_datasets`` to ``false`` (default value is ``true``). exclude: [NCEP] keep_input_datasets: false -Input datasets may have different time coordinates. The multi-model statistics -preprocessor sets a common time coordinate on all datasets. As the number of -days in a year may vary between calendars, (sub-)daily data are not supported. - Multi-model statistics also supports a ``groupby`` argument. You can group by any dataset key (``project``, ``experiment``, etc.) or a combination of keys in a list. You can also add an arbitrary tag to a dataset definition and then group by that tag. When @@ -1967,11 +1962,11 @@ See also :func:`esmvalcore.preprocessor.detrend`. Rolling window statistics ========================= -One can calculate rolling window statistics using the -preprocessor function ``rolling_window_statistics``. +One can calculate rolling window statistics using the +preprocessor function ``rolling_window_statistics``. This function takes three parameters: -* ``coordinate``: coordinate over which the rolling-window statistics is +* ``coordinate``: coordinate over which the rolling-window statistics is calculated. * ``operator``: operation to apply. Accepted values are 'mean', 'median', @@ -1980,12 +1975,12 @@ This function takes three parameters: * ``window_length``: size of the rolling window to use (number of points). This example applied on daily precipitation data calculates two-day rolling -precipitation sum. +precipitation sum. .. code-block:: yaml preprocessors: - preproc_rolling_window: + preproc_rolling_window: coordinate: time operator: sum window_length: 2 diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index db1e8f215a..08abcdc2a1 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -486,8 +486,9 @@ def multi_model_statistics(products, workflow and provenance information, and this option should typically be ignored. - Cubes must have consistent shapes. There are two options to combine time - coordinates of different lengths, see the ``span`` argument. + Cubes must have consistent shapes apart from a potential time dimensions. + There are two options to combine time coordinates of different lengths, see + the ``span`` argument. Uses the statistical operators in :py:mod:`iris.analysis`, including ``mean``, ``median``, ``min``, ``max``, and ``std``. Percentiles are also diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index de1f9a8855..84884d8f7d 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -55,6 +55,30 @@ def cubes_with_arbitrary_dimensions(): return cubes +@pytest.fixture +def cubes_5d(): + """Create 5d cubes.""" + a_coord = DimCoord([1], var_name='a') + b_coord = DimCoord([1], var_name='b') + c_coord = DimCoord([1], var_name='c') + d_coord = DimCoord([1], var_name='d') + e_coord = DimCoord([1], var_name='e') + coord_spec = [ + (a_coord, 0), + (b_coord, 1), + (c_coord, 2), + (d_coord, 3), + (e_coord, 4), + ] + + cubes = CubeList([ + Cube(np.full((1, 1, 1, 1, 1), 1.0), dim_coords_and_dims=coord_spec), + Cube(np.full((1, 1, 1, 1, 1), 2.0), dim_coords_and_dims=coord_spec), + ]) + + return cubes + + def timecoord(frequency, calendar='gregorian', offset='days since 1850-01-01', @@ -939,6 +963,23 @@ def test_preserve_equal_coordinates(): assert stat_cube.coord('x').attributes == {} +def test_arbitrary_dims_5d(cubes_5d): + """Test ``multi_model_statistics`` with 5D cubes.""" + stat_cubes = multi_model_statistics( + cubes_5d, + span='overlap', + statistics=['sum'], + ) + assert len(stat_cubes) == 1 + assert 'sum' in stat_cubes + stat_cube = stat_cubes['sum'] + assert stat_cube.shape == (1, 1, 1, 1, 1) + assert_array_allclose( + stat_cube.data, + np.ma.array(np.full((1, 1, 1, 1, 1), 3.0)), + ) + + def test_arbitrary_dims_2d(cubes_with_arbitrary_dimensions): """Test ``multi_model_statistics`` with arbitrary dimensions.""" stat_cubes = multi_model_statistics( From b41043a2e7a15db6cc0ac3140c7891f56b289d36 Mon Sep 17 00:00:00 2001 From: Manuel Schlund <32543114+schlunma@users.noreply.github.com> Date: Mon, 21 Nov 2022 10:55:07 +0100 Subject: [PATCH 04/10] Update esmvalcore/preprocessor/_multimodel.py Co-authored-by: Bettina Gier --- esmvalcore/preprocessor/_multimodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 45571597fa..34bfa0a9fa 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -486,7 +486,7 @@ def multi_model_statistics(products, workflow and provenance information, and this option should typically be ignored. - Cubes must have consistent shapes apart from a potential time dimensions. + Cubes must have consistent shapes apart from a potential time dimension. There are two options to combine time coordinates of different lengths, see the ``span`` argument. From 7928b8d38c0ac08b024984f9ee94b1356bec61e5 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 22 Nov 2022 10:36:08 +0100 Subject: [PATCH 05/10] Simplify time handling --- esmvalcore/preprocessor/_multimodel.py | 71 +++++++++++++------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index 34bfa0a9fa..b7da5d36d9 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -18,6 +18,7 @@ import iris.coord_categorisation import numpy as np from iris.cube import Cube, CubeList +from iris.exceptions import MergeError from iris.util import equalise_attributes from esmvalcore.iris_helpers import date2num @@ -205,35 +206,15 @@ def _map_to_new_time(cube, time_points): return new_cube -def _align(cubes, span): - """Expand or subset cubes so they share a common time span. +def _align_time_coord(cubes, span): + """Expand or subset cubes so they share a common time span.""" + _unify_time_coordinates(cubes) - Note - ---- - Cubes with no time dimension are ignored here. If some cubes have a time - dimension and some do not, this will raise an error at a later stage. - - """ - time_cubes = CubeList( - [cube for cube in cubes if cube.coords('time')] - ) - no_time_cubes = CubeList( - [cube for cube in cubes if not cube.coords('time')] - ) - - # If no cubes with time dimension are given, we are done here - if not time_cubes: + if _time_coords_are_aligned(cubes): return cubes - # Unify time coordinates for cubes with time dimension; if all of them are - # equal afterwards, we are also done - _unify_time_coordinates(time_cubes) - if _time_coords_are_aligned(time_cubes): - return no_time_cubes + time_cubes + all_time_arrays = [cube.coord('time').points for cube in cubes] - # If time dimensions have different lengths, equalize them according to the - # 'span' option - all_time_arrays = [cube.coord('time').points for cube in time_cubes] if span == 'overlap': new_time_points = reduce(np.intersect1d, all_time_arrays) elif span == 'full': @@ -241,16 +222,14 @@ def _align(cubes, span): else: raise ValueError(f"Invalid argument for span: {span!r}" "Must be one of 'overlap', 'full'.") - new_time_cubes = [ - _map_to_new_time(cube, new_time_points) for cube in time_cubes - ] - # Make sure time bounds exist and are consistent - for cube in new_time_cubes: + new_cubes = [_map_to_new_time(cube, new_time_points) for cube in cubes] + + for cube in new_cubes: + # Make sure bounds exist and are consistent _guess_time_bounds(cube) - # Return all cubes - return no_time_cubes + new_time_cubes + return new_cubes def _equalise_cell_methods(cubes): @@ -320,7 +299,13 @@ def _combine(cubes): cubes = CubeList(cubes) - merged_cube = cubes.merge_cube() + try: + merged_cube = cubes.merge_cube() + except MergeError as exc: + raise ValueError( + "Multi-model statistics failed to merge input cubes into a single " + "array" + ) from exc return merged_cube @@ -424,9 +409,25 @@ def _multicube_statistics(cubes, statistics, span): raise ValueError('Cannot perform multicube statistics ' 'for a single cube.') - copied_cubes = [cube.copy() for cube in cubes] # avoid modifying inputs - aligned_cubes = _align(copied_cubes, span=span) + # Avoid modifying inputs + copied_cubes = [cube.copy() for cube in cubes] + + # If all cubes contain a time coordinate, align them. If only some of them + # contain a time coordinate, raise an exception. If no cube contains a time + # coordinate, do nothing. + time_coords = [cube.coords('time') for cube in cubes] + if any(time_coords) and not all(time_coords): + raise ValueError( + "Multi-model statistics failed to merge input cubes into a single " + "array: some cubes have a 'time' dimension, some do not have a " + "'time' dimension." + ) + if all(time_coords): + aligned_cubes = _align_time_coord(copied_cubes, span=span) + else: + aligned_cubes = copied_cubes + # Calculate statistics statistics_cubes = {} for statistic in statistics: logger.debug('Multicube statistics: computing: %s', statistic) From 9527f80e949de8aacfbdf1811f1f6af7f3a12e34 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 22 Nov 2022 10:36:30 +0100 Subject: [PATCH 06/10] Expanded tests --- .../multimodel_statistics/test_multimodel.py | 174 +++++++++++++----- .../_multimodel/test_multimodel.py | 22 ++- 2 files changed, 147 insertions(+), 49 deletions(-) diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index cddaa4d8f2..f06cde40d3 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -10,7 +10,6 @@ import iris import numpy as np import pytest -from iris.exceptions import MergeError from esmvalcore.preprocessor import extract_time from esmvalcore.preprocessor._multimodel import multi_model_statistics @@ -20,24 +19,6 @@ # Increase this number anytime you change the cached input data to the tests. TEST_REVISION = 1 -CALENDAR_PARAMS = ( - pytest.param( - '360_day', - marks=pytest.mark.skip( - reason='Cannot calculate statistics with single cube in list')), - '365_day', - 'standard' if cf_units.__version__ >= '3.1' else 'gregorian', - pytest.param( - 'proleptic_gregorian', - marks=pytest.mark.xfail( - raises=MergeError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/956')), - pytest.param( - 'julian', - marks=pytest.mark.skip( - reason='Cannot calculate statistics with single cube in list')), -) - SPAN_PARAMS = ('overlap', 'full') @@ -215,42 +196,93 @@ def multimodel_regression_test(cubes, span, name): raise RuntimeError(f'Wrote reference data to {filename.absolute()}') -@pytest.mark.xfail( - raises=MergeError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/956') @pytest.mark.use_sample_data @pytest.mark.parametrize('span', SPAN_PARAMS) def test_multimodel_regression_month(timeseries_cubes_month, span): - """Test statistic.""" + """Test statistic fail due to differing input coordinates (pressure). + + See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ cubes = timeseries_cubes_month name = 'timeseries_monthly' - multimodel_regression_test( - name=name, - span=span, - cubes=cubes, + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array" ) + with pytest.raises(ValueError, match=msg): + multimodel_regression_test(name=name, span=span, cubes=cubes) + + +@pytest.mark.use_sample_data +@pytest.mark.parametrize('span', SPAN_PARAMS) +def test_multimodel_regression_day_standard(timeseries_cubes_day, span): + """Test statistic.""" + calendar = 'standard' if cf_units.__version__ >= '3.1' else 'gregorian' + cubes = timeseries_cubes_day[calendar] + name = f'timeseries_daily_{calendar}' + multimodel_regression_test(name=name, span=span, cubes=cubes) @pytest.mark.use_sample_data -@pytest.mark.parametrize('calendar', CALENDAR_PARAMS) @pytest.mark.parametrize('span', SPAN_PARAMS) -def test_multimodel_regression_day(timeseries_cubes_day, span, calendar): +def test_multimodel_regression_day_365_day(timeseries_cubes_day, span): """Test statistic.""" + calendar = '365_day' cubes = timeseries_cubes_day[calendar] name = f'timeseries_daily_{calendar}' - multimodel_regression_test( - name=name, - span=span, - cubes=cubes, + multimodel_regression_test(name=name, span=span, cubes=cubes) + + +@pytest.mark.skip( + reason='Cannot calculate statistics with single cube in list' +) +@pytest.mark.use_sample_data +@pytest.mark.parametrize('span', SPAN_PARAMS) +def test_multimodel_regression_day_360_day(timeseries_cubes_day, span): + """Test statistic.""" + calendar = '360_day' + cubes = timeseries_cubes_day[calendar] + name = f'timeseries_daily_{calendar}' + multimodel_regression_test(name=name, span=span, cubes=cubes) + + +@pytest.mark.skip( + reason='Cannot calculate statistics with single cube in list' +) +@pytest.mark.use_sample_data +@pytest.mark.parametrize('span', SPAN_PARAMS) +def test_multimodel_regression_day_julian(timeseries_cubes_day, span): + """Test statistic.""" + calendar = 'julian' + cubes = timeseries_cubes_day[calendar] + name = f'timeseries_daily_{calendar}' + multimodel_regression_test(name=name, span=span, cubes=cubes) + + +@pytest.mark.use_sample_data +@pytest.mark.parametrize('span', SPAN_PARAMS) +def test_multimodel_regression_day_proleptic_gregorian( + timeseries_cubes_day, + span, +): + """Test statistic.""" + calendar = 'proleptic_gregorian' + cubes = timeseries_cubes_day[calendar] + name = f'timeseries_daily_{calendar}' + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array" ) + with pytest.raises(ValueError, match=msg): + multimodel_regression_test(name=name, span=span, cubes=cubes) @pytest.mark.use_sample_data def test_multimodel_no_vertical_dimension(timeseries_cubes_month): """Test statistic without vertical dimension using monthly data.""" span = 'full' - cubes = timeseries_cubes_month - cubes = [cube[:, 0] for cube in cubes] + cubes = [cube[:, 0] for cube in timeseries_cubes_month] multimodel_test(cubes, span=span, statistic='mean') @@ -263,16 +295,15 @@ def test_multimodel_merge_error(timeseries_cubes_month): """ span = 'full' cubes = timeseries_cubes_month - with pytest.raises(MergeError): + with pytest.raises(ValueError): multimodel_test(cubes, span=span, statistic='mean') @pytest.mark.use_sample_data def test_multimodel_only_time_dimension(timeseries_cubes_month): """Test statistic without only the time dimension using monthly data.""" - cubes = timeseries_cubes_month span = 'full' - cubes = [cube[:, 0, 0, 0] for cube in cubes] + cubes = [cube[:, 0, 0, 0] for cube in timeseries_cubes_month] multimodel_test(cubes, span=span, statistic='mean') @@ -285,8 +316,7 @@ def test_multimodel_no_time_dimension(timeseries_cubes_month): """ span = 'full' - cubes = timeseries_cubes_month - cubes = [cube[0, 0] for cube in cubes] + cubes = [cube[0, 0] for cube in timeseries_cubes_month] result = multimodel_test(cubes, span=span, statistic='mean')['mean'] assert result.shape == (3, 2) @@ -296,8 +326,68 @@ def test_multimodel_no_time_dimension(timeseries_cubes_month): def test_multimodel_scalar_cubes(timeseries_cubes_month): """Test statistic with scalar cubes.""" span = 'full' - cubes = timeseries_cubes_month - cubes = [cube[0, 0, 0, 0] for cube in cubes] + cubes = [cube[0, 0, 0, 0] for cube in timeseries_cubes_month] result = multimodel_test(cubes, span=span, statistic='mean')['mean'] assert result.shape == () + + +@pytest.mark.use_sample_data +def test_multimodel_0d_and_1d_time_dimensions(timeseries_cubes_month): + """Test statistic fail on 0D and 1D time dimension using monthly data. + + Also remove air_pressure dimensions since this slightly differs across + cubes. See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ + span = 'full' + cubes = [cube[:, 0] for cube in timeseries_cubes_month] # remove Z-dim + cubes[1] = cubes[1][0] # use 0D time dim for one cube + + msg = "Tried to align cubes in multi-model statistics, but failed for cube" + with pytest.raises(ValueError, match=msg): + multimodel_test(cubes, span=span, statistic='mean') + + +@pytest.mark.use_sample_data +def test_multimodel_only_some_time_dimensions(timeseries_cubes_month): + """Test statistic fail if only some cubes have time dimension. + + Also remove air_pressure dimensions since this slightly differs across + cubes. See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ + span = 'full' + cubes = [cube[:, 0] for cube in timeseries_cubes_month] # remove Z-dim + + # Remove time dimension for one cube + cubes[1] = cubes[1][0] + cubes[1].remove_coord('time') + + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array: some cubes have a 'time' dimension, some do not have a 'time' " + "dimension." + ) + with pytest.raises(ValueError, match=msg): + multimodel_test(cubes, span=span, statistic='mean') + + +@pytest.mark.use_sample_data +def test_multimodel_0d_different_time_dimensions(timeseries_cubes_month): + """Test statistic fail on different scalar time dimensions. + + Also remove air_pressure dimensions since this slightly differs across + cubes. See https://github.com/ESMValGroup/ESMValCore/issues/956. + + """ + span = 'full' + cubes = [cube[0, 0] for cube in timeseries_cubes_month] + + # Use different scalar time point and bounds for one cube + cubes[1].coord('time').points = 20.0 + cubes[1].coord('time').bounds = [0.0, 40.0] + + msg = "Tried to align cubes in multi-model statistics, but failed for cube" + with pytest.raises(ValueError, match=msg): + multimodel_test(cubes, span=span, statistic='mean') diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index a782178e36..0625813916 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -430,7 +430,7 @@ def test_align(span): len_data=3) cubes.append(cube) - result_cubes = mm._align(cubes, span) + result_cubes = mm._align_time_coord(cubes, span) calendars = set(cube.coord('time').units.calendar for cube in result_cubes) @@ -479,8 +479,12 @@ def test_combine_different_shape_fail(): cube = generate_cube_from_dates('monthly', '360_day', len_data=num) cubes.append(cube) - with pytest.raises(iris.exceptions.MergeError): - _ = mm._combine(cubes) + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array" + ) + with pytest.raises(ValueError, match=msg): + mm._combine(cubes) def test_combine_inconsistent_var_names_fail(): @@ -494,8 +498,12 @@ def test_combine_inconsistent_var_names_fail(): var_name=f'test_var_{num}') cubes.append(cube) - with pytest.raises(iris.exceptions.MergeError): - _ = mm._combine(cubes) + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array" + ) + with pytest.raises(ValueError, match=msg): + mm._combine(cubes) @pytest.mark.parametrize('scalar_coord', ['p0', 'ptop']) @@ -859,7 +867,7 @@ def test_daily_inconsistent_calendars(): cubes = [leapcube, noleapcube] # span=full - aligned_cubes = mm._align(cubes, span='full') + aligned_cubes = mm._align_time_coord(cubes, span='full') for cube in aligned_cubes: assert cube.coord('time').units.calendar in ("standard", "gregorian") assert cube.shape == (367, ) @@ -872,7 +880,7 @@ def test_daily_inconsistent_calendars(): assert result_cube[366].data == 1 # outside original range # span=overlap - aligned_cubes = mm._align(cubes, span='overlap') + aligned_cubes = mm._align_time_coord(cubes, span='overlap') for cube in aligned_cubes: assert cube.coord('time').units.calendar in ("standard", "gregorian") assert cube.shape == (365, ) From 3cdae75ce563bd75f9420f4ef0fe9cf495337577 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 22 Nov 2022 10:47:48 +0100 Subject: [PATCH 07/10] Optimized tests --- tests/sample_data/multimodel_statistics/test_multimodel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index f06cde40d3..5add64d043 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -295,7 +295,11 @@ def test_multimodel_merge_error(timeseries_cubes_month): """ span = 'full' cubes = timeseries_cubes_month - with pytest.raises(ValueError): + msg = ( + "Multi-model statistics failed to merge input cubes into a single " + "array" + ) + with pytest.raises(ValueError, match=msg): multimodel_test(cubes, span=span, statistic='mean') From c1c0e259030f25707a80500830efcf241dd09499 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 22 Nov 2022 10:55:54 +0100 Subject: [PATCH 08/10] Simplified logic --- esmvalcore/preprocessor/_multimodel.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index b7da5d36d9..e65f10e560 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -412,20 +412,19 @@ def _multicube_statistics(cubes, statistics, span): # Avoid modifying inputs copied_cubes = [cube.copy() for cube in cubes] - # If all cubes contain a time coordinate, align them. If only some of them - # contain a time coordinate, raise an exception. If no cube contains a time - # coordinate, do nothing. + # If all cubes contain a time coordinate, align them. If no cube contains a + # time coordinate, do nothing. Else, raise an exception time_coords = [cube.coords('time') for cube in cubes] - if any(time_coords) and not all(time_coords): + if all(time_coords): + aligned_cubes = _align_time_coord(copied_cubes, span=span) + elif not any(time_coords): + aligned_cubes = copied_cubes + else: raise ValueError( "Multi-model statistics failed to merge input cubes into a single " "array: some cubes have a 'time' dimension, some do not have a " "'time' dimension." ) - if all(time_coords): - aligned_cubes = _align_time_coord(copied_cubes, span=span) - else: - aligned_cubes = copied_cubes # Calculate statistics statistics_cubes = {} From 4dd15bfc6d9e949dc8601cedf8b319bb99ebcf4c Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 22 Nov 2022 14:14:48 +0100 Subject: [PATCH 09/10] suggestions from code review --- esmvalcore/preprocessor/_multimodel.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index e65f10e560..aaee7a2cd4 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -150,6 +150,10 @@ def _guess_time_bounds(cube): try: cube.coord('time').guess_bounds() except ValueError: # Time has length 1 --> guessing bounds not possible + logger.debug( + "Encountered scalar time coordinate in multi_model_statistics: " + "cannot determine its bounds" + ) return @@ -302,9 +306,12 @@ def _combine(cubes): try: merged_cube = cubes.merge_cube() except MergeError as exc: + # Note: str(exc) starts with "failed to merge into a single cube.\n" + # --> remove this here for clear error message + msg = "\n".join(str(exc).split('\n')[1:]) raise ValueError( - "Multi-model statistics failed to merge input cubes into a single " - "array" + f"Multi-model statistics failed to merge input cubes into a " + f"single array:\n{cubes}\n{msg}" ) from exc return merged_cube From 11835220c840a4fed6760af2511d12124d0463e4 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Wed, 23 Nov 2022 11:03:41 +0100 Subject: [PATCH 10/10] no try/except --- esmvalcore/preprocessor/_multimodel.py | 7 +++---- tests/sample_data/multimodel_statistics/test_multimodel.py | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index aaee7a2cd4..6fb5047fde 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -147,14 +147,13 @@ def _unify_time_coordinates(cubes): def _guess_time_bounds(cube): """Guess time bounds if possible.""" cube.coord('time').bounds = None - try: - cube.coord('time').guess_bounds() - except ValueError: # Time has length 1 --> guessing bounds not possible + if cube.coord('time').shape == (1,): logger.debug( "Encountered scalar time coordinate in multi_model_statistics: " "cannot determine its bounds" ) - return + else: + cube.coord('time').guess_bounds() def _time_coords_are_aligned(cubes): diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index 5add64d043..f69c424d3d 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -334,6 +334,7 @@ def test_multimodel_scalar_cubes(timeseries_cubes_month): result = multimodel_test(cubes, span=span, statistic='mean')['mean'] assert result.shape == () + assert result.coord('time').bounds is None @pytest.mark.use_sample_data