Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
8e914e7
Fix calendar units to 'days since 1850-01-01' on a standard calendar
Peter9192 Jun 23, 2020
d6b9947
More thorough check on source time frequency; align behaviour with re…
Peter9192 Jun 23, 2020
12f9aa4
Simplify _get_overlap
Peter9192 Jun 23, 2020
bf82085
Align behaviour for union (full) and intersection (overlap) of time a…
Peter9192 Jun 23, 2020
3bd1e82
Add function to make all cubes use the same calendar.
Peter9192 Jun 23, 2020
7841a5f
Remove _datetime_to_int_days, as we can just use the time points
Peter9192 Jun 23, 2020
f95aa7c
Simplify and rename _slice_cube
Peter9192 Jun 23, 2020
db47522
Align _assemble_overlap_data more with _assemble_full_data
Peter9192 Jun 23, 2020
127c69b
Align _assemble_full_data more with _assemble_overlap_data
Peter9192 Jun 23, 2020
f873ea5
Further align assemble full and overlap data
Peter9192 Jun 24, 2020
3b7b55b
Merge assemble full and overlap data.
Peter9192 Jun 24, 2020
168a701
Further simplify
Peter9192 Jun 24, 2020
629a9d6
Remove stuff about bounds and aux coords as it is not used anyway
Peter9192 Jun 24, 2020
f538fa8
Remove stuff about bounds and aux coords as it is not used anyway
Peter9192 Jun 24, 2020
b0cc1ae
Clean up tests and add tests for new functions
Peter9192 Jun 24, 2020
aef2578
Valeriu's suggestions
Peter9192 Jun 29, 2020
24294df
fix tests
Peter9192 Jun 29, 2020
d3a4ddd
Merge remote-tracking branch 'origin/master' into mmstats_simplify_time
Peter9192 Jul 2, 2020
35210a5
Realize data before making time slices
Peter9192 Jul 2, 2020
d2fddeb
Avoid codacy 'pointless-statement' message
Peter9192 Jul 2, 2020
d028b28
Merge remote-tracking branch 'origin/master' into mmstats_simplify_time
Peter9192 Aug 4, 2020
cc51e15
Address Bouwe's comments
Peter9192 Aug 4, 2020
7c182a2
Update esmvalcore/preprocessor/_multimodel.py
Peter9192 Aug 5, 2020
a663ea6
Don't change the calendar if not necessary
Peter9192 Aug 7, 2020
2e363c9
Merge branch 'mmstats_simplify_time' of github.com:ESMValGroup/ESMVal…
Peter9192 Aug 7, 2020
c8b5f4b
Use template cube's calendar and don't slice by time
Peter9192 Aug 7, 2020
fcb9955
Use num2date rather than cells() method
Peter9192 Aug 10, 2020
46337f7
Apply suggestions from code review
Peter9192 Aug 10, 2020
4e78224
Merge branch 'mmstats_simplify_time' of github.com:ESMValGroup/ESMVal…
Peter9192 Aug 10, 2020
c03358f
fix bug for monthly data in unify time
Peter9192 Sep 8, 2020
efa6372
add time bounds to output cube
Peter9192 Sep 30, 2020
64bc2fe
Merge remote-tracking branch 'origin/master' into mmstats_simplify_time
Peter9192 Jan 5, 2021
bb8e9c5
Add missing var_name attribute to time coordinate
Peter9192 Jan 7, 2021
13ac659
Force regrid of daily data and be more consistent with regrid time.
Peter9192 Jan 7, 2021
1becb92
Add long name to new time coordinates
Peter9192 Jan 7, 2021
2bd2d62
Temporarily bypass coordinate check
Peter9192 Jan 12, 2021
7f5ace8
Merge remote-tracking branch 'origin/master' into mmstats_simplify_time
Peter9192 Jan 13, 2021
df689b8
Revert "Temporarily bypass coordinate check"
Peter9192 Jan 13, 2021
395a675
Update reference data
Peter9192 Jan 13, 2021
ccadc6e
Typo
Peter9192 Jan 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/recipe/preprocessor.rst
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,10 @@ from a statistical point of view, this is needed since weights are not yet
implemented; also higher dimensional data is not supported (i.e. anything with
dimensionality higher than four: time, vertical axis, two horizontal axes).

Input datasets may have different time coordinates. The multi-model statistics
preprocessor sets a common time coordinate on all datasets. As the number of
days in a year may vary between calendars, (sub-)daily data are not supported.

.. code-block:: yaml

preprocessors:
Expand Down
279 changes: 108 additions & 171 deletions esmvalcore/preprocessor/_multimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
It operates on different (time) spans:
- full: computes stats on full dataset time;
- overlap: computes common time overlap between datasets;

"""

import logging
Expand All @@ -22,26 +21,14 @@
import numpy as np
import scipy

from ._time import regrid_time

logger = logging.getLogger(__name__)


def _get_time_offset(time_unit):
"""Return a datetime object equivalent to tunit."""
# tunit e.g. 'day since 1950-01-01 00:00:00.0000000 UTC'
cfunit = cf_units.Unit(time_unit, calendar=cf_units.CALENDAR_STANDARD)
time_offset = cfunit.num2date(0)
return time_offset


def _plev_fix(dataset, pl_idx):
"""Extract valid plev data.

this function takes care of situations
in which certain plevs are completely
masked due to unavailable interpolation
boundaries.
This function takes care of situations in which certain plevs are
completely masked due to unavailable interpolation boundaries.
"""
if np.ma.is_masked(dataset):
# keep only the valid plevs
Expand All @@ -60,9 +47,9 @@ def _plev_fix(dataset, pl_idx):
def _quantile(data, axis, quantile):
"""Calculate quantile.

Workaround for calling scipy's mquantiles with arrays of >2 dimensions
Similar to iris' _percentiles function, see their discussion:
https://github.com/SciTools/iris/pull/625
Workaround for calling scipy's mquantiles with arrays of >2
dimensions Similar to iris' _percentiles function, see their
discussion: https://github.com/SciTools/iris/pull/625
"""
# Ensure that the target axis is the last dimension.
data = np.rollaxis(data, axis, start=data.ndim)
Expand Down Expand Up @@ -114,7 +101,7 @@ def _compute_statistic(data, statistic_name):
# data is per time point
# so we can safely NOT compute stats for single points
if data.ndim == 1:
u_datas = [d for d in data]
u_datas = data
else:
u_datas = [d for d in data if not np.all(d.mask)]
if len(u_datas) > 1:
Expand Down Expand Up @@ -143,15 +130,14 @@ def _compute_statistic(data, statistic_name):

def _put_in_cube(template_cube, cube_data, statistic, t_axis):
"""Quick cube building and saving."""
if t_axis is None:
times = template_cube.coord('time')
else:
unit_name = template_cube.coord('time').units.name
tunits = cf_units.Unit(unit_name, calendar="standard")
times = iris.coords.DimCoord(t_axis,
standard_name='time',
units=tunits,
var_name='time')
tunits = template_cube.coord('time').units
times = iris.coords.DimCoord(t_axis,
standard_name='time',
units=tunits,
var_name='time',
long_name='time')
times.bounds = None
times.guess_bounds()

coord_names = [c.long_name for c in template_cube.coords()]
coord_names.extend([c.standard_name for c in template_cube.coords()])
Expand Down Expand Up @@ -202,158 +188,110 @@ def _put_in_cube(template_cube, cube_data, statistic, t_axis):
return stats_cube


def _datetime_to_int_days(cube):
"""Return list of int(days) converted from cube datetime cells."""
cube = _align_yearly_axes(cube)
time_cells = [cell.point for cell in cube.coord('time').cells()]

# extract date info
real_dates = []
for date_obj in time_cells:
# real_date resets the actual data point day
# to the 1st of the month so that there are no
# wrong overlap indices
real_date = datetime(date_obj.year, date_obj.month, 1, 0, 0, 0)
real_dates.append(real_date)

# get the number of days starting from the reference unit
time_unit = cube.coord('time').units.name
time_offset = _get_time_offset(time_unit)
days = [(date_obj - time_offset).days for date_obj in real_dates]
def _get_consistent_time_unit(cubes):
"""Return cubes' time unit if consistent, standard calendar otherwise."""
t_units = [cube.coord('time').units for cube in cubes]
if len(set(t_units)) == 1:
return t_units[0]
return cf_units.Unit("days since 1850-01-01", calendar="standard")

return days

def _unify_time_coordinates(cubes):
"""Make sure all cubes' share the same time coordinate.

def _align_yearly_axes(cube):
"""Perform a time-regridding operation to align time axes for yr data."""
years = [cell.point.year for cell in cube.coord('time').cells()]
# be extra sure that the first point is not in the previous year
if 0 not in np.diff(years):
return regrid_time(cube, 'yr')
return cube
This function extracts the date information from the cube and
reconstructs the time coordinate, resetting the actual dates to the
15th of the month or 1st of july for yearly data (consistent with
`regrid_time`), so that there are no mismatches in the time arrays.

If cubes have different time units, it will use reset the calendar to
a default gregorian calendar with unit "days since 1850-01-01".

def _get_overlap(cubes):
Might not work for (sub)daily data, because different calendars may have
different number of days in the year.
"""
Get discrete time overlaps.

This method gets the bounds of coord time
from the cube and assembles a continuous time
axis with smallest unit 1; then it finds the
overlaps by doing a 1-dim intersect;
takes the floor of first date and
ceil of last date.
"""
all_times = []
for cube in cubes:
span = _datetime_to_int_days(cube)
start, stop = span[0], span[-1]
all_times.append([start, stop])
bounds = [range(b[0], b[-1] + 1) for b in all_times]
time_pts = reduce(np.intersect1d, bounds)
if len(time_pts) > 1:
time_bounds_list = [time_pts[0], time_pts[-1]]
return time_bounds_list

t_unit = _get_consistent_time_unit(cubes)

def _slice_cube(cube, t_1, t_2):
"""
Efficient slicer.

Simple cube data slicer on indices
of common time-data elements.
"""
time_pts = [t for t in cube.coord('time').points]
converted_t = _datetime_to_int_days(cube)
idxs = sorted([
time_pts.index(ii) for ii, jj in zip(time_pts, converted_t)
if t_1 <= jj <= t_2
])
return [idxs[0], idxs[-1]]


def _monthly_t(cubes):
"""Rearrange time points for monthly data."""
# get original cubes tpoints
days = {day for cube in cubes for day in _datetime_to_int_days(cube)}
return sorted(days)


def _full_time_slice(cubes, ndat, indices, ndatarr, t_idx):
"""Construct a contiguous collection over time."""
for idx_cube, cube in enumerate(cubes):
# reset mask
ndat.mask = True
ndat[indices[idx_cube]] = cube.data
if np.ma.is_masked(cube.data):
ndat.mask[indices[idx_cube]] = cube.data.mask
for cube in cubes:
# Extract date info from cube
coord = cube.coord('time')
years = [p.year for p in coord.units.num2date(coord.points)]
months = [p.month for p in coord.units.num2date(coord.points)]
days = [p.day for p in coord.units.num2date(coord.points)]

# Reconstruct default calendar
if 0 not in np.diff(years):
# yearly data
dates = [datetime(year, 7, 1, 0, 0, 0) for year in years]

elif 0 not in np.diff(months):
# monthly data
dates = [
datetime(year, month, 15, 0, 0, 0)
for year, month in zip(years, months)
]
elif 0 not in np.diff(days):
# daily data
dates = [
datetime(year, month, day, 0, 0, 0)
for year, month, day in zip(years, months, days)
]
if coord.units != t_unit:
logger.warning(
"Multimodel encountered (sub)daily data and inconsistent "
"time units or calendars. Attempting to continue, but "
"might produce unexpected results.")
else:
ndat.mask[indices[idx_cube]] = False
ndatarr[idx_cube] = ndat[t_idx]

# return time slice
return ndatarr


def _assemble_overlap_data(cubes, interval, statistic):
"""Get statistical data in iris cubes for OVERLAP."""
start, stop = interval
sl_1, sl_2 = _slice_cube(cubes[0], start, stop)
stats_dats = np.ma.zeros(cubes[0].data[sl_1:sl_2 + 1].shape)
raise ValueError(
"Multimodel statistics preprocessor currently does not "
"support sub-daily data.")

# keep this outside the following loop
# this speeds up the code by a factor of 15
indices = [_slice_cube(cube, start, stop) for cube in cubes]

for i in range(stats_dats.shape[0]):
time_data = [
cube.data[indx[0]:indx[1] + 1][i]
for cube, indx in zip(cubes, indices)
]
stats_dats[i] = _compute_statistic(time_data, statistic)
stats_cube = _put_in_cube(cubes[0][sl_1:sl_2 + 1],
stats_dats,
statistic,
t_axis=None)
return stats_cube
# Update the cubes' time coordinate (both point values and the units!)
cube.coord('time').points = t_unit.date2num(dates)
cube.coord('time').units = t_unit
cube.coord('time').bounds = None
cube.coord('time').guess_bounds()


def _assemble_full_data(cubes, statistic):
"""Get statistical data in iris cubes for FULL."""
# all times, new MONTHLY data time axis
time_axis = [float(fl) for fl in _monthly_t(cubes)]
def _get_time_slice(cubes, time):
"""Fill time slice array with cubes' data if time in cube, else mask."""
time_slice = []
for cube in cubes:
cube_time = cube.coord('time').points
if time in cube_time:
idx = int(np.argwhere(cube_time == time))
subset = cube.data[idx]
else:
subset = np.ma.empty(list(cube.shape[1:]))
subset.mask = True
time_slice.append(subset)
return time_slice

# new big time-slice array shape
new_shape = [len(time_axis)] + list(cubes[0].shape[1:])

# assemble an array to hold all time data
# for all cubes; shape is (ncubes,(plev), lat, lon)
new_arr = np.ma.empty([len(cubes)] + list(new_shape[1:]), dtype='float32')
def _assemble_data(cubes, statistic, span='overlap'):
"""Get statistical data in iris cubes."""
# New time array representing the union or intersection of all cubes
time_spans = [cube.coord('time').points for cube in cubes]
if span == 'overlap':
new_times = reduce(np.intersect1d, time_spans)
elif span == 'full':
new_times = reduce(np.union1d, time_spans)
n_times = len(new_times)

# data array for stats computation
stats_dats = np.ma.zeros(new_shape, dtype='float32')
# Target array to populate with computed statistics
new_shape = [n_times] + list(cubes[0].shape[1:])
stats_data = np.ma.zeros(new_shape, dtype=np.dtype('float32'))

# assemble indices list to chop new_arr on
indices_list = []
# Realize all cubes at once instead of separately for each time slice
_ = [cube.data for cube in cubes]

# empty data array to hold time slices
empty_arr = np.ma.empty(new_shape, dtype='float32')
# Make time slices and compute stats
for i, time in enumerate(new_times):
time_data = _get_time_slice(cubes, time)
stats_data[i] = _compute_statistic(time_data, statistic)

# loop through cubes and populate empty_arr with points
for cube in cubes:
time_redone = _datetime_to_int_days(cube)
oidx = [time_axis.index(s) for s in time_redone]
indices_list.append(oidx)
for i in range(new_shape[0]):
# hold time slices only
new_datas_array = _full_time_slice(cubes, empty_arr, indices_list,
new_arr, i)
# list to hold time slices
time_data = []
for j in range(len(cubes)):
time_data.append(new_datas_array[j])
stats_dats[i] = _compute_statistic(time_data, statistic)
stats_cube = _put_in_cube(cubes[0], stats_dats, statistic, time_axis)
template = cubes[0]
stats_cube = _put_in_cube(template, stats_data, statistic, new_times)
return stats_cube


Expand Down Expand Up @@ -414,10 +352,14 @@ def multi_model_statistics(products, span, statistics, output_products=None):
cubes = products
statistic_products = {}

# Reset time coordinates and make cubes share the same calendar
_unify_time_coordinates(cubes)

if span == 'overlap':
# check if we have any time overlap
interval = _get_overlap(cubes)
if interval is None:
times = [cube.coord('time').points for cube in cubes]
overlap = reduce(np.intersect1d, times)
if len(overlap) <= 1:
logger.info("Time overlap between cubes is none or a single point."
"check datasets: will not compute statistics.")
return products
Expand All @@ -432,12 +374,7 @@ def multi_model_statistics(products, span, statistics, output_products=None):

for statistic in statistics:
# Compute statistic
if span == 'overlap':
statistic_cube = _assemble_overlap_data(cubes, interval, statistic)
elif span == 'full':
statistic_cube = _assemble_full_data(cubes, statistic)
statistic_cube.data = np.ma.array(statistic_cube.data,
dtype=np.dtype('float32'))
statistic_cube = _assemble_data(cubes, statistic, span)

if output_products:
# Add to output product and log provenance
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading