Add cube co-realisation. (#2967)

pp-mo · djkirkham · commit 3d692fd9c97e · 2018-03-09T10:58:34.000Z
diff --git a/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt b/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt
@@ -0,0 +1,3 @@
+* Added new function :func:`iris.co_realise_cubes` to compute multiple lazy
+  values in a single operation, avoiding repeated re-loading of data or
+  re-calculation of expressions.
diff --git a/lib/iris/__init__.py b/lib/iris/__init__.py
@@ -112,6 +112,7 @@ def callback(cube, field, filename):
 from iris._deprecation import IrisDeprecation, warn_deprecated
 import iris.fileformats
 import iris.io
+from iris._lazy_data import co_realise_cubes
 
 
 try:
@@ -127,7 +128,7 @@ def callback(cube, field, filename):
 __all__ = ['load', 'load_cube', 'load_cubes', 'load_raw',
            'save', 'Constraint', 'AttributeConstraint', 'sample_data_path',
            'site_configuration', 'Future', 'FUTURE',
-           'IrisDeprecation']
+           'IrisDeprecation', 'co_realise_cubes']
 
 
 Constraint = iris._constraints.Constraint
diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2017, Met Office
+# (C) British Crown Copyright 2017 - 2018, Met Office
 #
 # This file is part of Iris.
 #
@@ -102,6 +102,39 @@ def as_lazy_data(data, chunks=None, asarray=False):
     return data
 
 
+def _co_realise_lazy_arrays(arrays):
+    """
+    Compute multiple lazy arrays and return a list of real values.
+
+    All the arrays are computed together, so they can share results for common
+    graph elements.
+
+    Casts all results with `np.asanyarray`, and converts any MaskedConstants
+    appearing into masked arrays, to ensure that all return values are
+    writeable NumPy array objects.
+
+    Any non-lazy arrays are passed through, as they are by `da.compute`.
+    They undergo the same result standardisation.
+
+    """
+    computed_arrays = da.compute(*arrays)
+    results = []
+    for lazy_in, real_out in zip(arrays, computed_arrays):
+        # Ensure we always have arrays.
+        # Note : in some cases dask (and numpy) will return a scalar
+        # numpy.int/numpy.float object rather than an ndarray.
+        # Recorded in https://github.com/dask/dask/issues/2111.
+        real_out = np.asanyarray(real_out)
+        if isinstance(real_out, ma.core.MaskedConstant):
+            # Convert any masked constants into NumPy masked arrays.
+            # NOTE: in this case, also apply the original lazy-array dtype, as
+            # masked constants *always* have dtype float64.
+            real_out = ma.masked_array(real_out.data, mask=real_out.mask,
+                                       dtype=lazy_in.dtype)
+        results.append(real_out)
+    return results
+
+
 def as_concrete_data(data):
     """
     Return the actual content of a lazy array, as a numpy array.
@@ -120,14 +153,7 @@ def as_concrete_data(data):
 
     """
     if is_lazy_data(data):
-        # Realise dask array, ensuring the data result is always a NumPy array.
-        # In some cases dask may return a scalar numpy.int/numpy.float object
-        # rather than a numpy.ndarray object.
-        # Recorded in https://github.com/dask/dask/issues/2111.
-        dtype = data.dtype
-        data = np.asanyarray(data.compute())
-        if isinstance(data, ma.core.MaskedConstant):
-            data = ma.masked_array(data.data, dtype=dtype, mask=data.mask)
+        data, = _co_realise_lazy_arrays([data])
 
     return data
 
@@ -158,3 +184,39 @@ def multidim_lazy_stack(stack):
         result = da.stack([multidim_lazy_stack(subarray)
                            for subarray in stack])
     return result
+
+
+def co_realise_cubes(*cubes):
+    """
+    Fetch 'real' data for multiple cubes, in a shared calculation.
+
+    This computes any lazy data, equivalent to accessing each `cube.data`.
+    However, lazy calculations and data fetches can be shared between the
+    computations, improving performance.
+
+    Args:
+
+    * cubes (list of :class:`~iris.cube.Cube`):
+        Arguments, each of which is a cube to be realised.
+
+    For example::
+
+        # Form stats.
+        a_std = cube_a.collapsed(['x', 'y'], iris.analysis.STD_DEV)
+        b_std = cube_b.collapsed(['x', 'y'], iris.analysis.STD_DEV)
+        ab_mean_diff = (cube_b - cube_a).collapsed(['x', 'y'],
+                                                   iris.analysis.MEAN)
+        std_err = (a_std * a_std + b_std * b_std) ** 0.5
+
+        # Compute stats together (to avoid multiple data passes).
+        iris.co_realise_cubes(a_std, b_std, ab_mean_diff, std_err)
+
+
+    .. Note::
+
+        Cubes with non-lazy data may also be passed, with no ill effect.
+
+    """
+    results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes])
+    for cube, result in zip(cubes, results):
+        cube.data = result
diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py
@@ -0,0 +1,86 @@
+# (C) British Crown Copyright 2018, Met Office
+#
+# This file is part of Iris.
+#
+# Iris is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Iris is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with Iris.  If not, see <http://www.gnu.org/licenses/>.
+"""Test function :func:`iris._lazy data.co_realise_cubes`."""
+
+from __future__ import (absolute_import, division, print_function)
+from six.moves import (filter, input, map, range, zip)  # noqa
+
+# Import iris.tests first so that some things can be initialised before
+# importing anything else.
+import iris.tests as tests
+
+from mock import MagicMock
+import numpy as np
+
+from iris.cube import Cube
+from iris._lazy_data import as_lazy_data
+
+from iris._lazy_data import co_realise_cubes
+
+
+class ArrayAccessCounter(object):
+    def __init__(self, array):
+        self.dtype = array.dtype
+        self.shape = array.shape
+        self._array = array
+        self.access_count = 0
+
+    def __getitem__(self, keys):
+        self.access_count += 1
+        return self._array[keys]
+
+
+class Test_co_realise_cubes(tests.IrisTest):
+    def test_empty(self):
+        # Ensure that 'no args' case does not raise an error.
+        co_realise_cubes()
+
+    def test_basic(self):
+        real_data = np.arange(3.)
+        cube = Cube(as_lazy_data(real_data))
+        co_realise_cubes(cube)
+        self.assertFalse(cube.has_lazy_data())
+        self.assertArrayAllClose(cube.core_data(), real_data)
+
+    def test_multi(self):
+        real_data = np.arange(3.)
+        cube_base = Cube(as_lazy_data(real_data))
+        cube_inner = cube_base + 1
+        result_a = cube_base + 1
+        result_b = cube_inner + 1
+        co_realise_cubes(result_a, result_b)
+        # Check that target cubes were realised.
+        self.assertFalse(result_a.has_lazy_data())
+        self.assertFalse(result_b.has_lazy_data())
+        # Check that other cubes referenced remain lazy.
+        self.assertTrue(cube_base.has_lazy_data())
+        self.assertTrue(cube_inner.has_lazy_data())
+
+    def test_combined_access(self):
+        wrapped_array = ArrayAccessCounter(np.arange(3.))
+        lazy_array = as_lazy_data(wrapped_array)
+        derived_a = lazy_array + 1
+        derived_b = lazy_array + 2
+        cube_a = Cube(derived_a)
+        cube_b = Cube(derived_b)
+        co_realise_cubes(cube_a, cube_b)
+        # Though used twice, the source data should only get fetched once.
+        self.assertEqual(wrapped_array.access_count, 1)
+
+
+if __name__ == '__main__':
+    tests.main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+* Added new function :func:`iris.co_realise_cubes` to compute multiple lazy
	`2`	`+ values in a single operation, avoiding repeated re-loading of data or`
	`3`	`+ re-calculation of expressions.`