ESMValGroup · mattiarighi · Feb 10, 2020 · Nov 1, 2019 · Dec 24, 2019 · Dec 31, 2019
diff --git a/doc/esmvalcore/fixing_data.rst b/doc/esmvalcore/fixing_data.rst
@@ -16,6 +16,15 @@ coordinate bounds like ''lat_bnds'') or problems with the actual data
 The ESMValTool can apply on the fly fixes to data sets that have
 known errors that can be fixed automatically.
 
+.. note::
+  **CMORization as a fix**. As of early 2020, we've started implementing CMORization as fixes for
+  observational datasets. Previously, CMORization was an additional function implemented in ESMValTool.
+  This meant that users always had to store two copies of their observational data: one raw and one CMORized.
+  Implementing CMORization as a fix removes this redundancy, as the fixes are applied 'on the fly' when
+  running a recipe. **ERA5** is the first dataset for which this 'CMORization on the fly' is supported.
+  For more information about CMORization, see:
+  `Contributing a CMORizing script for an observational dataset <https://esmvaltool.readthedocs.io/en/latest/esmvaldiag/observations.html>`_.
+
 Fix structure
 =============
 
@@ -44,7 +53,7 @@ Check the output
 Next to the error message, you should see some info about the iris cube: size,
 coordinates. In our example it looks like this:
 
-.. code-block::
+.. code-block:: python
 
     air_temperature/ (K) (time: 312; altitude: 90; longitude: 180)
         Dimension coordinates:
@@ -202,16 +211,28 @@ with the actual data units.
 
 .. code-block:: python
 
-    ...
-        def fix_metadata(self, cubes):
-            ...
-            cube.units = 'real_units'
-            ...
+    def fix_metadata(self, cubes):
+        cube.units = 'real_units'
+
 
 Detecting this error can be tricky if the units are similar enough. It also
 has a good chance of going undetected until you notice strange results in
 your diagnostic.
 
+In cases like the one above, it can be useful to access the variable
+definition and associated coordinate definitions provided by the CMOR table.
+For example:
+
+.. code-block:: python
+
+    def fix_metadata(self, cubes):
+        cube.units = self.vardef.units
+
+To learn more about what is available in these definitions, see:
+:class:`esmvalcore.cmor.table.VariableInfo` and
+:class:`esmvalcore.cmor.table.CoordinateInfo`.
+
+
 
 Coordinates missing
 -------------------

diff --git a/esmvalcore/_data_finder.py b/esmvalcore/_data_finder.py
@@ -5,10 +5,11 @@
 # Mattia Righi (DLR, Germany - mattia.righi@dlr.de)
 
 import fnmatch
+import glob
 import logging
 import os
 import re
-import glob
+from pathlib import Path
 
 import iris
 
@@ -32,56 +33,30 @@ def find_files(dirnames, filenames):
 
 
 def get_start_end_year(filename):
-    """Get the start and end year from a file name.
-
-    This works for filenames matching
-
-    *[-,_]YYYY*[-,_]YYYY*.*
-      or
-    *[-,_]YYYY*.*
-      or
-    YYYY*[-,_]*.*
-      or
-    YYYY*[-,_]YYYY*[-,_]*.*
-      or
-    YYYY*[-,_]*[-,_]YYYY*.* (Does this make sense? Is this worth catching?)
-    """
-    name = os.path.splitext(filename)[0]
-
-    name = name.split(os.sep)[-1]
-    filename_list = [elem.split('-') for elem in name.split('_')]
-    filename_list = [elem for sublist in filename_list for elem in sublist]
-
-    pos_ydates = [elem.isdigit() and len(elem) >= 4 for elem in filename_list]
-    pos_ydates_l = list(pos_ydates)
-    pos_ydates_r = list(pos_ydates)
-
-    for ind, _ in enumerate(pos_ydates_l):
-        if ind != 0:
-            pos_ydates_l[ind] = (pos_ydates_l[ind - 1] and pos_ydates_l[ind])
-
-    for ind, _ in enumerate(pos_ydates_r):
-        if ind != 0:
-            pos_ydates_r[-ind - 1] = (pos_ydates_r[-ind]
-                                      and pos_ydates_r[-ind - 1])
-
-    dates = [
-        filename_list[ind] for ind, _ in enumerate(pos_ydates)
-        if pos_ydates_r[ind] or pos_ydates_l[ind]
-    ]
-    start_year = None
-    end_year = None
-    if len(dates) == 1:
-        start_year = int(dates[0][:4])
-        end_year = start_year
-    elif len(dates) == 2:
-        start_year, end_year = int(dates[0][:4]), int(dates[1][:4])
+    """Get the start and end year from a file name."""
+    stem = Path(filename).stem
+    start_year = end_year = None
+
+    # First check for a block of two potential dates separated by _ or -
+    daterange = re.findall(r'([0-9]{4,12}[-_][0-9]{4,12})', stem)
+    if daterange:
+        start_date, end_date = re.findall(r'([0-9]{4,12})', daterange[0])
+        start_year = start_date[:4]
+        end_year = end_date[:4]
     else:
-        # Slower than just parsing the name
-        try:
-            cubes = iris.load(filename)
-        except OSError:
-            raise ValueError('File {0} can not be read'.format(filename))
+        # Check for single dates in the filename
+        dates = re.findall(r'([0-9]{4,12})', stem)
+        if len(dates) == 1:
+            start_year = end_year = dates[0][:4]
+        elif len(dates) > 1:
+            # Check for dates at start or end of filename
+            outerdates = re.findall(r'^[0-9]{4,12}|[0-9]{4,12}$', stem)
+            if len(outerdates) == 1:
+                start_year = end_year = outerdates[0][:4]
+
+    # As final resort, try to get the dates from the file contents
+    if start_year is None or end_year is None:
+        cubes = iris.load(filename)
 
         for cube in cubes:
             logger.debug(cube)
@@ -94,11 +69,11 @@ def get_start_end_year(filename):
             break
 
     if start_year is None or end_year is None:
-        raise ValueError(
-            'File {0} dates do not match a recognized pattern and time can '
-            'not be read from the file'.format(filename)
-        )
-    return start_year, end_year
+        raise ValueError(f'File {filename} dates do not match a recognized '
+                         'pattern and time can not be read from the file')
+
+    logger.debug("Found start_year %s and end_year %s", start_year, end_year)
+    return int(start_year), int(end_year)
 
 
 def select_files(filenames, start_year, end_year):

diff --git a/esmvalcore/_recipe.py b/esmvalcore/_recipe.py
@@ -88,40 +88,26 @@ def _get_value(key, datasets):
     return value
 
 
-def _update_from_others(variable, keys, datasets):
-    """Get values for keys by copying from the other datasets."""
-    for key in keys:
-        if key not in variable:
-            value = _get_value(key, datasets)
-            if value is not None:
-                variable[key] = value
-
-
 def _add_cmor_info(variable, override=False):
     """Add information from CMOR tables to variable."""
     logger.debug("If not present: adding keys from CMOR table to %s", variable)
-
-    if 'cmor_table' not in variable or 'mip' not in variable:
-        logger.debug("Skipping because cmor_table or mip not specified")
-        return
-
-    if variable['cmor_table'] not in CMOR_TABLES:
-        logger.warning("Unknown CMOR table %s", variable['cmor_table'])
-
-    derive = variable.get('derive', False)
     # Copy the following keys from CMOR table
     cmor_keys = [
         'standard_name', 'long_name', 'units', 'modeling_realm', 'frequency'
     ]
-    cmor_table = variable['cmor_table']
+    project = variable['project']
     mip = variable['mip']
     short_name = variable['short_name']
-    table_entry = CMOR_TABLES[cmor_table].get_variable(mip, short_name, derive)
-
+    derive = variable.get('derive', False)
+    table = CMOR_TABLES.get(project)
+    if table:
+        table_entry = table.get_variable(mip, short_name, derive)
+    else:
+        table_entry = None
     if table_entry is None:
         raise RecipeError(
-            "Unable to load CMOR table '{}' for variable '{}' with mip '{}'".
-            format(cmor_table, short_name, mip))
+            f"Unable to load CMOR table (project) '{project}' for variable "
+            f"'{short_name}' with mip '{mip}'")
 
     for key in cmor_keys:
         if key not in variable or override:
@@ -180,12 +166,17 @@ def _update_target_levels(variable, variables, settings, config_user):
             del settings['extract_levels']
         else:
             variable_data = _get_dataset_info(dataset, variables)
-            filename = \
-                _dataset_to_file(variable_data, config_user)
+            filename = _dataset_to_file(variable_data, config_user)
             settings['extract_levels']['levels'] = get_reference_levels(
-                filename, variable_data['project'], dataset,
-                variable_data['short_name'],
-                os.path.splitext(variable_data['filename'])[0] + '_fixed')
+                filename=filename,
+                project=variable_data['project'],
+                dataset=dataset,
+                short_name=variable_data['short_name'],
+                mip=variable_data['mip'],
+                frequency=variable_data['frequency'],
+                fix_dir=os.path.splitext(
+                    variable_data['filename'])[0] + '_fixed',
+            )
 
 
 def _update_target_grid(variable, variables, settings, config_user):
@@ -300,19 +291,16 @@ def _get_default_settings(variable, config_user, derive=False):
         'project': variable['project'],
         'dataset': variable['dataset'],
         'short_name': variable['short_name'],
+        'mip': variable['mip'],
     }
     # File fixes
     fix_dir = os.path.splitext(variable['filename'])[0] + '_fixed'
     settings['fix_file'] = dict(fix)
     settings['fix_file']['output_dir'] = fix_dir
     # Cube fixes
-    # Only supply mip if the CMOR check fixes are implemented.
-    if variable.get('cmor_table'):
-        fix['cmor_table'] = variable['cmor_table']
-        fix['mip'] = variable['mip']
-        fix['frequency'] = variable['frequency']
-    settings['fix_data'] = dict(fix)
+    fix['frequency'] = variable['frequency']
     settings['fix_metadata'] = dict(fix)
+    settings['fix_data'] = dict(fix)
 
     # Configure time extraction
     if 'start_year' in variable and 'end_year' in variable \
@@ -335,21 +323,19 @@ def _get_default_settings(variable, config_user, derive=False):
         }
 
     # Configure CMOR metadata check
-    if variable.get('cmor_table'):
-        settings['cmor_check_metadata'] = {
-            'cmor_table': variable['cmor_table'],
-            'mip': variable['mip'],
-            'short_name': variable['short_name'],
-            'frequency': variable['frequency'],
-        }
+    settings['cmor_check_metadata'] = {
+        'cmor_table': variable['project'],
+        'mip': variable['mip'],
+        'short_name': variable['short_name'],
+        'frequency': variable['frequency'],
+    }
     # Configure final CMOR data check
-    if variable.get('cmor_table'):
-        settings['cmor_check_data'] = {
-            'cmor_table': variable['cmor_table'],
-            'mip': variable['mip'],
-            'short_name': variable['short_name'],
-            'frequency': variable['frequency'],
-        }
+    settings['cmor_check_data'] = {
+        'cmor_table': variable['project'],
+        'mip': variable['mip'],
+        'short_name': variable['short_name'],
+        'frequency': variable['frequency'],
+    }
 
     # Clean up fixed files
     if not config_user['save_intermediary_cubes']:
@@ -1029,9 +1015,6 @@ def _initialize_variables(self, raw_variable, raw_datasets):
             variable.update(dataset)
 
             variable['recipe_dataset_index'] = index
-            if ('cmor_table' not in variable
-                    and variable.get('project') in CMOR_TABLES):
-                variable['cmor_table'] = variable['project']
             if 'end_year' in variable and 'max_years' in self._cfg:
                 variable['end_year'] = min(
                     variable['end_year'],
@@ -1049,7 +1032,6 @@ def _initialize_variables(self, raw_variable, raw_datasets):
         if 'fx' not in raw_variable.get('mip', ''):
             required_keys.update({'start_year', 'end_year'})
         for variable in variables:
-            _update_from_others(variable, ['cmor_table', 'mip'], datasets)
             if 'institute' not in variable:
                 institute = get_institutes(variable)
                 if institute:

diff --git a/esmvalcore/cmor/_fixes/cmip5/gfdl_esm2g.py b/esmvalcore/cmor/_fixes/cmip5/gfdl_esm2g.py
@@ -121,7 +121,8 @@ def fix_metadata(self, cubes):
         -------
         iris.cube.CubeList
         """
-        cubes[0].standard_name = 'sea_ice_x_velocity'
+        cube = self.get_cube_from_list(cubes)
+        cube.standard_name = 'sea_ice_x_velocity'
         return cubes
 
 
@@ -141,5 +142,6 @@ def fix_metadata(self, cubes):
         -------
         iris.cube.CubeList
         """
-        cubes[0].standard_name = 'sea_ice_y_velocity'
+        cube = self.get_cube_from_list(cubes)
+        cube.standard_name = 'sea_ice_y_velocity'
         return cubes