From c3cceb8fd1229928de3dd16d6738b70d3bfd8fa8 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 3 Nov 2020 15:18:25 +0100
Subject: [PATCH 01/18] Update .gitignore

---
 .gitignore | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 6e78646..e64e4c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,89 @@
-# ignore these files
+# Distribution / packaging
+.Python
+build/
+c
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+env/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+*.orig
+*.tmp
+MANIFEST
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Created by editors
+*~
+\#*
+\.\#*
+*.swp
+
+# Created by PyCharm
+.idea/
+
+# eclipse/pydev
+.project
+.pydevproject
+.settings
+
+# Created by VSCode
+.vscode
+
+#pytest
+.cache
+.pytest_cache
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+*.tmp
+*.orig
+/c
+/tests/data/**
+test-reports/
+/test_bash.sh
+/python_test_out.txt
+
+# Build folder
+doc/sphinx/build
 
 # esgf-pyclient cache
 *.sqlite

From 4f634167696ea235db0f4038dfff70a422658c28 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 3 Nov 2020 15:18:49 +0100
Subject: [PATCH 02/18] Add basic setup.py

---
 setup.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 setup.py

diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a97fa7e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,53 @@
+import os
+
+from setuptools import setup
+
+with open('README.md') as readme_file:
+    readme = readme_file.read()
+
+setup(
+    name='ESMValTool sample data',
+    version='0.0.1',
+    description="ESMValTool sample data",
+    long_description=readme + '\n\n',
+    author="",
+    author_email='',
+    url='https://github.com/ESMValGroup/ESMValTool_sample_data',
+    packages=[
+        'esmvaltool_sample_data',
+    ],
+    include_package_data=True,
+    license="",
+    zip_safe=False,
+    keywords='ESMValTool',
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: Apache Software License',
+        'Natural Language :: English',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+    ],
+    test_suite='tests',
+    install_requires=[
+        'cube-helper',
+        'scitools-iris>=2.2',
+    ],
+    # tests_require=[
+    #     'pytest',
+    #     'pytest-cov',
+    #     'pycodestyle',
+    # ],
+    extras_require={
+        'develop':  [
+            'codespell',
+            'docformatter',
+            'isort',
+            'pre-commit',
+            'prospector[with_pyroma]!=1.1.6.3,!=1.1.6.4',
+            'yamllint',
+            'yapf',
+        ],
+    },
+)

From 136e1b1c48205432729b98262ee54060193717d7 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 3 Nov 2020 15:56:18 +0100
Subject: [PATCH 03/18] Add basic functionality to load datasets

---
 esmvaltool_sample_data/__init__.py |  1 +
 esmvaltool_sample_data/loader.py   | 53 ++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 esmvaltool_sample_data/__init__.py
 create mode 100644 esmvaltool_sample_data/loader.py

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
new file mode 100644
index 0000000..a9ab7e8
--- /dev/null
+++ b/esmvaltool_sample_data/__init__.py
@@ -0,0 +1 @@
+from .loader import load_map_data, load_profile_data, load_timeseries_data
diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
new file mode 100644
index 0000000..4b236db
--- /dev/null
+++ b/esmvaltool_sample_data/loader.py
@@ -0,0 +1,53 @@
+from pathlib import Path
+
+import cube_helper
+
+base_dir = Path(__file__).parent
+
+
+def load_timeseries_data():
+    """
+    ta / Amon / historical / r1i1p1f1, any grid, 1850 - onwards, all dimensions reduced to a few steps except for the time dimension
+    some other variable / ocean, probably a different frequency, similar number of timesteps, other dimensions reduced
+    """
+
+    timeseries_dir = base_dir / 'data' / 'timeseries'
+
+    data_dirs = [
+        'CMIP6.CMIP.CAMS.CAMS-CSM1-0.historical.r1i1p1f1.Amon.ta.gn.v20190708',
+        'CMIP6.CMIP.CCCR-IITM.IITM-ESM.historical.r1i1p1f1.Amon.ta.gn.v20191226',
+        'CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r1i1p1f1.Amon.ta.gn.v20191108',
+        'CMIP6.CMIP.E3SM-Project.E3SM-1-1.historical.r1i1p1f1.Amon.ta.gr.v20191211',
+        'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
+        'CMIP6.CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.r1i1p1f1.Amon.ta.gn.v20190627',
+        'CMIP6.CMIP.INM.INM-CM4-8.historical.r1i1p1f1.Amon.ta.gr1.v20190605',
+        'CMIP6.CMIP.INM.INM-CM5-0.historical.r1i1p1f1.Amon.ta.gr1.v20190610',
+        'CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.Amon.ta.gr.v20180803',
+        'CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.ta.gn.v20190710',
+        'CMIP6.CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Amon.ta.gn.v20190710',
+        'CMIP6.CMIP.NOAA-GFDL.GFDL-CM4.historical.r1i1p1f1.Amon.ta.gr1.v20180701',
+        'CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Amon.ta.gr1.v20190726',
+    ]
+
+    input_dirs = [timeseries_dir / data_dir for data_dir in data_dirs]
+
+    cubelists = []
+
+    for input_dir in input_dirs:
+        print(input_dir)
+        cubelist = cube_helper.load(str(input_dir), filetype='.nc')
+        cubelists.append(cubelist)
+
+    return cubelists
+
+
+def load_map_data():
+    """a 4D atmospheric variable, all dimensions reduced to a few steps except
+    the horizontal dimension(s) same for an ocean variable."""
+    raise NotImplementedError
+
+
+def load_profile_data():
+    """a 4D atmospheric variable, all dimensions reduced to a few steps except
+    the horizontal dimension(s) same for an ocean variable."""
+    raise NotImplementedError

From f3c9a30fabcd277eac77cbe45e6e06e673396cba Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Wed, 4 Nov 2020 10:01:09 +0100
Subject: [PATCH 04/18] Set cube helper logging level to ERROR to hide warnings

i.e. `tracking_id, history and creation_date attributes inconsistent`
---
 esmvaltool_sample_data/loader.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 4b236db..c3597e1 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -2,6 +2,8 @@
 
 import cube_helper
 
+cube_helper.logger.muffle_logger()
+
 base_dir = Path(__file__).parent
 
 
@@ -51,3 +53,9 @@ def load_profile_data():
     """a 4D atmospheric variable, all dimensions reduced to a few steps except
     the horizontal dimension(s) same for an ocean variable."""
     raise NotImplementedError
+
+
+if __name__ == '__main__':
+    cube_helper.logger.reset_logger()
+    ts = load_timeseries_data()
+    breakpoint()

From 294833ea2f061bf265b071cbba44e47dacceb812 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Wed, 4 Nov 2020 12:18:40 +0100
Subject: [PATCH 05/18] Flag problematic dataset and move it to end of list

---
 esmvaltool_sample_data/loader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index c3597e1..2eddfd4 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -20,7 +20,6 @@ def load_timeseries_data():
         'CMIP6.CMIP.CCCR-IITM.IITM-ESM.historical.r1i1p1f1.Amon.ta.gn.v20191226',
         'CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r1i1p1f1.Amon.ta.gn.v20191108',
         'CMIP6.CMIP.E3SM-Project.E3SM-1-1.historical.r1i1p1f1.Amon.ta.gr.v20191211',
-        'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
         'CMIP6.CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.r1i1p1f1.Amon.ta.gn.v20190627',
         'CMIP6.CMIP.INM.INM-CM4-8.historical.r1i1p1f1.Amon.ta.gr1.v20190605',
         'CMIP6.CMIP.INM.INM-CM5-0.historical.r1i1p1f1.Amon.ta.gr1.v20190610',
@@ -29,6 +28,10 @@ def load_timeseries_data():
         'CMIP6.CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Amon.ta.gn.v20190710',
         'CMIP6.CMIP.NOAA-GFDL.GFDL-CM4.historical.r1i1p1f1.Amon.ta.gr1.v20180701',
         'CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Amon.ta.gr1.v20190726',
+
+        # BUG: next dataset is problematic
+        # raises ValueError: Cube 'air_temperature' must contain a single 1D y coordinate.
+        'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
     ]
 
     input_dirs = [timeseries_dir / data_dir for data_dir in data_dirs]

From 869dc5b8c2fc35b4f27742c2948ff3aa0e833aa1 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Thu, 5 Nov 2020 10:56:33 +0100
Subject: [PATCH 06/18] Speed up data loading and lose the cube-helper
 dependency

---
 esmvaltool_sample_data/loader.py | 55 +++++++++++++++++++++++---------
 setup.py                         |  3 +-
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 2eddfd4..7fa257d 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -1,16 +1,46 @@
 from pathlib import Path
 
-import cube_helper
-
-cube_helper.logger.muffle_logger()
+import cf_units
+import iris
 
 base_dir = Path(__file__).parent
 
 
+def strip_attributes(cube):
+    """Remove attributes that cause issues with merging and concatenation."""
+    for attr in ['creation_date', 'tracking_id', 'history']:
+        if attr in cube.attributes:
+            cube.attributes.pop(attr)
+
+
+def simplify_time(cube):
+    coord = cube.coord('time')
+    coord.convert_units(
+        cf_units.Unit('days since 1850-1-1 00:00:00',
+                      calendar=coord.units.calendar))
+
+
+def load_cubes_from_input_dirs(input_dirs):
+    """Loads all *.nc files from each input dir into a cube."""
+    for input_dir in input_dirs:
+        files = input_dir.glob('*.nc')
+        cubes = iris.load(str(file) for file in files)
+        for cube in cubes:
+            strip_attributes(cube)
+            simplify_time(cube)
+
+        cubes = cubes.concatenate()
+        cube = cubes[0]
+
+        yield cube
+
+
 def load_timeseries_data():
     """
-    ta / Amon / historical / r1i1p1f1, any grid, 1850 - onwards, all dimensions reduced to a few steps except for the time dimension
-    some other variable / ocean, probably a different frequency, similar number of timesteps, other dimensions reduced
+    Data: ta / Amon / historical / r1i1p1f1, any grid, 1850 - onwards.
+    All dimensions reduced to a few steps except for the time dimension
+    Some other variable / ocean, probably a different frequency,
+       similar number of timesteps, other dimensions reduced.
     """
 
     timeseries_dir = base_dir / 'data' / 'timeseries'
@@ -30,20 +60,16 @@ def load_timeseries_data():
         'CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Amon.ta.gr1.v20190726',
 
         # BUG: next dataset is problematic
-        # raises ValueError: Cube 'air_temperature' must contain a single 1D y coordinate.
-        'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
+        # raises ValueError: Cube 'air_temperature' must contain
+        #     a single 1D y coordinate.
+        # 'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
     ]
 
     input_dirs = [timeseries_dir / data_dir for data_dir in data_dirs]
 
-    cubelists = []
-
-    for input_dir in input_dirs:
-        print(input_dir)
-        cubelist = cube_helper.load(str(input_dir), filetype='.nc')
-        cubelists.append(cubelist)
+    cubes = load_cubes_from_input_dirs(input_dirs)
 
-    return cubelists
+    return list(cubes)
 
 
 def load_map_data():
@@ -59,6 +85,5 @@ def load_profile_data():
 
 
 if __name__ == '__main__':
-    cube_helper.logger.reset_logger()
     ts = load_timeseries_data()
     breakpoint()
diff --git a/setup.py b/setup.py
index a97fa7e..e736f5f 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,6 @@
     ],
     test_suite='tests',
     install_requires=[
-        'cube-helper',
         'scitools-iris>=2.2',
     ],
     # tests_require=[
@@ -40,7 +39,7 @@
     #     'pycodestyle',
     # ],
     extras_require={
-        'develop':  [
+        'develop': [
             'codespell',
             'docformatter',
             'isort',

From 24bb39a94c23e784053056f9a0f672af8d38cf98 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Thu, 5 Nov 2020 17:09:21 +0100
Subject: [PATCH 07/18] Rename data -> cubes

---
 esmvaltool_sample_data/__init__.py | 2 +-
 esmvaltool_sample_data/loader.py   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index a9ab7e8..cc8e762 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -1 +1 @@
-from .loader import load_map_data, load_profile_data, load_timeseries_data
+from .loader import load_map_cubes, load_profile_cubes, load_timeseries_cubes
diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 7fa257d..66db39c 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -35,7 +35,7 @@ def load_cubes_from_input_dirs(input_dirs):
         yield cube
 
 
-def load_timeseries_data():
+def load_timeseries_cubes():
     """
     Data: ta / Amon / historical / r1i1p1f1, any grid, 1850 - onwards.
     All dimensions reduced to a few steps except for the time dimension
@@ -72,18 +72,18 @@ def load_timeseries_data():
     return list(cubes)
 
 
-def load_map_data():
+def load_map_cubes():
     """a 4D atmospheric variable, all dimensions reduced to a few steps except
     the horizontal dimension(s) same for an ocean variable."""
     raise NotImplementedError
 
 
-def load_profile_data():
+def load_profile_cubes():
     """a 4D atmospheric variable, all dimensions reduced to a few steps except
     the horizontal dimension(s) same for an ocean variable."""
     raise NotImplementedError
 
 
 if __name__ == '__main__':
-    ts = load_timeseries_data()
+    ts = load_timeseries_cubes()
     breakpoint()

From 29885ec4b183607c50e4c60ecbb5e57534f0e62a Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Fri, 6 Nov 2020 11:11:47 +0100
Subject: [PATCH 08/18] Add package data

---
 MANIFEST.in |  1 +
 setup.py    | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)
 create mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9197b5f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include esmvaltool_sample_data/data/ *.nc
diff --git a/setup.py b/setup.py
index e736f5f..8ae9475 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,12 @@
-import os
-
 from setuptools import setup
 
 with open('README.md') as readme_file:
     readme = readme_file.read()
 
+PACKAGES = [
+    'esmvaltool_sample_data',
+]
+
 setup(
     name='ESMValTool sample data',
     version='0.0.1',
@@ -13,9 +15,7 @@
     author="",
     author_email='',
     url='https://github.com/ESMValGroup/ESMValTool_sample_data',
-    packages=[
-        'esmvaltool_sample_data',
-    ],
+    packages=PACKAGES,
     include_package_data=True,
     license="",
     zip_safe=False,

From 26f3370a42ca8136ae06e866773de468141676e0 Mon Sep 17 00:00:00 2001
From: Stef Smeets <stefsmeets@users.noreply.github.com>
Date: Mon, 9 Nov 2020 12:09:35 +0000
Subject: [PATCH 09/18] Update esmvaltool_sample_data/loader.py

Co-authored-by: Bouwe Andela <b.andela@esciencecenter.nl>
---
 esmvaltool_sample_data/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 66db39c..ade4100 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -80,7 +80,7 @@ def load_map_cubes():
 
 def load_profile_cubes():
     """a 4D atmospheric variable, all dimensions reduced to a few steps except
-    the horizontal dimension(s) same for an ocean variable."""
+    the vertical dimension(s) same for an ocean variable."""
     raise NotImplementedError
 
 

From 1316756cc15879b2670ede8ae8590cb4d3ee632e Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 10 Nov 2020 11:43:54 +0100
Subject: [PATCH 10/18] Add developer dependencies

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index 8ae9475..2e95a67 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,9 @@
         'develop': [
             'codespell',
             'docformatter',
+            'esgf-pyclient',
             'isort',
+            'myproxyclient',
             'pre-commit',
             'prospector[with_pyroma]!=1.1.6.3,!=1.1.6.4',
             'yamllint',

From e46ae61f1f8e2d893cf1f50b1ef15a9339aa98fc Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 10 Nov 2020 11:44:43 +0100
Subject: [PATCH 11/18] Address review comments

---
 esmvaltool_sample_data/loader.py | 42 +++++++++++++-------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index ade4100..0300997 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -5,6 +5,14 @@
 
 base_dir = Path(__file__).parent
 
+problematic = [
+    # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
+    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
+    # UserWarning: Gracefully filling 'lat' dimension coordinate masked points
+    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+]
+
 
 def strip_attributes(cube):
     """Remove attributes that cause issues with merging and concatenation."""
@@ -23,21 +31,24 @@ def simplify_time(cube):
 def load_cubes_from_input_dirs(input_dirs):
     """Loads all *.nc files from each input dir into a cube."""
     for input_dir in input_dirs:
+        if str(input_dir) in problematic:
+            # print('Skipping', input_dir)
+            continue
+        # print(input_dir)
         files = input_dir.glob('*.nc')
         cubes = iris.load(str(file) for file in files)
         for cube in cubes:
             strip_attributes(cube)
             simplify_time(cube)
 
-        cubes = cubes.concatenate()
-        cube = cubes[0]
+        cube = cubes.concatenate_cube()
 
         yield cube
 
 
-def load_timeseries_cubes():
+def load_timeseries_cubes(mip_table='Amon'):
     """
-    Data: ta / Amon / historical / r1i1p1f1, any grid, 1850 - onwards.
+    Data: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
     All dimensions reduced to a few steps except for the time dimension
     Some other variable / ocean, probably a different frequency,
        similar number of timesteps, other dimensions reduced.
@@ -45,27 +56,8 @@ def load_timeseries_cubes():
 
     timeseries_dir = base_dir / 'data' / 'timeseries'
 
-    data_dirs = [
-        'CMIP6.CMIP.CAMS.CAMS-CSM1-0.historical.r1i1p1f1.Amon.ta.gn.v20190708',
-        'CMIP6.CMIP.CCCR-IITM.IITM-ESM.historical.r1i1p1f1.Amon.ta.gn.v20191226',
-        'CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r1i1p1f1.Amon.ta.gn.v20191108',
-        'CMIP6.CMIP.E3SM-Project.E3SM-1-1.historical.r1i1p1f1.Amon.ta.gr.v20191211',
-        'CMIP6.CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.r1i1p1f1.Amon.ta.gn.v20190627',
-        'CMIP6.CMIP.INM.INM-CM4-8.historical.r1i1p1f1.Amon.ta.gr1.v20190605',
-        'CMIP6.CMIP.INM.INM-CM5-0.historical.r1i1p1f1.Amon.ta.gr1.v20190610',
-        'CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.Amon.ta.gr.v20180803',
-        'CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.Amon.ta.gn.v20190710',
-        'CMIP6.CMIP.MPI-M.MPI-ESM1-2-LR.historical.r1i1p1f1.Amon.ta.gn.v20190710',
-        'CMIP6.CMIP.NOAA-GFDL.GFDL-CM4.historical.r1i1p1f1.Amon.ta.gr1.v20180701',
-        'CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.Amon.ta.gr1.v20190726',
-
-        # BUG: next dataset is problematic
-        # raises ValueError: Cube 'air_temperature' must contain
-        #     a single 1D y coordinate.
-        # 'CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204',
-    ]
-
-    input_dirs = [timeseries_dir / data_dir for data_dir in data_dirs]
+    paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
+    input_dirs = list(set(path.parent for path in paths))
 
     cubes = load_cubes_from_input_dirs(input_dirs)
 

From 042dd66f12700d190004a873311110ad756a0b57 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 10 Nov 2020 11:50:17 +0100
Subject: [PATCH 12/18] Update doc strings and add annotations

---
 esmvaltool_sample_data/loader.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 0300997..8baf682 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -14,22 +14,24 @@
 ]
 
 
-def strip_attributes(cube):
-    """Remove attributes that cause issues with merging and concatenation."""
+def strip_attributes(cube: 'iris.Cube') -> None:
+    """Remove attributes in-place that cause issues with merging and
+    concatenation."""
     for attr in ['creation_date', 'tracking_id', 'history']:
         if attr in cube.attributes:
             cube.attributes.pop(attr)
 
 
-def simplify_time(cube):
+def simplify_time(cube: 'iris.Cube') -> None:
+    """Simplifies the time coordinate in-place."""
     coord = cube.coord('time')
     coord.convert_units(
         cf_units.Unit('days since 1850-1-1 00:00:00',
                       calendar=coord.units.calendar))
 
 
-def load_cubes_from_input_dirs(input_dirs):
-    """Loads all *.nc files from each input dir into a cube."""
+def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
+    """Generator that loads all *.nc files from each input dir into a cube."""
     for input_dir in input_dirs:
         if str(input_dir) in problematic:
             # print('Skipping', input_dir)
@@ -46,12 +48,20 @@ def load_cubes_from_input_dirs(input_dirs):
         yield cube
 
 
-def load_timeseries_cubes(mip_table='Amon'):
-    """
-    Data: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
-    All dimensions reduced to a few steps except for the time dimension
-    Some other variable / ocean, probably a different frequency,
-       similar number of timesteps, other dimensions reduced.
+def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
+    """Returns a list of iris cubes with timeseries data.
+
+    The data are: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
+    All dimensions were reduced to a few steps except for the time dimension.
+
+    Parameters
+    ----------
+    mip_table: str
+        select monthly (`Amon`) or daily (`day`) data.
+
+    Returns
+    -------
+    list of iris.cube
     """
 
     timeseries_dir = base_dir / 'data' / 'timeseries'

From 179416de65d7ce8ec83fb6cfab12542f3dd2826d Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 10 Nov 2020 13:34:36 +0100
Subject: [PATCH 13/18] Select subset of data

---
 esmvaltool_sample_data/loader.py | 41 ++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
index 8baf682..df24f94 100644
--- a/esmvaltool_sample_data/loader.py
+++ b/esmvaltool_sample_data/loader.py
@@ -7,10 +7,20 @@
 
 problematic = [
     # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
-    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
+    'data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
     # UserWarning: Gracefully filling 'lat' dimension coordinate masked points
-    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
-    'esmvaltool_sample_data/data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+]
+
+whitelist = [
+    # (780, 2, 2, 2) 365_day
+    'data/timeseries/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Amon/ta/gr/v20190927',
+    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r1i1p1f1/Amon/ta/gr/v20191220',
+    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/ta/gr/v20200624',
+    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1/historical/r1i1p1f1/Amon/ta/gr/v20191211',
+    'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Amon/ta/gr1/v20180701',
+    'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/ta/gr1/v20190726',
 ]
 
 
@@ -33,10 +43,6 @@ def simplify_time(cube: 'iris.Cube') -> None:
 def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
     """Generator that loads all *.nc files from each input dir into a cube."""
     for input_dir in input_dirs:
-        if str(input_dir) in problematic:
-            # print('Skipping', input_dir)
-            continue
-        # print(input_dir)
         files = input_dir.glob('*.nc')
         cubes = iris.load(str(file) for file in files)
         for cube in cubes:
@@ -45,9 +51,27 @@ def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
 
         cube = cubes.concatenate_cube()
 
+        # print(cube.shape, cube.coord('time').units.calendar, input_dir)
+
         yield cube
 
 
+def filter_problematic(dirs):
+    base = Path(__file__).parent
+    for drc in dirs:
+        relative_dir = drc.relative_to(base)
+        if str(relative_dir) not in problematic:
+            yield drc
+
+
+def get_subset(dirs):
+    base = Path(__file__).parent
+    for drc in dirs:
+        relative_dir = drc.relative_to(base)
+        if str(relative_dir) in whitelist:
+            yield drc
+
+
 def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
     """Returns a list of iris cubes with timeseries data.
 
@@ -69,6 +93,9 @@ def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
     paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
     input_dirs = list(set(path.parent for path in paths))
 
+    input_dirs = filter_problematic(input_dirs)
+    input_dirs = get_subset(input_dirs)
+
     cubes = load_cubes_from_input_dirs(input_dirs)
 
     return list(cubes)

From 6de61547723f18fef3850d9863eecdae6d645491 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 10 Nov 2020 16:00:44 +0100
Subject: [PATCH 14/18] Add whitelists for specific subsets of data

---
 esmvaltool_sample_data/__init__.py | 126 ++++++++++++++++++++++++++++-
 esmvaltool_sample_data/loader.py   | 118 ---------------------------
 2 files changed, 125 insertions(+), 119 deletions(-)
 delete mode 100644 esmvaltool_sample_data/loader.py

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index cc8e762..c359921 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -1 +1,125 @@
-from .loader import load_map_cubes, load_profile_cubes, load_timeseries_cubes
+from pathlib import Path
+
+import cf_units
+import iris
+
+base_dir = Path(__file__).parent
+
+problematic = [
+    # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
+    'data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
+    # UserWarning: Gracefully filling 'lat' dimension coordinate masked points
+    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
+    'data/timeseries/CMIP6/CMIP/CSIRO-ARCCSS/ACCESS-CM2/historical/r1i1p1f1/day/ta/gn/v20191108',
+]
+
+whitelist = {
+    'Amon': (
+        # (780, 2, 2, 2) 365_day
+        'data/timeseries/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Amon/ta/gr/v20190927',
+        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r1i1p1f1/Amon/ta/gr/v20191220',
+        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/ta/gr/v20200624',
+        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1/historical/r1i1p1f1/Amon/ta/gr/v20191211',
+        'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Amon/ta/gr1/v20180701',
+        'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/ta/gr1/v20190726',
+    ),
+    'day': (
+        # (3650, 2, 3, 2) 365_day
+        'data/timeseries/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/day/ta/gn/v20200626',
+        'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r1i1p1f1/day/ta/gn/v20190227',
+        'data/timeseries/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/ta/gn/v20190308',
+        'data/timeseries/CMIP6/CMIP/NCC/NorESM2-MM/historical/r1i1p1f1/day/ta/gn/v20191108',
+        'data/timeseries/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r1i1p1f1/day/ta/gn/v20181015',
+    )
+}
+
+
+def strip_attributes(cube: 'iris.Cube') -> None:
+    """Remove attributes in-place that cause issues with merging and
+    concatenation."""
+    for attr in ['creation_date', 'tracking_id', 'history']:
+        if attr in cube.attributes:
+            cube.attributes.pop(attr)
+
+
+def simplify_time(cube: 'iris.Cube') -> None:
+    """Simplifies the time coordinate in-place."""
+    coord = cube.coord('time')
+    coord.convert_units(
+        cf_units.Unit('days since 1850-1-1 00:00:00',
+                      calendar=coord.units.calendar))
+
+
+def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
+    """Generator that loads all *.nc files from each input dir into a cube."""
+    for input_dir in input_dirs:
+        files = input_dir.glob('*.nc')
+        cubes = iris.load(str(file) for file in files)
+        for cube in cubes:
+            strip_attributes(cube)
+            simplify_time(cube)
+
+        cube = cubes.concatenate_cube()
+
+        # print(cube.shape, cube.coord('time').units.calendar, input_dir)
+
+        yield cube
+
+
+def get_subset(dirs, subset):
+    base = Path(__file__).parent
+    for drc in dirs:
+        relative_dir = drc.relative_to(base)
+        if str(relative_dir) in problematic:
+            continue
+        if str(relative_dir) in subset:
+            yield drc
+
+
+def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
+    """Returns a list of iris cubes with timeseries data.
+
+    The data are: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
+    All dimensions were reduced to a few steps except for the time dimension.
+
+    Parameters
+    ----------
+    mip_table: str
+        select monthly (`Amon`) or daily (`day`) data.
+
+    Returns
+    -------
+    list of iris.cube
+    """
+
+    timeseries_dir = base_dir / 'data' / 'timeseries'
+
+    paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
+    input_dirs = list(set(path.parent for path in paths))
+
+    subset = whitelist[mip_table]
+
+    input_dirs = get_subset(input_dirs, subset=subset)
+
+    cubes = load_cubes_from_input_dirs(input_dirs)
+
+    return list(cubes)
+
+
+def load_map_cubes():
+    """a 4D atmospheric variable, all dimensions reduced to a few steps except
+    the horizontal dimension(s) same for an ocean variable."""
+    raise NotImplementedError
+
+
+def load_profile_cubes():
+    """a 4D atmospheric variable, all dimensions reduced to a few steps except
+    the vertical dimension(s) same for an ocean variable."""
+    raise NotImplementedError
+
+
+if __name__ == '__main__':
+    ts_day = load_timeseries_cubes('day')
+    ts_amon = load_timeseries_cubes('Amon')
+    breakpoint()
diff --git a/esmvaltool_sample_data/loader.py b/esmvaltool_sample_data/loader.py
deleted file mode 100644
index df24f94..0000000
--- a/esmvaltool_sample_data/loader.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from pathlib import Path
-
-import cf_units
-import iris
-
-base_dir = Path(__file__).parent
-
-problematic = [
-    # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
-    'data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
-    # UserWarning: Gracefully filling 'lat' dimension coordinate masked points
-    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
-    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
-]
-
-whitelist = [
-    # (780, 2, 2, 2) 365_day
-    'data/timeseries/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Amon/ta/gr/v20190927',
-    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r1i1p1f1/Amon/ta/gr/v20191220',
-    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/ta/gr/v20200624',
-    'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1/historical/r1i1p1f1/Amon/ta/gr/v20191211',
-    'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Amon/ta/gr1/v20180701',
-    'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/ta/gr1/v20190726',
-]
-
-
-def strip_attributes(cube: 'iris.Cube') -> None:
-    """Remove attributes in-place that cause issues with merging and
-    concatenation."""
-    for attr in ['creation_date', 'tracking_id', 'history']:
-        if attr in cube.attributes:
-            cube.attributes.pop(attr)
-
-
-def simplify_time(cube: 'iris.Cube') -> None:
-    """Simplifies the time coordinate in-place."""
-    coord = cube.coord('time')
-    coord.convert_units(
-        cf_units.Unit('days since 1850-1-1 00:00:00',
-                      calendar=coord.units.calendar))
-
-
-def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
-    """Generator that loads all *.nc files from each input dir into a cube."""
-    for input_dir in input_dirs:
-        files = input_dir.glob('*.nc')
-        cubes = iris.load(str(file) for file in files)
-        for cube in cubes:
-            strip_attributes(cube)
-            simplify_time(cube)
-
-        cube = cubes.concatenate_cube()
-
-        # print(cube.shape, cube.coord('time').units.calendar, input_dir)
-
-        yield cube
-
-
-def filter_problematic(dirs):
-    base = Path(__file__).parent
-    for drc in dirs:
-        relative_dir = drc.relative_to(base)
-        if str(relative_dir) not in problematic:
-            yield drc
-
-
-def get_subset(dirs):
-    base = Path(__file__).parent
-    for drc in dirs:
-        relative_dir = drc.relative_to(base)
-        if str(relative_dir) in whitelist:
-            yield drc
-
-
-def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
-    """Returns a list of iris cubes with timeseries data.
-
-    The data are: ta / Amon / historical / r1i1p1f1, any grid, 1950 - onwards.
-    All dimensions were reduced to a few steps except for the time dimension.
-
-    Parameters
-    ----------
-    mip_table: str
-        select monthly (`Amon`) or daily (`day`) data.
-
-    Returns
-    -------
-    list of iris.cube
-    """
-
-    timeseries_dir = base_dir / 'data' / 'timeseries'
-
-    paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
-    input_dirs = list(set(path.parent for path in paths))
-
-    input_dirs = filter_problematic(input_dirs)
-    input_dirs = get_subset(input_dirs)
-
-    cubes = load_cubes_from_input_dirs(input_dirs)
-
-    return list(cubes)
-
-
-def load_map_cubes():
-    """a 4D atmospheric variable, all dimensions reduced to a few steps except
-    the horizontal dimension(s) same for an ocean variable."""
-    raise NotImplementedError
-
-
-def load_profile_cubes():
-    """a 4D atmospheric variable, all dimensions reduced to a few steps except
-    the vertical dimension(s) same for an ocean variable."""
-    raise NotImplementedError
-
-
-if __name__ == '__main__':
-    ts = load_timeseries_cubes()
-    breakpoint()

From 710989a13d16e5ce0d02064a315a972380c757ac Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 17 Nov 2020 15:43:42 +0100
Subject: [PATCH 15/18] Use ignore list to filter problematic datasets

---
 esmvaltool_sample_data/__init__.py  | 66 ++++++++++++-----------------
 esmvaltool_sample_data/datasets.yml |  2 +
 2 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index c359921..c57ee57 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -2,37 +2,20 @@
 
 import cf_units
 import iris
+import yaml
 
 base_dir = Path(__file__).parent
 
-problematic = [
-    # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
-    'data/timeseries/CMIP6/CMIP/NCC/NorCPM1/historical/r1i1p1f1/Amon/ta/gn/v20190914',
-    # UserWarning: Gracefully filling 'lat' dimension coordinate masked points
-    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
-    'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/historical/r1i1p1f1/Amon/ta/gn/v20191120',
-    'data/timeseries/CMIP6/CMIP/CSIRO-ARCCSS/ACCESS-CM2/historical/r1i1p1f1/day/ta/gn/v20191108',
-]
+VERBOSE = False
+
+with open(base_dir / 'datasets.yml', 'r') as f:
+    config = yaml.safe_load(f)
 
-whitelist = {
-    'Amon': (
-        # (780, 2, 2, 2) 365_day
-        'data/timeseries/CMIP6/CMIP/CAS/FGOALS-f3-L/historical/r1i1p1f1/Amon/ta/gr/v20190927',
-        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-0/historical/r1i1p1f1/Amon/ta/gr/v20191220',
-        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1-ECA/historical/r1i1p1f1/Amon/ta/gr/v20200624',
-        'data/timeseries/CMIP6/CMIP/E3SM-Project/E3SM-1-1/historical/r1i1p1f1/Amon/ta/gr/v20191211',
-        'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Amon/ta/gr1/v20180701',
-        'data/timeseries/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Amon/ta/gr1/v20190726',
-    ),
-    'day': (
-        # (3650, 2, 3, 2) 365_day
-        'data/timeseries/CMIP6/CMIP/AS-RCEC/TaiESM1/historical/r1i1p1f1/day/ta/gn/v20200626',
-        'data/timeseries/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r1i1p1f1/day/ta/gn/v20190227',
-        'data/timeseries/CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/day/ta/gn/v20190308',
-        'data/timeseries/CMIP6/CMIP/NCC/NorESM2-MM/historical/r1i1p1f1/day/ta/gn/v20191108',
-        'data/timeseries/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r1i1p1f1/day/ta/gn/v20181015',
-    )
-}
+ignore_list = [fn.replace('.', '/') for fn in config['ignore']]
+
+ignore_list += [
+    # Add paths to problematic data sets here or to `datasets.yml`
+]
 
 
 def strip_attributes(cube: 'iris.Cube') -> None:
@@ -53,7 +36,10 @@ def simplify_time(cube: 'iris.Cube') -> None:
 
 def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
     """Generator that loads all *.nc files from each input dir into a cube."""
-    for input_dir in input_dirs:
+    for i, input_dir in enumerate(sorted(input_dirs)):
+        if VERBOSE:
+            print(f'Loading #{i:02d}:', input_dir)
+
         files = input_dir.glob('*.nc')
         cubes = iris.load(str(file) for file in files)
         for cube in cubes:
@@ -62,19 +48,19 @@ def load_cubes_from_input_dirs(input_dirs: list) -> 'iris.Cube':
 
         cube = cubes.concatenate_cube()
 
-        # print(cube.shape, cube.coord('time').units.calendar, input_dir)
+        if VERBOSE:
+            print('           ', cube.shape, cube.coord('time').units.calendar)
 
         yield cube
 
 
-def get_subset(dirs, subset):
-    base = Path(__file__).parent
+def filter_ignored_datasets(dirs, root):
     for drc in dirs:
-        relative_dir = drc.relative_to(base)
-        if str(relative_dir) in problematic:
-            continue
-        if str(relative_dir) in subset:
+        test_drc = str(drc.relative_to(root))
+        if test_drc not in ignore_list:
             yield drc
+        elif VERBOSE:
+            print('Ignored:', test_drc)
 
 
 def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
@@ -98,9 +84,7 @@ def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
     paths = timeseries_dir.glob(f'**/{mip_table}/**/*.nc')
     input_dirs = list(set(path.parent for path in paths))
 
-    subset = whitelist[mip_table]
-
-    input_dirs = get_subset(input_dirs, subset=subset)
+    input_dirs = list(filter_ignored_datasets(input_dirs, timeseries_dir))
 
     cubes = load_cubes_from_input_dirs(input_dirs)
 
@@ -120,6 +104,12 @@ def load_profile_cubes():
 
 
 if __name__ == '__main__':
+    VERBOSE = True
+
+    print('Loading daily data')
     ts_day = load_timeseries_cubes('day')
+    print()
+    print('Loading monthly data')
     ts_amon = load_timeseries_cubes('Amon')
+
     breakpoint()
diff --git a/esmvaltool_sample_data/datasets.yml b/esmvaltool_sample_data/datasets.yml
index b3806a4..a3ca9c2 100644
--- a/esmvaltool_sample_data/datasets.yml
+++ b/esmvaltool_sample_data/datasets.yml
@@ -27,3 +27,5 @@ ignore:
   - CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-Veg-LR.historical.r1i1p1f1.Amon.ta.gr.v20200217
   # something wrong with lon coord
   - CMIP6.CMIP.UA.MCM-UA-1-0.historical.r1i1p1f1.Amon.ta.gn.v20190731
+  # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
+  - CMIP6.CMIP.NCC.NorCPM1.historical.r1i1p1f1.Amon.ta.gn.v20190914

From d478a8dbe5d4a9f5fbfc5ddcf6428e462d0b7d0a Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 17 Nov 2020 16:01:29 +0100
Subject: [PATCH 16/18] Ignore dataset that fails to regrid

---
 esmvaltool_sample_data/__init__.py  | 17 ++++++++++++-----
 esmvaltool_sample_data/datasets.yml |  2 ++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index c57ee57..d726b71 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -106,10 +106,17 @@ def load_profile_cubes():
 if __name__ == '__main__':
     VERBOSE = True
 
-    print('Loading daily data')
-    ts_day = load_timeseries_cubes('day')
-    print()
-    print('Loading monthly data')
-    ts_amon = load_timeseries_cubes('Amon')
+    for mip_table in (
+            'Amon',
+            'day',
+    ):
+        print()
+        print(f'Loading `{mip_table}`')
+        ts = load_timeseries_cubes(mip_table)
+
+        first_cube = ts[0]
+        for i, cube in enumerate(ts):
+            print(i)
+            cube.regrid(grid=first_cube, scheme=iris.analysis.Linear())
 
     breakpoint()
diff --git a/esmvaltool_sample_data/datasets.yml b/esmvaltool_sample_data/datasets.yml
index a3ca9c2..bc182b6 100644
--- a/esmvaltool_sample_data/datasets.yml
+++ b/esmvaltool_sample_data/datasets.yml
@@ -29,3 +29,5 @@ ignore:
   - CMIP6.CMIP.UA.MCM-UA-1-0.historical.r1i1p1f1.Amon.ta.gn.v20190731
   # iris.exceptions.ConcatenateError: failed to concatenate into a single cube.
   - CMIP6.CMIP.NCC.NorCPM1.historical.r1i1p1f1.Amon.ta.gn.v20190914
+  # Regridding -> ValueError: Cube 'air_temperature' must contain a single 1D y coordinate.
+  - CMIP6.CMIP.FIO-QLNM.FIO-ESM-2-0.historical.r1i1p1f1.Amon.ta.gn.v20191204

From f2d4e97693e7df0a40c0eb9e543b9f1a85cbca36 Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 17 Nov 2020 16:59:28 +0100
Subject: [PATCH 17/18] Remove unused functions

---
 esmvaltool_sample_data/__init__.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index d726b71..2ebcd74 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -91,18 +91,6 @@ def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
     return list(cubes)
 
 
-def load_map_cubes():
-    """a 4D atmospheric variable, all dimensions reduced to a few steps except
-    the horizontal dimension(s) same for an ocean variable."""
-    raise NotImplementedError
-
-
-def load_profile_cubes():
-    """a 4D atmospheric variable, all dimensions reduced to a few steps except
-    the vertical dimension(s) same for an ocean variable."""
-    raise NotImplementedError
-
-
 if __name__ == '__main__':
     VERBOSE = True
 

From 256a9a9082477d4e837ac4ebdcd5ef0d918c279e Mon Sep 17 00:00:00 2001
From: Stef Smeets <s.smeets@esciencecenter.nl>
Date: Tue, 17 Nov 2020 17:00:34 +0100
Subject: [PATCH 18/18] Remove code used for testing

---
 esmvaltool_sample_data/__init__.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/esmvaltool_sample_data/__init__.py b/esmvaltool_sample_data/__init__.py
index 2ebcd74..6544d48 100644
--- a/esmvaltool_sample_data/__init__.py
+++ b/esmvaltool_sample_data/__init__.py
@@ -13,10 +13,6 @@
 
 ignore_list = [fn.replace('.', '/') for fn in config['ignore']]
 
-ignore_list += [
-    # Add paths to problematic data sets here or to `datasets.yml`
-]
-
 
 def strip_attributes(cube: 'iris.Cube') -> None:
     """Remove attributes in-place that cause issues with merging and
@@ -107,4 +103,4 @@ def load_timeseries_cubes(mip_table: str = 'Amon') -> list:
             print(i)
             cube.regrid(grid=first_cube, scheme=iris.analysis.Linear())
 
-    breakpoint()
+    # breakpoint()