From c608b0a6f46443216033837ee9ba9f7a0c9b9223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 06:52:43 +0100 Subject: [PATCH 01/11] add data_manager scaffold --- ruins/core/data_manager.py | 230 ++++++++++++++++++++++++++++++++ ruins/tests/test_datamanager.py | 31 +++++ 2 files changed, 261 insertions(+) create mode 100644 ruins/core/data_manager.py create mode 100644 ruins/tests/test_datamanager.py diff --git a/ruins/core/data_manager.py b/ruins/core/data_manager.py new file mode 100644 index 0000000..9d7fc68 --- /dev/null +++ b/ruins/core/data_manager.py @@ -0,0 +1,230 @@ +""" +Data Manager +============ + +The DataManager is a wrapper around all data sources used by RUINSapp. +It can be configured by any :class:`Config ` class +and organizes or caches all data sources using a +:class:`DataSource ` inherited class. +This makes the read and filter interface available on all sources, no matter +where they are stored. +Using the :class:`Config ` to instantiate a data +manager can in principle enable different profiles, or even an interaction +with the frontend, although not implemented nor desired at the current stage. + +Example +------- + +.. code-block:: python + + from ruins import core + + # create default config + conf = core.Config() + + # create a data manager from this + dm = core.DataManager(**conf) + +Of course, the data manager can also be used without the config, i.e. to open it +in debug mode: + +..
code-block:: python + + # using conf with conf.debug=False and overwrite it + dm = core.DataManager(**conf, debug=True) + +""" +import abc +import os +import glob +import xarray as xr +from collections.abc import Mapping +from typing import Type, List + + +DEFAULT_MIMES = { + 'nc': 'HDF5Source' +} + + +class DataSource(abc.ABC): + def __init__(self, **kwargs): + self._kwargs = kwargs + + @abc.abstractmethod + def read(self): + pass + + @abc.abstractmethod + def filter(self, **kwargs): + pass + + +class HDF5Source(DataSource): + def __init__(self, path: str, cache: bool = True, **kwargs): + super().__init__(**kwargs) + self.path = path + self.cache = cache + + # check cache settings + if self.cache: + self.data = self._load_source() + + def _load_source(self): + """Method to load the actual source on the disk""" + return xr.load_dataset(self.path) + + def read(self): + if self.cache: + return self.data + else: + return self._load_source() + + def filter(self): + pass + + +class DataManager(Mapping): + """Main class for accessing different data sources. + + The DataManager holds and manages all data sources. The default behavior is + to scan the specified path for files of known file extension and cache them + in memory. + + Parameters + ---------- + datapath : str + A location where the data is stored. The class will load all sources + there and make them accessible through DataSource classes. + cache : bool + Will be passed to the DataSource classes. If true, the source will only + be read once and then stored in memory until the DataManager gets + deconstructed. + include_mimes : dict + A dictionary of file extensions and their corresponding DataSource. + If something is not listed, the DataManager will ignore the file type. + The include_mimes can be overwritten by passing filenames directly. + + """ + def __init__(self, datapath: str = None, cache: bool = True, debug: bool = False, **kwargs) -> None: + """ + You can pass in a Config as kwargs.
+ """ + # check if no config - or a config without datapath - was passed + if datapath is None: + from ruins.core import Config + self.from_config(**Config(**kwargs)) + else: + self.from_config(datapath=datapath, cache=cache, debug=debug, **kwargs) + + def from_config(self, datapath: str = None, cache: bool = True, debug: bool = False, **kwargs) -> None: + """ + Initialize the DataManager from a :class:`Config ` object. + """ + # store the main settings + self._config = kwargs + self._datapath = datapath + self.cache = cache + self.debug = debug + + # file settings + self._data_sources = {} + + # infer data source + if self._datapath is not None: + self._infer_from_folder() + + @property + def datapath(self) -> str: + return self._datapath + + @datapath.setter + def datapath(self, path: str) -> None: + if os.path.exists(path): + self._datapath = path + self._infer_from_folder() + else: + raise OSError(f"{path} does not exist.") + + @property + def datasources(self) -> List[DataSource]: + return list(self._data_sources.keys()) + + def _infer_from_folder(self) -> None: + """ + Read all files from the datapath as specified on instantiation. + Calls :func:`add_source` on each file. + """ + # get a list of all files + file_list = glob.glob(os.path.join(self.datapath, '*')) + file_list.extend(glob.glob(os.path.join(self.datapath, '**', '*'))) + + + for fname in file_list: + self.add_source(path=fname, not_exists='warn' if self.debug else 'ignore') + + def add_source(self, path: str, not_exists: str = 'raise') -> None: + """ + Add a file as data source to the DataManager. + Only if the file has an allowed file extension, it will be managed. + Files of same name will be overwritten, this is also true if they had + different extensions.
+ + """ + # load the tracked + mimes = self._config.get('include_mimes', DEFAULT_MIMES) + + # get the basename + try: + basename, mime = os.path.basename(path).split('.') + except ValueError: + if self.debug: + print(f"[Warning]: {path} has no extension.") + return + + if mime in mimes.keys(): + # get the class - overwrite by direct kwargs settings if needed + clsName = mimes[mime] if basename not in self._config else self._config[basename] + BaseClass = self.resolve_class_name(clsName) + + # add the source + args = self._config.get(basename, {}) + args.update({'path': path, 'cache': self.cache}) + self._data_sources[basename] = BaseClass(**args) + else: + if not_exists == 'raise': + raise OSError(f"{path} is not a configured data source") + elif not_exists == 'ignore': + pass + elif not_exists == 'warn': + print(f"{path} is found, but not a configured data source") + + def resolve_class_name(self, cls_name: str) -> Type[DataSource]: + # checkout globals + cls = globals().get(cls_name, False) + + # do we have a class?
+ if not cls: + # TODO, there is maybe an extension module to search one day + raise RuntimeError(f"Can't find class {cls_name}.") + + return cls + + def __len__(self): + """Return the number of managed data sources""" + return len(self._data_sources) + + def __iter__(self): + """Iterate over all dataset names""" + for name in self._data_sources.keys(): + yield name + + def __getitem__(self, key: str): + """Return the requested datasource""" + return self._data_sources[key] + + def __repr__(self): + return f"{self.__class__.__name__}(datapath={self.datapath}, cache={self.cache})" + + def __str__(self): + return f"" diff --git a/ruins/tests/test_datamanager.py b/ruins/tests/test_datamanager.py new file mode 100644 index 0000000..b3858e0 --- /dev/null +++ b/ruins/tests/test_datamanager.py @@ -0,0 +1,31 @@ +import xarray as xr + +from ruins.core import DataManager +from ruins.core.data_manager import HDF5Source + + +def test_default_manager(): + """Instantiate the default data manager""" + dm = DataManager() + + assert dm.cache == True + + # find some datasets weather dataset + assert 'cordex_coast' in dm.datasources + assert 'CMIP5grid' in dm.datasources + + +def test_weather_dataset(): + """Test the weather dataset""" + dm = DataManager() + + # check weather dataset was loaded + assert 'weather' in dm.datasources + + # check Source type + weather = dm['weather'] + assert isinstance(weather, HDF5Source) + + # load the data + data = weather.read() + assert isinstance(data, xr.Dataset) From 140576d662ff85fbc2ffc7914a881d81e828ad8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 06:59:12 +0100 Subject: [PATCH 02/11] add DataManager import --- ruins/core/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ruins/core/__init__.py b/ruins/core/__init__.py index 3558f42..d1db6a5 100644 --- a/ruins/core/__init__.py +++ b/ruins/core/__init__.py @@ -1 +1,2 @@ -from .config import Config \ No newline at end of 
file +from .config import Config +from .data_manager import DataManager \ No newline at end of file From 23b832250ac6ba04be8b508cd83816923747c551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 09:15:24 +0100 Subject: [PATCH 03/11] add OS environ check --- .github/workflows/main.yml | 7 ++++--- ruins/tests/test_datamanager.py | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 17c550c..12d6f25 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,9 +20,10 @@ jobs: - name: Checkout and use lfs uses: actions/checkout@v2 with: - lfs: true - - name: Download LFS - run: git lfs checkout + # lfs: true + lfs: false + # - name: Download LFS + # run: git lfs checkout - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: diff --git a/ruins/tests/test_datamanager.py b/ruins/tests/test_datamanager.py index b3858e0..1c63238 100644 --- a/ruins/tests/test_datamanager.py +++ b/ruins/tests/test_datamanager.py @@ -1,8 +1,13 @@ import xarray as xr +import os from ruins.core import DataManager from ruins.core.data_manager import HDF5Source +# some datasources are backed by git-lfs which have to be disabled on +# github actions +NO_LFS = 'NO_LFS' in os.environ + def test_default_manager(): """Instantiate the default data manager""" @@ -27,5 +32,8 @@ def test_weather_dataset(): assert isinstance(weather, HDF5Source) # load the data + if NO_LFS: + return + data = weather.read() assert isinstance(data, xr.Dataset) From a151ec460f9b31910e92cb22b3213393058cd56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 09:21:06 +0100 Subject: [PATCH 04/11] add print to captured output --- ruins/tests/test_datamanager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ruins/tests/test_datamanager.py b/ruins/tests/test_datamanager.py index 1c63238..25e1cbd 100644 --- 
a/ruins/tests/test_datamanager.py +++ b/ruins/tests/test_datamanager.py @@ -33,6 +33,7 @@ def test_weather_dataset(): # load the data if NO_LFS: + print('No LFS, skipping partial test') return data = weather.read() From c1f8f0721089ef4f8debe2d195c216a94b4794f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 09:23:44 +0100 Subject: [PATCH 05/11] disable LFS tests on GH actions --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 12d6f25..45ddf1d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -38,6 +38,8 @@ jobs: run: | pip install pytest pytest-cov pytest --import-mode=append --cov-config=.coveragerc --cov=ruins --cov-report=xml + env: + NO_LFS: true - name: Upload coverage to codecov uses: codecov/codecov-action@v2 From 78433d6823c1916658f8b06c08803ffe310d4c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Fri, 25 Feb 2022 09:46:21 +0100 Subject: [PATCH 06/11] only cache on read --- ruins/core/data_manager.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ruins/core/data_manager.py b/ruins/core/data_manager.py index 9d7fc68..f1e4395 100644 --- a/ruins/core/data_manager.py +++ b/ruins/core/data_manager.py @@ -66,17 +66,16 @@ def __init__(self, path: str, cache: bool = True, **kwargs): self.path = path self.cache = cache - # check cache settings - if self.cache: - self.data = self._load_source() - def _load_source(self): """Method to load the actual source on the disk""" return xr.load_dataset(self.path) def read(self): if self.cache: + if not hasattr(self, 'data'): + self.data = self._load_source() return self.data + else: return self._load_source() From 7d117cf0077db9422ab15d95a404952b86988b5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 1 Mar 2022 08:58:46 +0100 Subject: [PATCH 07/11] Prevent LFS downloads in GH actions --- 
.github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 17c550c..d52d3ce 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,7 @@ jobs: - name: Checkout and use lfs uses: actions/checkout@v2 with: - lfs: true + lfs: false - name: Download LFS run: git lfs checkout - name: Set up Python ${{ matrix.python-version }} From 7f0f934b922e2acef086b52a9e2a9c5763d2cb31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Tue, 1 Mar 2022 12:54:33 +0100 Subject: [PATCH 08/11] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index d75b0a3..cee9ff3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # RUINS climate data and regional model app +[![Test RUINS](https://github.com/hydrocode-de/RUINSapp/actions/workflows/main.yml/badge.svg)](https://github.com/hydrocode-de/RUINSapp/actions/workflows/main.yml) +[![codecov](https://codecov.io/gh/hydrocode-de/RUINSapp/branch/main/graph/badge.svg?token=SFxENKltZb)](https://codecov.io/gh/hydrocode-de/RUINSapp) + + This is a compilation of tools to assess and visualise climate data and climate model projections at the German North Sea coast. This app is intended to especially focus on the treatment of uncertainties within the data and model projections. For more information about the RUINS project, see our website. 
From a969bac156432fd7a63c2c3bf18d886fe23d8416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Wed, 2 Mar 2022 09:34:40 +0100 Subject: [PATCH 09/11] 0.4.0 --- requirements.txt | 3 ++- ruins/__init__.py | 2 +- version.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 version.py diff --git a/requirements.txt b/requirements.txt index aade70b..0725afd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ statsmodels xarray climate-indices pillow -sklearn \ No newline at end of file +sklearn +fire \ No newline at end of file diff --git a/ruins/__init__.py b/ruins/__init__.py index 290d7c6..abeeedb 100644 --- a/ruins/__init__.py +++ b/ruins/__init__.py @@ -1 +1 @@ -__version__ = '0.3.0' \ No newline at end of file +__version__ = '0.4.0' diff --git a/version.py b/version.py new file mode 100644 index 0000000..7ffa3b2 --- /dev/null +++ b/version.py @@ -0,0 +1,48 @@ +import os +from ruins import __version__ + +def increment(which='patch'): + """ + Increment the version number. + """ + parts = __version__.split('.') + if which == 'patch': + parts[2] = str(int(parts[2]) + 1) + elif which == 'minor': + parts[1] = str(int(parts[1]) + 1) + elif which == 'major': + parts[0] = str(int(parts[0]) + 1) + else: + raise ValueError("Invalid version increment.") + return '.'.join(parts) + + +def replace(which='patch'): + """ + Increment the version number for RUINS. 
+ + """ + # find the file + path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'ruins', '__init__.py')) + + # read + with open(path, 'r') as f: + lines = f.readlines() + + # replace the version + for i, line in enumerate(lines): + if '__version__' in line: + new_version = increment(which) + lines[i] = f"__version__ = '{new_version}'\n" + break + + # overwrite + with open(path, 'w') as f: + f.writelines(lines) + + print(new_version) + + +if __name__ == '__main__': + import fire + fire.Fire(replace) \ No newline at end of file From 8b7385d7e599eaad3cdfe3e8f2f4013bd39551ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Wed, 2 Mar 2022 09:40:00 +0100 Subject: [PATCH 10/11] added badges --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cee9ff3..d9c517a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # RUINS climate data and regional model app +![GitHub release (latest by date)](https://img.shields.io/github/v/release/hydrocode-de/RUINSapp?color=success&logo=Github) +![PyPI](https://img.shields.io/pypi/v/ruins-app?color=success&logo=PyPI) [![Test RUINS](https://github.com/hydrocode-de/RUINSapp/actions/workflows/main.yml/badge.svg)](https://github.com/hydrocode-de/RUINSapp/actions/workflows/main.yml) [![codecov](https://codecov.io/gh/hydrocode-de/RUINSapp/branch/main/graph/badge.svg?token=SFxENKltZb)](https://codecov.io/gh/hydrocode-de/RUINSapp) From ad53bf99e22c676730e655f61acfbde00f3344dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20M=C3=A4licke?= Date: Wed, 2 Mar 2022 09:43:14 +0100 Subject: [PATCH 11/11] small fix to version.py --- version.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/version.py b/version.py index 7ffa3b2..5daf726 100644 --- a/version.py +++ b/version.py @@ -10,8 +10,11 @@ def increment(which='patch'): parts[2] = str(int(parts[2]) + 1) elif which == 'minor': parts[1] = str(int(parts[1]) + 1) + parts[2] = '0' elif which == 'major': parts[0] = 
str(int(parts[0]) + 1) + parts[1] = '0' + parts[2] = '0' else: raise ValueError("Invalid version increment.") return '.'.join(parts)