diff --git a/conftest.py b/conftest.py index 959eb7332e..74988549ea 100644 --- a/conftest.py +++ b/conftest.py @@ -251,7 +251,24 @@ def data_repository(directory_tree): @pytest.fixture( - params=['test-renku-v0.3.0.git', 'old-datasets-v0.3.0.git'], + params=[ + { + 'name': 'old-datasets-v0.3.0.git', + 'exit_code': 1 + }, + { + 'name': 'old-datasets-v0.5.0.git', + 'exit_code': 1 + }, + { + 'name': 'old-datasets-v0.5.1.git', + 'exit_code': 0 + }, + { + 'name': 'test-renku-v0.3.0.git', + 'exit_code': 1 + }, + ], scope='module', ) def old_bare_repository(request, tmpdir_factory): @@ -261,13 +278,19 @@ def old_bare_repository(request, tmpdir_factory): compressed_repo_path = Path( __file__ - ).parent / 'tests' / 'fixtures' / '{0}.tar.gz'.format(request.param) - working_dir_path = tmpdir_factory.mktemp(request.param) + ).parent / 'tests' / 'fixtures' / '{0}.tar.gz'.format( + request.param['name'] + ) + + working_dir_path = tmpdir_factory.mktemp(request.param['name']) with tarfile.open(str(compressed_repo_path), 'r') as fixture: fixture.extractall(working_dir_path.strpath) - yield working_dir_path / request.param + yield { + 'path': working_dir_path / request.param['name'], + 'exit_code': request.param['exit_code'] + } shutil.rmtree(working_dir_path.strpath) @@ -279,21 +302,29 @@ def old_repository(tmpdir_factory, old_bare_repository): from git import Repo repo_path = tmpdir_factory.mktemp('repo') - yield Repo(old_bare_repository.strpath).clone(repo_path.strpath) + yield { + 'repo': + Repo(old_bare_repository['path'].strpath).clone(repo_path.strpath), + 'exit_code': old_bare_repository['exit_code'] + } shutil.rmtree(repo_path.strpath) @pytest.fixture def old_project(old_repository): """Create a test project.""" - repo = old_repository - repository = repo.working_dir + repo = old_repository['repo'] + repository_path = repo.working_dir commit = repo.head.commit - os.chdir(repository) - yield repository - os.chdir(repository) + os.chdir(repository_path) + yield { + 'repo': 
repo, + 'path': repository_path, + 'exit_code': old_repository['exit_code'] + } + os.chdir(repository_path) repo.head.reset(commit, index=True, working_tree=True) # remove any extra non-tracked files (.pyc, etc) repo.git.clean('-xdff') diff --git a/renku/api/_git.py b/renku/api/_git.py index 0b96ec5360..11f5ed1149 100644 --- a/renku/api/_git.py +++ b/renku/api/_git.py @@ -34,6 +34,7 @@ from renku import errors from renku._compat import Path +from renku.errors import NothingToCommit COMMIT_DIFF_STRATEGY = 'DIFF' STARTED_AT = int(time.time() * 1e3) @@ -229,7 +230,7 @@ def ensure_unstaged(self, path): pass @contextmanager - def commit(self, author_date=None, commit_only=None): + def commit(self, author_date=None, commit_only=None, allow_empty=True): """Automatic commit.""" from git import Actor from renku.version import __version__, version_url @@ -286,6 +287,9 @@ def commit(self, author_date=None, commit_only=None): if not commit_only: self.repo.git.add('--all') + if not allow_empty and not self.repo.index.diff('HEAD'): + raise NothingToCommit() + argv = [os.path.basename(sys.argv[0])] + sys.argv[1:] # Ignore pre-commit hooks since we have already done everything. @@ -303,7 +307,8 @@ def transaction( up_to_date=False, commit=True, commit_only=None, - ignore_std_streams=False + ignore_std_streams=False, + allow_empty=True, ): """Perform Git checks and operations.""" if clean: @@ -316,7 +321,7 @@ def transaction( pass if commit: - with self.commit(commit_only=commit_only): + with self.commit(commit_only=commit_only, allow_empty=allow_empty): yield self else: yield self diff --git a/renku/api/repository.py b/renku/api/repository.py index 0d1eafc261..28f1dc5b65 100644 --- a/renku/api/repository.py +++ b/renku/api/repository.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Client for handling a local repository.""" - +import subprocess import uuid from collections import defaultdict from contextlib import contextmanager @@ -99,11 +99,13 @@ def __attrs_post_init__(self): # initialize submodules if self.repo: - check_output([ - 'git', 'submodule', 'update', '--init', '--recursive' - ], - cwd=str(self.path)) - # TODO except + try: + check_output([ + 'git', 'submodule', 'update', '--init', '--recursive' + ], + cwd=str(self.path)) + except subprocess.CalledProcessError: + pass @property def lock(self): @@ -251,42 +253,49 @@ def subclients(self, parent_commit): Submodule.iter_items(self.repo, parent_commit=parent_commit) ] except (RuntimeError, ValueError): - # There are no submodules assiciated with the given commit. + # There are no submodules associated with the given commit. submodules = [] - return self._subclients.setdefault( - parent_commit, { - submodule: self.__class__( - path=(self.path / submodule.path).resolve(), + subclients = {} + for submodule in submodules: + subpath = (self.path / submodule.path).resolve() + is_renku = subpath / Path(self.renku_home) + + if subpath.exists() and is_renku.exists(): + subclients[submodule] = self.__class__( + path=subpath, parent=(self, submodule), ) - for submodule in submodules - } - ) + + return subclients def resolve_in_submodules(self, commit, path): """Resolve filename in submodules.""" original_path = self.path / path - if original_path.is_symlink() or str(path - ).startswith('.renku/vendors'): + in_vendor = str(path).startswith('.renku/vendors') + + if original_path.is_symlink() or in_vendor: original_path = original_path.resolve() + for submodule, subclient in self.subclients(commit).items(): - try: - subpath = original_path.relative_to(subclient.path) - return ( - subclient, - subclient.find_previous_commit( - subpath, revision=submodule.hexsha - ), - subpath, - ) - except ValueError: - pass + if (Path(submodule.path) / Path('.git')).exists(): + + try: + subpath = 
original_path.relative_to(subclient.path) + return ( + subclient, + subclient.find_previous_commit( + subpath, revision=submodule.hexsha + ), + subpath, + ) + except ValueError: + pass return self, commit, path @contextmanager - def with_metadata(self): + def with_metadata(self, read_only=False): """Yield an editable metadata object.""" from renku.models.projects import Project @@ -299,7 +308,8 @@ def with_metadata(self): yield metadata - metadata.to_yaml() + if not read_only: + metadata.to_yaml() @contextmanager def with_workflow_storage(self): diff --git a/renku/cli/_checks/__init__.py b/renku/cli/_checks/__init__.py index 7d484741c2..b63859df8f 100644 --- a/renku/cli/_checks/__init__.py +++ b/renku/cli/_checks/__init__.py @@ -17,11 +17,11 @@ # limitations under the License. """Define repository checks for :program:`renku doctor`.""" -from .files_in_datasets import check_missing_files -from .location_datasets import check_dataset_metadata +from .migrate_datasets import (check_dataset_metadata, check_missing_files) from .references import check_missing_references -# Checks will be executed in the order as they are listed in __all__ +# Checks will be executed in the order as they are listed in __all__. +# They are mostly used in ``doctor`` command to inspect broken things. __all__ = ( 'check_dataset_metadata', 'check_missing_files', diff --git a/renku/cli/_checks/files_in_datasets.py b/renku/cli/_checks/files_in_datasets.py deleted file mode 100644 index 700dd1907b..0000000000 --- a/renku/cli/_checks/files_in_datasets.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2019 - Swiss Data Science Center (SDSC) -# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and -# Eidgenössische Technische Hochschule Zürich (ETHZ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Check location of files in datasets.""" - -from collections import defaultdict -from pathlib import Path - -import click - -from .._echo import WARNING - - -def check_missing_files(client): - """Find missing files listed in datasets.""" - missing = defaultdict(list) - - for path, dataset in client.datasets.items(): - for file_ in dataset.files: - filepath = Path(file_.path) - if not filepath.exists(): - missing[str( - path.parent.relative_to(client.renku_datasets_path) - )].append(str(filepath)) - - if not missing: - return True - - click.secho( - WARNING + 'There are missing files in datasets.' - # '\n (use "renku dataset clean " to clean them)' - ) - - for dataset, files in missing.items(): - click.secho( - '\n\t' + click.style(dataset, fg='yellow') + ':\n\t ' + - '\n\t '.join(click.style(path, fg='red') for path in files) - ) - - return False diff --git a/renku/cli/_checks/location_datasets.py b/renku/cli/_checks/location_datasets.py deleted file mode 100644 index d84bd600a2..0000000000 --- a/renku/cli/_checks/location_datasets.py +++ /dev/null @@ -1,46 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright 2019 - Swiss Data Science Center (SDSC) -# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and -# Eidgenössische Technische Hochschule Zürich (ETHZ). -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Check location of dataset metadata files.""" - -import click - -from .._echo import WARNING - - -def _dataset_metadata_pre_0_3_4(client): - """Return paths of dataset metadata for pre 0.3.4.""" - return (client.path / 'data').rglob('metadata.yml') - - -def check_dataset_metadata(client): - """Check location of dataset metadata.""" - # Find pre 0.3.4 metadata files. - old_metadata = list(_dataset_metadata_pre_0_3_4(client)) - - if not old_metadata: - return True - - click.secho( - WARNING + 'There are metadata files in the old location.' - '\n (use "renku migrate datasets" to move them)\n\n\t' + '\n\t'.join( - click.style(str(path.relative_to(client.path)), fg='yellow') - for path in old_metadata - ) + '\n' - ) - - return False diff --git a/renku/cli/_checks/migrate_datasets.py b/renku/cli/_checks/migrate_datasets.py new file mode 100644 index 0000000000..f3197d3656 --- /dev/null +++ b/renku/cli/_checks/migrate_datasets.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2019 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Checks needed to determine dataset migration policy.""" +import shutil +import uuid +from collections import defaultdict +from pathlib import Path + +import click + +from renku.models.datasets import Dataset +from renku.models.refs import LinkReference + +from .._echo import WARNING + + +def dataset_pre_0_3(client): + """Return paths of dataset metadata for pre 0.3.4.""" + return (client.path / 'data').rglob(client.METADATA) + + +def check_dataset_metadata(client): + """Check location of dataset metadata.""" + # Find pre 0.3.4 metadata files. + old_metadata = list(dataset_pre_0_3(client)) + + if not old_metadata: + return True + + click.secho( + WARNING + 'There are metadata files in the old location.' + '\n (use "renku migrate datasets" to move them)\n\n\t' + '\n\t'.join( + click.style(str(path.relative_to(client.path)), fg='yellow') + for path in old_metadata + ) + '\n' + ) + + return False + + +def check_missing_files(client): + """Find missing files listed in datasets.""" + missing = defaultdict(list) + + for path, dataset in client.datasets.items(): + for file_ in dataset.files: + filepath = Path(file_.path) + if not filepath.exists(): + missing[str( + path.parent.relative_to(client.renku_datasets_path) + )].append(str(filepath)) + + if not missing: + return True + + click.secho( + WARNING + 'There are missing files in datasets.' 
+ # '\n (use "renku dataset clean " to clean them)' + ) + + for dataset, files in missing.items(): + click.secho( + '\n\t' + click.style(dataset, fg='yellow') + ':\n\t ' + + '\n\t '.join(click.style(path, fg='red') for path in files) + ) + + return False + + +def check_dataset_resources(client): + """Find missing datasets or files listed in metadata.""" + missing_datasets = defaultdict(list) + missing_files = defaultdict(list) + + for path, dataset in client.datasets.items(): + + if not (Path(dataset.path) / Path(client.METADATA)).exists(): + missing_datasets[path] = dataset + + for file_ in dataset.files: + filepath = Path(file_.path) + if not filepath.exists(): + relative = path.parent.relative_to(client.renku_datasets_path) + missing_files[str(relative)].append(str(filepath)) + + return missing_datasets, missing_files + + +def ensure_clean_lock(client): + """Make sure Renku lock file is not part of repository.""" + lock_file = client.path / '.renku.lock' + if lock_file.exists(): + lock_file.unlink() + + # Add lock file to .gitignore. 
+ gitignore = client.path / '.gitignore' + if str(lock_file.name) not in gitignore.read_text(): + gitignore.open('a').write('\n{0}\n'.format(lock_file.name)) + + +def migrate_datasets_pre_v0_3(client): + """Migrate datasets from Renku 0.3.x.""" + for old_path in dataset_pre_0_3(client): + name = str(old_path.parent.relative_to(client.path / 'data')) + + dataset = Dataset.from_yaml(old_path, client=client) + new_path = (client.renku_datasets_path / dataset.uid / client.METADATA) + new_path.parent.mkdir(parents=True, exist_ok=True) + + with client.with_metadata(read_only=True) as meta: + for module in client.repo.submodules: + if Path(module.url).name == meta.name: + module.remove() + + for file_ in dataset.files: + if not Path(file_.path).exists(): + expected_path = ( + client.path / 'data' / dataset.name / file_.path + ) + if expected_path.exists(): + file_.path = expected_path.relative_to(client.path) + + dataset.__reference__ = new_path + dataset.to_yaml() + + Path(old_path).unlink() + ref = LinkReference.create( + client=client, + name='datasets/{0}'.format(name), + force=True, + ) + ref.set_reference(new_path) + + +def migrate_broken_dataset_paths(client): + """Ensure all paths are using correct directory structure.""" + for dataset in client.datasets.values(): + dataset_path = Path(dataset.path) + + expected_path = (client.renku_datasets_path / Path(dataset.identifier)) + + if not dataset_path.exists(): + dataset_path = ( + client.renku_datasets_path / uuid.UUID(dataset.identifier).hex + ) + + if not expected_path.exists(): + shutil.move(str(dataset_path), str(expected_path)) + dataset.path = expected_path + dataset.__reference__ = expected_path / client.METADATA + + for file_ in dataset.files: + file_path = Path(file_.path) + if not file_path.exists() and file_.path.startswith('..'): + new_path = ( + client.renku_datasets_path / dataset.uid / file_path + ).resolve().relative_to(client.path) + + file_.path = new_path + file_._label = new_path + + _, commit, _ 
= client.resolve_in_submodules( + client.find_previous_commit(file_.path, revision='HEAD'), + file_.path, + ) + id_format = 'blob/{commit}/{path}' + file_._id = id_format.format( + commit=commit.hexsha, path=new_path + ) + + dataset.to_yaml() + + +def dataset_file_path_migration(client, dataset, file_): + """Migrate a DatasetFile file path.""" + file_path = Path(file_.path) + if not file_path.exists(): + # old-style migrated paths relative to dataset root + if file_.path.startswith('..'): + new_path = (client.renku_datasets_path / dataset.uid / + file_path).resolve().relative_to(client.path) + else: + new_path = (( + client.path / client.datadir / dataset.name / file_path + ).relative_to(client.path)) + file_.path = new_path + + _, commit, _ = client.resolve_in_submodules( + client.find_previous_commit(file_.path, revision='HEAD'), + file_.path, + ) + file_._id = file_.default_id() + file_._label = file_.default_label() + + +def fix_uncommitted_labels(client): + """Ensure files have correct label instantiation.""" + for dataset in client.datasets.values(): + for file_ in dataset.files: + _, commit, _ = client.resolve_in_submodules( + client.find_previous_commit(file_.path, revision='HEAD'), + file_.path, + ) + file_.commit = commit + if 'UNCOMMITTED' in file_._label: + file_._label = file_.default_label() + file_._id = file_.default_id() + dataset.to_yaml() + + +STRUCTURE_MIGRATIONS = [ + ensure_clean_lock, + migrate_datasets_pre_v0_3, + migrate_broken_dataset_paths, + fix_uncommitted_labels, +] diff --git a/renku/cli/_client.py b/renku/cli/_client.py index 2a8c505859..e7ed4267c4 100644 --- a/renku/cli/_client.py +++ b/renku/cli/_client.py @@ -44,6 +44,7 @@ def pass_local_client( commit=None, commit_only=None, ignore_std_streams=True, + allow_empty=True, lock=None, ): """Pass client from the current context to the decorated command.""" @@ -55,6 +56,7 @@ def pass_local_client( commit=commit, commit_only=commit_only, ignore_std_streams=ignore_std_streams, + 
allow_empty=allow_empty, lock=lock, ) @@ -72,7 +74,8 @@ def new_func(*args, **kwargs): up_to_date=up_to_date, commit=commit, commit_only=commit_only, - ignore_std_streams=ignore_std_streams + ignore_std_streams=ignore_std_streams, + allow_empty=allow_empty, ) stack.enter_context(transaction) diff --git a/renku/cli/dataset.py b/renku/cli/dataset.py index 1544a15012..9397c54028 100644 --- a/renku/cli/dataset.py +++ b/renku/cli/dataset.py @@ -169,7 +169,10 @@ from renku.api._git import COMMIT_DIFF_STRATEGY from renku.api.datasets import DATASET_METADATA_PATHS +from renku.cli._checks.migrate_datasets import check_dataset_resources, \ + dataset_pre_0_3 from renku.cli._providers import ProviderFactory +from renku.errors import MigrationRequired from renku.models._tabulate import tabulate from renku.models.datasets import Dataset @@ -195,6 +198,12 @@ def dataset(ctx, client, revision, datadir, format): """Handle datasets.""" ctx.meta['renku.datasets.datadir'] = datadir + missing_dataset, missing_files = check_dataset_resources(client) + old_datasets = [ds for ds in dataset_pre_0_3(client)] + + if missing_dataset or missing_files or old_datasets: + raise MigrationRequired('datasets') + if ctx.invoked_subcommand is not None: return diff --git a/renku/cli/migrate.py b/renku/cli/migrate.py index f45bd6f566..12284c1584 100644 --- a/renku/cli/migrate.py +++ b/renku/cli/migrate.py @@ -26,12 +26,9 @@ ``renku migrate datasets`` command will take care of it. 
""" -import os - import click -import yaml -from renku.api._git import COMMIT_DIFF_STRATEGY +from renku.cli._checks.migrate_datasets import STRUCTURE_MIGRATIONS from ._client import pass_local_client @@ -43,35 +40,16 @@ def migrate(): @migrate.command() @pass_local_client( - clean=False, + clean=True, commit=True, - commit_only=COMMIT_DIFF_STRATEGY, + allow_empty=False, ) @click.pass_context def datasets(ctx, client): """Migrate dataset metadata.""" - from renku.models._jsonld import asjsonld - from renku.models.datasets import Dataset - from renku.models.refs import LinkReference - - from ._checks.location_datasets import _dataset_metadata_pre_0_3_4 - - for old_path in _dataset_metadata_pre_0_3_4(client): - dataset = Dataset.from_yaml(old_path, client=client) - - name = str(old_path.parent.relative_to(client.path / 'data')) - new_path = (client.renku_datasets_path / dataset.uid / client.METADATA) - new_path.parent.mkdir(parents=True, exist_ok=True) - - dataset = dataset.rename_files( - lambda key: os.path. 
- relpath(str(old_path.parent / key), start=str(new_path.parent)) - ) - - with new_path.open('w') as fp: - yaml.dump(asjsonld(dataset), fp, default_flow_style=False) - - old_path.unlink() + results = [ + migration(client) is not False for migration in STRUCTURE_MIGRATIONS + ] - LinkReference.create(client=client, - name='datasets/' + name).set_reference(new_path) + if all(results) and client.repo.index.diff(None): + click.secho('OK', fg='green') diff --git a/renku/errors.py b/renku/errors.py index fcb46d82c6..f68fcaae3c 100644 --- a/renku/errors.py +++ b/renku/errors.py @@ -155,6 +155,28 @@ def __init__(self, ignored): ) +class MigrationRequired(RenkuException, click.ClickException): + """Raise when migration is required.""" + + def __init__(self, resource_type): + """Build a custom message.""" + super(MigrationRequired, self).__init__( + 'Broken resource of type `{0}` found.\n' + 'Migration is required.\n' + 'Please check `renku migrate --help` ' + 'command to fix the issue.\n' + 'Hint: `renku migrate datasets`'.format(resource_type) + ) + + +class NothingToCommit(RenkuException, click.ClickException): + """Raise when there is nothing to commit.""" + + def __init__(self): + """Build a custom message.""" + super(NothingToCommit, self).__init__('There is nothing to commit.') + + class FailedMerge(RenkuException, click.ClickException): """Raise when automatic merge failed.""" diff --git a/renku/models/_jsonld.py b/renku/models/_jsonld.py index f7d0caae6f..4c874e24bf 100644 --- a/renku/models/_jsonld.py +++ b/renku/models/_jsonld.py @@ -24,6 +24,7 @@ from datetime import datetime, timezone import attr +import yaml from attr._compat import iteritems from attr._funcs import has from attr._make import Factory, fields @@ -31,7 +32,7 @@ from renku._compat import Path from renku.models._locals import ReferenceMixin, with_reference -from renku.models.migrations import MIGRATIONS +from renku.models.migrations import JSONLD_MIGRATIONS KEY = '__json_ld' KEY_CLS = 
'__json_ld_cls' @@ -346,6 +347,22 @@ def from_jsonld( data.setdefault('@context', cls._jsonld_context) + schema_type = data.get('@type') + migrations = [] + + if isinstance(schema_type, list): + for schema in schema_type: + mig_ = JSONLD_MIGRATIONS.get(schema) + if mig_: + migrations += mig_ + + if isinstance(schema_type, str) and not migrations: + migrations += JSONLD_MIGRATIONS.get(schema_type, []) + + for migration in set(migrations): + data = migration(data) + __source__ = migration(__source__) + if data['@context'] != cls._jsonld_context: try: compacted = ld.compact(data, {'@context': cls._jsonld_context}) @@ -354,9 +371,6 @@ def from_jsonld( else: compacted = data - for migration in MIGRATIONS: - data = migration(data) - fields = cls._jsonld_fields data_ = {} @@ -405,8 +419,6 @@ def asjsonld(self): def to_yaml(self): """Store an instance to the referenced YAML file.""" - import yaml - dumper = yaml.dumper.Dumper dumper.ignore_aliases = lambda _, data: True diff --git a/renku/models/datasets.py b/renku/models/datasets.py index 54f282e22e..daa68c1af8 100644 --- a/renku/models/datasets.py +++ b/renku/models/datasets.py @@ -217,7 +217,8 @@ def _now(self): @filename.default def default_filename(self): """Generate default filename based on path.""" - return Path(self.path).name + if self.path: + return Path(self.path).name @property def full_path(self): @@ -234,7 +235,10 @@ def size_in_mb(self): def __attrs_post_init__(self): """Set the property "name" after initialization.""" - self.name = self.filename + super().__attrs_post_init__() + + if not self.name: + self.name = self.filename def _parse_date(value): @@ -248,8 +252,11 @@ def _convert_dataset_files(value): """Convert dataset files.""" coll = value - if isinstance(value, dict): # compatibility with previous versions - coll = value.values() + if isinstance(coll, dict): # compatibility with previous versions + if any([key.startswith('@') for key in coll.keys()]): + return [DatasetFile.from_jsonld(coll)] + else: 
+ coll = value.values() return [DatasetFile.from_jsonld(v) for v in coll] @@ -474,7 +481,10 @@ def unlink_file(self, file_path): def __attrs_post_init__(self): """Post-Init hook.""" - self._id = self.identifier + super().__attrs_post_init__() + + if not self._id: + self._id = self.identifier if not self._label: self._label = self.identifier @@ -483,16 +493,18 @@ def __attrs_post_init__(self): self.path = str(self.client.renku_datasets_path / str(self.uid)) if self.files: - for datasetfile in self.files: - if datasetfile.client is None: + for dataset_file in self.files: + file_exists = Path(dataset_file.path).exists() + + if dataset_file.client is None and file_exists: client, _, _ = self.client.resolve_in_submodules( self.client.find_previous_commit( - datasetfile.path, revision='HEAD' + dataset_file.path, revision='HEAD' ), - datasetfile.path, + dataset_file.path, ) - datasetfile.client = client + dataset_file.client = client try: self.commit = self.client.find_previous_commit( diff --git a/renku/models/migrations/__init__.py b/renku/models/migrations/__init__.py index 8d3c7bcaae..6dc9b470a6 100644 --- a/renku/models/migrations/__init__.py +++ b/renku/models/migrations/__init__.py @@ -15,8 +15,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Renku migrations.""" +"""Renku JSON-LD migrations.""" -from renku.models.migrations.dataset import migrate_dataset +from renku.models.migrations.dataset import ( + migrate_dataset_schema, + migrate_absolute_paths, +) -MIGRATIONS = [migrate_dataset] +JSONLD_MIGRATIONS = { + 'dctypes:Dataset': [migrate_dataset_schema, migrate_absolute_paths], + 'schema:Dataset': [migrate_absolute_paths], +} diff --git a/renku/models/migrations/dataset.py b/renku/models/migrations/dataset.py index 272c58c928..3ffb988fe1 100644 --- a/renku/models/migrations/dataset.py +++ b/renku/models/migrations/dataset.py @@ -16,16 +16,53 @@ # See the License for the specific language governing permissions and # limitations under the License. """Migrations for dataset.""" +import os +from pathlib import Path -def migrate_dataset(data): +def migrate_dataset_schema(data): """Migrate from old dataset formats.""" - if data.get('@type') != 'dctypes:Dataset': - return data + if 'authors' not in data: + return - data['creator'] = data.pop('authors', {}) + data['@context']['creator'] = data['@context'].pop( + 'authors', {'@container': 'list'} + ) + data['creator'] = data.pop('authors', {}) for file_name, file_ in data.get('files', {}).items(): file_['creator'] = file_.pop('authors', {}) return data + + +def migrate_absolute_paths(data): + """Migrate dataset paths to use relative path.""" + raw_path = data.get('path', '.') + path = Path(raw_path) + + if path.is_absolute(): + try: + data['path'] = path.relative_to(os.getcwd()) + except ValueError: + elements = raw_path.split('/') + index = elements.index('.renku') + data['path'] = Path('/'.join(elements[index:])) + + files = data.get('files', []) + + if isinstance(files, dict): + files = files.values() + + for file_ in files: + path = Path(file_.get('path'), '.') + if path.is_absolute(): + file_['path'] = path.relative_to((os.getcwd())) + + return data + + +DATASET_MIGRATIONS = [ + migrate_absolute_paths, + migrate_dataset_schema, +] diff --git 
a/renku/models/provenance/activities.py b/renku/models/provenance/activities.py index 5e5174faeb..4c12f1a8bc 100644 --- a/renku/models/provenance/activities.py +++ b/renku/models/provenance/activities.py @@ -18,6 +18,7 @@ """Represent a Git commit.""" import os +import uuid from collections import OrderedDict from pathlib import Path @@ -28,6 +29,7 @@ from renku.models.cwl import WORKFLOW_STEP_RUN_TYPES from renku.models.cwl._ascwl import CWLClass from renku.models.cwl.types import PATH_OBJECTS +from renku.models.refs import LinkReference from .agents import Person, renku_agent from .entities import Collection, CommitMixin, Entity, Process, Workflow @@ -108,13 +110,11 @@ class Activity(CommitMixin): def default_generated(self): """Entities generated by this Action.""" results = [] - for path, role in self.outputs.items(): client, commit, path = self.client.resolve_in_submodules( self.commit, path, ) - output_path = client.path / path parents = list(output_path.relative_to(client.path).parents) @@ -178,11 +178,24 @@ def parents(self): @property def paths(self): """Return all paths in the commit.""" - return { - item.a_path - for item in self.commit.diff(self.commit.parents or NULL_TREE) - # if not item.deleted_file - } + index = set() + + for file_ in self.commit.diff(self.commit.parents or NULL_TREE): + path_ = Path(file_.a_path) + is_dataset = self.client.DATASETS in str(path_) + not_refs = LinkReference.REFS not in str(path_) + does_not_exists = not path_.exists() + + if all([is_dataset, not_refs, does_not_exists]): + uid = uuid.UUID(path_.parent.name) + path_ = ( + Path(self.client.renku_home) / self.client.DATASETS / + str(uid) / self.client.METADATA + ) + + index.add(str(path_)) + + return index @classmethod def generate_id(cls, commit): diff --git a/tests/cli/test_migrate.py b/tests/cli/test_migrate.py new file mode 100644 index 0000000000..3dec5312e0 --- /dev/null +++ b/tests/cli/test_migrate.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +# +# Copyright 
2017-2019 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test ``migrate`` command.""" +from pathlib import Path + +import pytest + +from renku import LocalClient, cli +from renku.cli import RENKU_HOME + + +@pytest.mark.migration +def test_status_with_old_repository(isolated_runner, old_project): + """Test status on all old repositories created by old version of renku.""" + runner = isolated_runner + result = runner.invoke(cli.cli, ['status']) + assert 0 == result.exit_code + + output = result.output.split('\n') + assert output.pop(0) == 'On branch master' + assert output.pop(0) == 'All files were generated from the latest inputs.' + + +@pytest.mark.migration +def test_update_with_old_repository(isolated_runner, old_project): + """Test update on all old repositories created by old version of renku.""" + runner = isolated_runner + + result = runner.invoke(cli.cli, ['update']) + assert 0 == result.exit_code + + output = result.output.split('\n') + assert output.pop(0) == 'All files were generated from the latest inputs.' 
@pytest.mark.migration
def test_list_with_old_repository(isolated_runner, old_project):
    """Test dataset list on old repository."""
    result = isolated_runner.invoke(cli.cli, ['dataset'])

    # Pre-0.5.1 repos are expected to fail listing (exit_code 1) until
    # migrated; the fixture carries the expected code per repo version.
    assert old_project['exit_code'] == result.exit_code
    assert not old_project['repo'].is_dirty()


@pytest.mark.migration
def test_migrate_with_old_repository(isolated_runner, old_project):
    """Test migrate on old repository."""
    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code
    # Migration commits its own changes; the work tree must end up clean.
    assert not old_project['repo'].is_dirty()


@pytest.mark.migration
def test_correct_path_migrated(isolated_runner, old_project):
    """Check if path on dataset files has been correctly migrated."""
    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project['path'])
    assert client.datasets

    for ds in client.datasets.values():
        for file_ in ds.files:
            path_ = Path(file_.path)
            # Migrated paths must exist, be repo-relative, and be
            # reflected in the file's label and id.
            assert path_.exists()
            assert not path_.is_absolute()
            assert file_._label
            assert file_._id
            assert file_.path in file_._label
            assert file_.path in file_._id


@pytest.mark.migration
def test_author_to_creator_migration(isolated_runner, old_project):
    """Check renaming of author to creator migration."""
    client = LocalClient(path=old_project['path'])
    if client.datasets:
        dataset = client.datasets.popitem()[1]
        # Pre-0.4.0 layout used dataset directories without dashes;
        # only repos that still have that layout carry 'authors:'.
        dataset_path_pre40 = Path(dataset.path.replace('-', ''))
        if dataset_path_pre40.exists():
            metadata = (dataset_path_pre40 / client.METADATA).read_text()

            assert 'authors:' in metadata
            result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
            assert 0 == result.exit_code

            after_metadata = (Path(dataset.path) / client.METADATA).read_text()
            assert 'creator:' in after_metadata
            assert 'authors:' not in after_metadata


@pytest.mark.migration
def test_correct_relative_path(isolated_runner, old_project):
    """Check if path on dataset has been correctly migrated."""
    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code

    client = LocalClient(path=old_project['path'])
    assert client.datasets

    for ds in client.datasets.values():
        # Dataset paths must be relative and rooted inside .renku.
        assert not Path(ds.path).is_absolute()
        assert ds.path.startswith(RENKU_HOME)


@pytest.mark.migration
def test_remove_committed_lock_file(isolated_runner, old_project):
    """Check that renku lock file has been successfully removed from git."""
    repo = old_project['repo']
    repo_path = Path(old_project['path'])
    # Commit a stray lock file to simulate an old repo that tracked it.
    (repo_path / '.renku.lock').write_text('lock')

    repo.index.add(['.renku.lock'])
    repo.index.commit('locked')

    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code

    # Migration must delete the lock file, leave the tree clean, and
    # add the lock file to .gitignore so it is never re-committed.
    assert not (repo_path / '.renku.lock').exists()
    assert not repo.is_dirty()

    ignored = (repo_path / '.gitignore').read_text()
    assert '.renku.lock' in ignored


@pytest.mark.migration
def test_graph_building_after_migration(isolated_runner, old_project):
    """Check that structural migration did not break graph building."""
    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code

    result = isolated_runner.invoke(cli.cli, ['log'])
    assert 0 == result.exit_code


@pytest.mark.migration
def test_migrations_run_once(isolated_runner, old_project):
    """Check that migration commit is not empty."""
    result = isolated_runner.invoke(cli.cli, ['dataset'])
    assert old_project['exit_code'] == result.exit_code

    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 0 == result.exit_code

    # A second run has nothing to commit; with allow_empty disabled the
    # command is expected to fail (exit code 1) instead of committing
    # an empty migration.
    result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets'])
    assert 1 == result.exit_code
b/tests/fixtures/old-datasets-v0.3.0.git.tar.gz differ diff --git a/tests/fixtures/old-datasets-v0.5.0.git.tar.gz b/tests/fixtures/old-datasets-v0.5.0.git.tar.gz new file mode 100644 index 0000000000..1d14a86914 Binary files /dev/null and b/tests/fixtures/old-datasets-v0.5.0.git.tar.gz differ diff --git a/tests/fixtures/old-datasets-v0.5.1.git.tar.gz b/tests/fixtures/old-datasets-v0.5.1.git.tar.gz new file mode 100644 index 0000000000..da7f8e88b6 Binary files /dev/null and b/tests/fixtures/old-datasets-v0.5.1.git.tar.gz differ diff --git a/tests/test_cli.py b/tests/test_cli.py index 2aedeb3f4a..79b42b7040 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -413,41 +413,6 @@ def test_file_tracking(isolated_runner): assert 'output' in Path('.gitattributes').read_text() -def test_status_with_old_repository(isolated_runner, old_project): - """Test status on all old repositories created by old version of renku.""" - runner = isolated_runner - - result = runner.invoke(cli.cli, ['status']) - assert 0 == result.exit_code - - output = result.output.split('\n') - assert output.pop(0) == 'On branch master' - assert output.pop(0) == 'All files were generated from the latest inputs.' - - -def test_update_with_old_repository(isolated_runner, old_project): - """Test update on all old repositories created by old version of renku.""" - runner = isolated_runner - - result = runner.invoke(cli.cli, ['update']) - assert 0 == result.exit_code - - output = result.output.split('\n') - assert output.pop(0) == 'All files were generated from the latest inputs.' 
- - -def test_list_with_old_repository(isolated_runner, old_project): - """Test dataset list on old repository.""" - result = isolated_runner.invoke(cli.cli, ['dataset']) - assert 0 == result.exit_code - - -def test_migrate_with_old_repository(isolated_runner, old_project): - """Test migrate on old repository.""" - result = isolated_runner.invoke(cli.cli, ['migrate', 'datasets']) - assert 0 == result.exit_code - - def test_status_with_submodules(isolated_runner, monkeypatch): """Test status calculation with submodules.""" runner = isolated_runner