From 2eab4e38a2f0ef090bcf386333e17b01501b7b06 Mon Sep 17 00:00:00 2001 From: Steffan Sluis Date: Fri, 22 May 2020 11:46:40 +0200 Subject: [PATCH 1/3] azure: support external dependencies and outputs - Implement get_file_checksum with get_etag - Implement a DependencyAzure and OutputAzure classes - Add AzureOutput to OUTS, OUTS_MAP and CHECKSUMS_SCHEMA - Add AzureDependency to DEPS and DEPS_MAP - Add unit tests for AzureOutput and AzureDependency Fixes #3540. --- dvc/dependency/__init__.py | 3 +++ dvc/dependency/azure.py | 6 ++++++ dvc/output/__init__.py | 5 +++++ dvc/output/azure.py | 6 ++++++ dvc/remote/azure.py | 9 +++++++++ tests/unit/dependency/test_azure.py | 7 +++++++ tests/unit/output/test_azure.py | 7 +++++++ 7 files changed, 43 insertions(+) create mode 100644 dvc/dependency/azure.py create mode 100644 dvc/output/azure.py create mode 100644 tests/unit/dependency/test_azure.py create mode 100644 tests/unit/output/test_azure.py diff --git a/dvc/dependency/__init__.py b/dvc/dependency/__init__.py index 45de5bf57d..1fa2073e24 100644 --- a/dvc/dependency/__init__.py +++ b/dvc/dependency/__init__.py @@ -2,6 +2,7 @@ from urllib.parse import urlparse import dvc.output as output +from dvc.dependency.azure import AzureDependency from dvc.dependency.gs import GSDependency from dvc.dependency.hdfs import HDFSDependency from dvc.dependency.http import HTTPDependency @@ -17,6 +18,7 @@ from .repo import RepoDependency DEPS = [ + AzureDependency, GSDependency, HDFSDependency, HTTPDependency, @@ -30,6 +32,7 @@ Schemes.LOCAL: LocalDependency, Schemes.SSH: SSHDependency, Schemes.S3: S3Dependency, + Schemes.AZURE: AzureDependency, Schemes.GS: GSDependency, Schemes.HDFS: HDFSDependency, Schemes.HTTP: HTTPDependency, diff --git a/dvc/dependency/azure.py b/dvc/dependency/azure.py new file mode 100644 index 0000000000..809e227fed --- /dev/null +++ b/dvc/dependency/azure.py @@ -0,0 +1,6 @@ +from dvc.dependency.base import BaseDependency +from dvc.output.azure import AzureOutput + + +class AzureDependency(BaseDependency, AzureOutput): + pass diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 492b0e4862..0d3a7650b6 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -2,6 +2,7 @@ from voluptuous import And, Any, Coerce, Length, Lower, Required, SetTo +from dvc.output.azure import AzureOutput from dvc.output.base import BaseOutput from dvc.output.gs import GSOutput from dvc.output.hdfs import HDFSOutput @@ -9,12 +10,14 @@ from dvc.output.s3 import S3Output from dvc.output.ssh import SSHOutput from dvc.remote import Remote +from dvc.remote.azure import AzureRemote from dvc.remote.hdfs import HDFSRemote from dvc.remote.local import LocalRemote from dvc.remote.s3 import S3Remote from dvc.scheme import Schemes OUTS = [ + AzureOutput, HDFSOutput, S3Output, GSOutput, @@ -23,6 +26,7 @@ ] OUTS_MAP = { + Schemes.AZURE: AzureOutput, Schemes.HDFS: HDFSOutput, Schemes.S3: S3Output, Schemes.GS: GSOutput, @@ -45,6 +49,7 @@ # so when a few types of outputs share the same name, we only need # specify it once. CHECKSUMS_SCHEMA = { + AzureRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, LocalRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, S3Remote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, HDFSRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, diff --git a/dvc/output/azure.py b/dvc/output/azure.py new file mode 100644 index 0000000000..a47ca3b8ea --- /dev/null +++ b/dvc/output/azure.py @@ -0,0 +1,6 @@ +from dvc.output.base import BaseOutput +from dvc.remote.azure import AzureRemote + + +class AzureOutput(BaseOutput): + REMOTE = AzureRemote diff --git a/dvc/remote/azure.py b/dvc/remote/azure.py index 90d76dbf81..374146217d 100644 --- a/dvc/remote/azure.py +++ b/dvc/remote/azure.py @@ -58,6 +58,15 @@ def blob_service(self): blob_service.create_container(self.path_info.bucket) return blob_service + def get_etag(self, path_info): + etag = self.blob_service.get_blob_properties( + path_info.bucket, path_info.path + ).properties.etag + return etag.strip('"') + + def get_file_checksum(self, path_info): + return self.get_etag(path_info) + def remove(self, path_info): if path_info.scheme != self.scheme: raise NotImplementedError diff --git a/tests/unit/dependency/test_azure.py b/tests/unit/dependency/test_azure.py new file mode 100644 index 0000000000..fff7910615 --- /dev/null +++ b/tests/unit/dependency/test_azure.py @@ -0,0 +1,7 @@ +from dvc.dependency.azure import AzureDependency +from tests.unit.dependency.test_local import TestLocalDependency + + +class TestAzureDependency(TestLocalDependency): + def _get_cls(self): + return AzureDependency diff --git a/tests/unit/output/test_azure.py b/tests/unit/output/test_azure.py new file mode 100644 index 0000000000..2564f94690 --- /dev/null +++ b/tests/unit/output/test_azure.py @@ -0,0 +1,7 @@ +from dvc.output.azure import AzureOutput +from tests.unit.output.test_local import TestLocalOutput + + +class TestAzureOutput(TestLocalOutput): + def _get_cls(self): + return AzureOutput From 5df9cb8887ed1c4b66844b0f35a9934a7c91ef87 Mon Sep 17 00:00:00 2001 From: Steffan Sluis Date: Fri, 22 May 2020 18:12:23 +0200 Subject: [PATCH 2/3] Use content_md5 instead of etag The etag is different for the same file in different locations. --- dvc/remote/azure.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dvc/remote/azure.py b/dvc/remote/azure.py index 374146217d..99c2679ba6 100644 --- a/dvc/remote/azure.py +++ b/dvc/remote/azure.py @@ -18,7 +18,7 @@ class AzureRemote(BaseRemote): scheme = Schemes.AZURE path_cls = CloudURLInfo REQUIRES = {"azure-storage-blob": "azure.storage.blob"} - PARAM_CHECKSUM = "etag" + PARAM_CHECKSUM = "md5" COPY_POLL_SECONDS = 5 LIST_OBJECT_PAGE_SIZE = 5000 @@ -58,14 +58,14 @@ def blob_service(self): blob_service.create_container(self.path_info.bucket) return blob_service - def get_etag(self, path_info): - etag = self.blob_service.get_blob_properties( + def get_content_md5(self, path_info): + content_md5 = self.blob_service.get_blob_properties( path_info.bucket, path_info.path - ).properties.etag - return etag.strip('"') + ).properties.content_settings.content_md5 + return content_md5.strip('"') def get_file_checksum(self, path_info): - return self.get_etag(path_info) + return self.get_content_md5(path_info) def remove(self, path_info): if path_info.scheme != self.scheme: From f63add9b5b0a4ad8fb7ef86850e3c3b5abb8896a Mon Sep 17 00:00:00 2001 From: Steffan Sluis Date: Fri, 22 May 2020 21:42:51 +0200 Subject: [PATCH 3/3] Use etag and remove external outputs for Azure Due to inconsistencies with Azure's Content-MD5 property, it is not stable enough to be used at the moment. Using the etag offers enough reliability to support external dependencies. See also: https://discordapp.com/channels/485586884165107732/565699007037571084/713462619469643796 --- dvc/dependency/azure.py | 7 ++++--- dvc/output/__init__.py | 5 ----- dvc/output/azure.py | 6 ------ dvc/remote/azure.py | 18 ++++++++---------- tests/unit/output/test_azure.py | 7 ------- 5 files changed, 12 insertions(+), 31 deletions(-) delete mode 100644 dvc/output/azure.py delete mode 100644 tests/unit/output/test_azure.py diff --git a/dvc/dependency/azure.py b/dvc/dependency/azure.py index 809e227fed..c119d15efb 100644 --- a/dvc/dependency/azure.py +++ b/dvc/dependency/azure.py @@ -1,6 +1,7 @@ from dvc.dependency.base import BaseDependency -from dvc.output.azure import AzureOutput +from dvc.output.base import BaseOutput +from dvc.remote.azure import AzureRemote -class AzureDependency(BaseDependency, AzureOutput): - pass +class AzureDependency(BaseDependency, BaseOutput): + REMOTE = AzureRemote diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 0d3a7650b6..492b0e4862 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -2,7 +2,6 @@ from voluptuous import And, Any, Coerce, Length, Lower, Required, SetTo -from dvc.output.azure import AzureOutput from dvc.output.base import BaseOutput from dvc.output.gs import GSOutput from dvc.output.hdfs import HDFSOutput @@ -10,14 +9,12 @@ from dvc.output.s3 import S3Output from dvc.output.ssh import SSHOutput from dvc.remote import Remote -from dvc.remote.azure import AzureRemote from dvc.remote.hdfs import HDFSRemote from dvc.remote.local import LocalRemote from dvc.remote.s3 import S3Remote from dvc.scheme import Schemes OUTS = [ - AzureOutput, HDFSOutput, S3Output, GSOutput, @@ -26,7 +23,6 @@ ] OUTS_MAP = { - Schemes.AZURE: AzureOutput, Schemes.HDFS: HDFSOutput, Schemes.S3: S3Output, Schemes.GS: GSOutput, @@ -49,7 +45,6 @@ # so when a few types of outputs share the same name, we only need # specify it once. CHECKSUMS_SCHEMA = { - AzureRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, LocalRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, S3Remote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, HDFSRemote.PARAM_CHECKSUM: CHECKSUM_SCHEMA, diff --git a/dvc/output/azure.py b/dvc/output/azure.py deleted file mode 100644 index a47ca3b8ea..0000000000 --- a/dvc/output/azure.py +++ /dev/null @@ -1,6 +0,0 @@ -from dvc.output.base import BaseOutput -from dvc.remote.azure import AzureRemote - - -class AzureOutput(BaseOutput): - REMOTE = AzureRemote diff --git a/dvc/remote/azure.py b/dvc/remote/azure.py index 99c2679ba6..e23d8ed2db 100644 --- a/dvc/remote/azure.py +++ b/dvc/remote/azure.py @@ -18,7 +18,7 @@ class AzureRemote(BaseRemote): scheme = Schemes.AZURE path_cls = CloudURLInfo REQUIRES = {"azure-storage-blob": "azure.storage.blob"} - PARAM_CHECKSUM = "md5" + PARAM_CHECKSUM = "etag" COPY_POLL_SECONDS = 5 LIST_OBJECT_PAGE_SIZE = 5000 @@ -29,10 +29,8 @@ def __init__(self, repo, config): self.path_info = self.path_cls(url) if not self.path_info.bucket: - self.path_info = self.path_cls.from_parts( - scheme=self.scheme, - netloc=os.getenv("AZURE_STORAGE_CONTAINER_NAME"), - ) + container = os.getenv("AZURE_STORAGE_CONTAINER_NAME") + self.path_info = self.path_cls(f"azure://{container}") self.connection_string = config.get("connection_string") or os.getenv( "AZURE_STORAGE_CONNECTION_STRING" @@ -58,14 +56,14 @@ def blob_service(self): blob_service.create_container(self.path_info.bucket) return blob_service - def get_content_md5(self, path_info): - content_md5 = self.blob_service.get_blob_properties( + def get_etag(self, path_info): + etag = self.blob_service.get_blob_properties( path_info.bucket, path_info.path - ).properties.content_settings.content_md5 - return content_md5.strip('"') + ).properties.etag + return etag.strip('"') def get_file_checksum(self, path_info): - return self.get_content_md5(path_info) + return self.get_etag(path_info) def remove(self, path_info): if path_info.scheme != self.scheme: diff --git a/tests/unit/output/test_azure.py b/tests/unit/output/test_azure.py deleted file mode 100644 index 2564f94690..0000000000 --- a/tests/unit/output/test_azure.py +++ /dev/null @@ -1,7 +0,0 @@ -from dvc.output.azure import AzureOutput -from tests.unit.output.test_local import TestLocalOutput - - -class TestAzureOutput(TestLocalOutput): - def _get_cls(self): - return AzureOutput