From d0ca0a1c5f0360942a6df9335b8309273a42e342 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 15:37:11 -0800 Subject: [PATCH 01/11] Change the tag used for our docker containers so we can use a single ECR repository, but use the platform and a hash of the dockerfile in the tag name so we can cache across branches. Also push newly built containers up to ECR repo so future CI runs will not have to build entire container. --- ci/build.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/ci/build.py b/ci/build.py index 8c2a6e9ac20b..b02ebaa00e51 100755 --- a/ci/build.py +++ b/ci/build.py @@ -27,6 +27,8 @@ import argparse import glob +import hashlib +import os import pprint import re import shutil @@ -58,7 +60,11 @@ def get_docker_tag(platform: str, registry: str) -> str: platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform) if not registry: registry = "mxnet_local" - return "{0}/{1}".format(registry, platform) + dockerfile_hash = 'unk' + with open(get_dockerfile(platform),"rb") as f: + bytes = f.read() + dockerfile_hash = hashlib.sha256(bytes).hexdigest()[:12] + return "{0}:{1}-{2}".format(registry, platform, dockerfile_hash) def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str: @@ -117,6 +123,8 @@ def run_cmd(): image_id = _get_local_image_id(docker_tag=tag) if not image_id: raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag)) + # now that we've built the container, push it to our docker cache + push_docker_cache(registry, tag, image_id) return image_id @@ -265,6 +273,22 @@ def load_docker_cache(tag, docker_registry) -> None: else: logging.info('Distributed docker cache disabled') +def push_docker_cache(registry, docker_tag, image_id) -> None: + """Uploads tagged container to given docker registry""" + if docker_registry: + # noinspection PyBroadException + try: + if "dkr.ecr" in registry: + # we need to get credentials to login to ECR + os.system("$(aws ecr get-login --no-include-email)") + import docker_cache + logging.info('Docker cache upload is enabled to registry %s', registry) + docker_cache._upload_image(registry, tag, image_id) + except Exception: + logging.exception('Unable to push image to Docker cache...') + else: + logging.info('Distributed docker cache disabled') + def log_environment(): instance_info = ec2_instance_info() @@ -406,7 +430,8 @@ def main() -> int: tag = get_docker_tag(platform=platform, registry=args.docker_registry) load_docker_cache(tag=tag, docker_registry=args.docker_registry) build_docker(platform, registry=args.docker_registry, - num_retries=args.docker_build_retries, no_cache=args.no_cache) + num_retries=args.docker_build_retries, no_cache=args.no_cache, + cache_intermediate=args.cache_intermediate) if args.build_only: continue shutil.rmtree(buildir(), ignore_errors=True) From a2d58def39f077d3650e333935d916991e582313 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 15:37:30 -0800 Subject: [PATCH 02/11] Use new env variable for new ECR registry in order to test. --- ci/Jenkinsfile_utils.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 8ecc7e193b97..e4135cc691db 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -160,7 +160,7 @@ def collect_test_results_windows(original_file_name, new_file_name) { def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") { - def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" + def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_ECR_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%" command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '') command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '') command = command.replaceAll('%PLATFORM%', platform) From bb1436d6199df7484a2dbeaa192f31b4f788aef3 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 15:45:08 -0800 Subject: [PATCH 03/11] Fix typo. --- ci/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build.py b/ci/build.py index b02ebaa00e51..85af3e087157 100755 --- a/ci/build.py +++ b/ci/build.py @@ -273,7 +273,7 @@ def load_docker_cache(tag, docker_registry) -> None: else: logging.info('Distributed docker cache disabled') -def push_docker_cache(registry, docker_tag, image_id) -> None: +def push_docker_cache(registry, tag, image_id) -> None: """Uploads tagged container to given docker registry""" if docker_registry: # noinspection PyBroadException From a10381b9d7dafb57f966359af5afc6674a7becd6 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 15:52:31 -0800 Subject: [PATCH 04/11] Fix variable name and make sure we login to ECR before attempting to pull image. --- ci/build.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/build.py b/ci/build.py index 85af3e087157..a3c046a66738 100755 --- a/ci/build.py +++ b/ci/build.py @@ -265,6 +265,9 @@ def load_docker_cache(tag, docker_registry) -> None: if docker_registry: # noinspection PyBroadException try: + if "dkr.ecr" in registry: + # we need to get credentials to login to ECR + os.system("$(aws ecr get-login --no-include-email)") import docker_cache logging.info('Docker cache download is enabled from registry %s', docker_registry) docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag) @@ -275,7 +278,7 @@ def load_docker_cache(tag, docker_registry) -> None: def push_docker_cache(registry, tag, image_id) -> None: """Uploads tagged container to given docker registry""" - if docker_registry: + if registry: # noinspection PyBroadException try: if "dkr.ecr" in registry: From a4df0780331d54f3e47369d18f58b33a623a0817 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 15:55:17 -0800 Subject: [PATCH 05/11] Fix variable. --- ci/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build.py b/ci/build.py index a3c046a66738..525a83031e9a 100755 --- a/ci/build.py +++ b/ci/build.py @@ -265,7 +265,7 @@ def load_docker_cache(tag, docker_registry) -> None: if docker_registry: # noinspection PyBroadException try: - if "dkr.ecr" in registry: + if "dkr.ecr" in docker_registry: # we need to get credentials to login to ECR os.system("$(aws ecr get-login --no-include-email)") import docker_cache From 762dc55eea810c39cfb8d5cf8c4db642f8fc2ce1 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Thu, 10 Dec 2020 16:05:55 -0800 Subject: [PATCH 06/11] Specify region based on environment variable. --- ci/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/build.py b/ci/build.py index 525a83031e9a..9c72c017c3d6 100755 --- a/ci/build.py +++ b/ci/build.py @@ -267,7 +267,7 @@ def load_docker_cache(tag, docker_registry) -> None: try: if "dkr.ecr" in docker_registry: # we need to get credentials to login to ECR - os.system("$(aws ecr get-login --no-include-email)") + os.system("$(aws ecr get-login --region ${DOCKER_ECR_REGION} --no-include-email)") import docker_cache logging.info('Docker cache download is enabled from registry %s', docker_registry) docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag) @@ -283,7 +283,7 @@ def push_docker_cache(registry, tag, image_id) -> None: try: if "dkr.ecr" in registry: # we need to get credentials to login to ECR - os.system("$(aws ecr get-login --no-include-email)") + os.system("$(aws ecr get-login --region ${DOCKER_ECR_REGION} --no-include-email)") import docker_cache logging.info('Docker cache upload is enabled to registry %s', registry) docker_cache._upload_image(registry, tag, image_id) From 235390a36298a553ef1bafbb9a7f017a56f41f97 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Fri, 11 Dec 2020 10:54:30 -0800 Subject: [PATCH 07/11] Only push image to docker cache when DOCKER_ECR_CACHE is defined. --- ci/build.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/build.py b/ci/build.py index 9c72c017c3d6..1607540804da 100755 --- a/ci/build.py +++ b/ci/build.py @@ -123,8 +123,9 @@ def run_cmd(): image_id = _get_local_image_id(docker_tag=tag) if not image_id: raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag)) - # now that we've built the container, push it to our docker cache - push_docker_cache(registry, tag, image_id) + # now that we've built the container, push it to our docker cache if DOCKER_ECR_CACHE is defined + if 'DOCKER_ECR_REGISTRY' in os.environ: + push_docker_cache(registry, tag, image_id) return image_id From d1e8cae6b5e9215c3d283600e9408d7945867fdc Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Mon, 8 Feb 2021 21:25:20 -0800 Subject: [PATCH 08/11] Add all copied files to hash context to make sure it's unique. --- ci/build.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/ci/build.py b/ci/build.py index 1607540804da..32d78d827556 100755 --- a/ci/build.py +++ b/ci/build.py @@ -54,17 +54,41 @@ def get_platforms(path: str = get_dockerfiles_path()) -> List[str]: platforms = list(map(lambda x: os.path.split(x)[1], sorted(files))) return platforms +def _find_copied_files(dockerfile): + """ + Creates a list of files copied into given dockerfile. + """ + copied_files = [] + basedir = os.path.dirname(dockerfile) + with open(dockerfile, "r") as f: + for line in f.readlines(): + if line.startswith("COPY "): + copied_files.append(os.path.join(basedir, line.split(" ")[1])) + return copied_files + +def _hash_file(ctx, filename): + """ + Add contents of passed file into passed hash context. + """ + bufsiz = 16384 + with open(filename,"rb") as f: + while True: + d = f.read(bufsiz) + if not d: + break + ctx.update(d) def get_docker_tag(platform: str, registry: str) -> str: """:return: docker tag to be used for the container""" platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform) if not registry: registry = "mxnet_local" - dockerfile_hash = 'unk' - with open(get_dockerfile(platform),"rb") as f: - bytes = f.read() - dockerfile_hash = hashlib.sha256(bytes).hexdigest()[:12] - return "{0}:{1}-{2}".format(registry, platform, dockerfile_hash) + dockerfile = get_dockerfile(platform) + sha256 = hashlib.sha256() + _hash_file(sha256, dockerfile) + for f in _find_copied_files(dockerfile): + _hash_file(sha256, f) + return "{0}:{1}-{2}".format(registry, platform, sha256.hexdigest()[:12]) def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str: From 2e58613b10f756ffbac5fa77330fa0ac85544766 Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Mon, 8 Feb 2021 22:28:10 -0800 Subject: [PATCH 09/11] Move ECR logic to docker_cache.py, don't push containers in normal pipeline - only push in restricted docker cache pipeline. --- ci/Jenkinsfile_docker_cache | 2 +- ci/build.py | 9 --------- ci/docker_cache.py | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/ci/Jenkinsfile_docker_cache b/ci/Jenkinsfile_docker_cache index 35e6ff9e7d56..f90bf0459f03 100644 --- a/ci/Jenkinsfile_docker_cache +++ b/ci/Jenkinsfile_docker_cache @@ -37,7 +37,7 @@ core_logic: { ws('workspace/docker_cache') { timeout(time: total_timeout, unit: 'MINUTES') { utils.init_git() - sh "ci/docker_cache.py --docker-registry ${env.DOCKER_CACHE_REGISTRY}" + sh "ci/docker_cache.py --docker-registry ${env.DOCKER_ECR_REGISTRY}" } } } diff --git a/ci/build.py b/ci/build.py index 32d78d827556..dfbf9621a99c 100755 --- a/ci/build.py +++ b/ci/build.py @@ -147,9 +147,6 @@ def run_cmd(): image_id = _get_local_image_id(docker_tag=tag) if not image_id: raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag)) - # now that we've built the container, push it to our docker cache if DOCKER_ECR_CACHE is defined - if 'DOCKER_ECR_REGISTRY' in os.environ: - push_docker_cache(registry, tag, image_id) return image_id @@ -290,9 +287,6 @@ def load_docker_cache(tag, docker_registry) -> None: if docker_registry: # noinspection PyBroadException try: - if "dkr.ecr" in docker_registry: - # we need to get credentials to login to ECR - os.system("$(aws ecr get-login --region ${DOCKER_ECR_REGION} --no-include-email)") import docker_cache logging.info('Docker cache download is enabled from registry %s', docker_registry) docker_cache.load_docker_cache(registry=docker_registry, docker_tag=tag) @@ -306,9 +300,6 @@ def push_docker_cache(registry, tag, image_id) -> None: if registry: # noinspection PyBroadException try: - if "dkr.ecr" in registry: - # we need to get credentials to login to ECR - os.system("$(aws ecr get-login --region ${DOCKER_ECR_REGION} --no-include-email)") import docker_cache logging.info('Docker cache upload is enabled to registry %s', registry) docker_cache._upload_image(registry, tag, image_id) diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 254d6237d6e2..315461b6ab44 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -96,6 +96,14 @@ def _build_save_container(platform, registry, load_cache) -> Optional[str]: # Error handling is done by returning the errorous platform name. This is necessary due to # Parallel being unable to handle exceptions +def _ecr_login(registry): + """ + Use the AWS CLI to get credentials to login to ECR. + """ + # extract region from registry + region = registry.split(".")[3] + logging.info("Logging into ECR region %s using aws-cli..", region) + os.system("$(aws ecr get-login --region "+region+" --no-include-email)") def _upload_image(registry, docker_tag, image_id) -> None: """ @@ -105,6 +113,10 @@ def _upload_image(registry, docker_tag, image_id) -> None: :param image_id: Image id :return: None """ + + if "dkr.ecr" in registry: + _ecr_login(registry) + # We don't have to retag the image since it is already in the right format logging.info('Uploading %s (%s) to %s', docker_tag, image_id, registry) push_cmd = ['docker', 'push', docker_tag] @@ -125,6 +137,9 @@ def load_docker_cache(registry, docker_tag) -> None: return assert docker_tag + if "dkr.ecr" in registry: + _ecr_login(registry) + logging.info('Loading Docker cache for %s from %s', docker_tag, registry) pull_cmd = ['docker', 'pull', docker_tag] From 624223b5fdda16826c2d5f1b73ad69b37441ce7d Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Mon, 8 Feb 2021 23:26:43 -0800 Subject: [PATCH 10/11] Remove invalid keyword arg docker_binary. --- ci/docker_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker_cache.py b/ci/docker_cache.py index 315461b6ab44..c1f2711ed077 100755 --- a/ci/docker_cache.py +++ b/ci/docker_cache.py @@ -84,7 +84,7 @@ def _build_save_container(platform, registry, load_cache) -> Optional[str]: logging.debug('Building %s as %s', platform, docker_tag) try: # Increase the number of retries for building the cache. - image_id = build_util.build_docker(docker_binary='docker', platform=platform, registry=registry, num_retries=10, no_cache=False) + image_id = build_util.build_docker(platform=platform, registry=registry, num_retries=10, no_cache=False) logging.info('Built %s as %s', docker_tag, image_id) # Push cache to registry From 70bb17a7cf9d8cd511d52509e38a5645b736b5ca Mon Sep 17 00:00:00 2001 From: Joe Evans Date: Wed, 10 Feb 2021 17:19:17 +0000 Subject: [PATCH 11/11] Remove unused function. --- ci/build.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/ci/build.py b/ci/build.py index dfbf9621a99c..78387d89aca0 100755 --- a/ci/build.py +++ b/ci/build.py @@ -295,19 +295,6 @@ def load_docker_cache(tag, docker_registry) -> None: else: logging.info('Distributed docker cache disabled') -def push_docker_cache(registry, tag, image_id) -> None: - """Uploads tagged container to given docker registry""" - if registry: - # noinspection PyBroadException - try: - import docker_cache - logging.info('Docker cache upload is enabled to registry %s', registry) - docker_cache._upload_image(registry, tag, image_id) - except Exception: - logging.exception('Unable to push image to Docker cache...') - else: - logging.info('Distributed docker cache disabled') - def log_environment(): instance_info = ec2_instance_info()