diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 4e3df620..102a8542 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -82,6 +82,7 @@ class RunDetails: data_size (str): The size of the data. data_download_duration (str): The duration of data download. build_number (str): The CI build number. + additional_docker_run_options (str): The additional options used for docker run. """ # Avoiding @property for ease of code, add if needed. @@ -110,6 +111,7 @@ def __init__(self): self.data_size = "" self.data_download_duration = "" self.build_number = "" + self.additional_docker_run_options = "" def print_perf(self): """Print the performance results of a model. @@ -258,16 +260,16 @@ def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str: return build_args def apply_tools( - self, - pre_encapsulate_post_scripts: typing.Dict, + self, + pre_encapsulate_post_scripts: typing.Dict, run_env: typing.Dict ) -> None: """Apply tools to the model. - + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. run_env: The run environment. - + Raises: Exception: An error occurred while applying tools to the model. """ @@ -311,22 +313,22 @@ def apply_tools( ) def gather_system_env_details( - self, - pre_encapsulate_post_scripts: typing.Dict, + self, + pre_encapsulate_post_scripts: typing.Dict, model_name: str ) -> None: """Gather system environment details. - + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. model_name: The model name. Returns: None - + Raises: Exception: An error occurred while gathering system environment details. - + Note: This function is used to gather system environment details. """ @@ -334,7 +336,7 @@ def gather_system_env_details( pre_env_details = {} pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" pre_env_details["args"] = model_name.replace("/", "_") + "_env" - pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) + pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") def copy_scripts(self) -> None: @@ -342,7 +344,7 @@ def copy_scripts(self) -> None: scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts") print(f"Package path: {scripts_path}") # copy the scripts to the model directory - self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/") + self.console.sh(f"cp -vLR --preserve=all {scripts_path} .") print(f"Scripts copied to {os.getcwd()}/scripts") def cleanup(self) -> None: @@ -367,18 +369,18 @@ def cleanup(self) -> None: self.console.sh("rm -rf scripts/common/post_scripts") if os.path.exists("scripts/common/tools"): # remove the scripts/common/tools directory - self.console.sh("rm -rf scripts/common/tools") - print(f"scripts/common directory has been cleaned up.") + self.console.sh("rm -rf scripts/common/tools") + print(f"scripts/common directory has been cleaned up.") def get_gpu_arg(self, requested_gpus: str) -> str: """Get the GPU arguments. - + Args: requested_gpus: The requested GPUs. - + Returns: str: The GPU arguments. - + Raises: RuntimeError: An error occurred while getting the GPU arguments. """ @@ -438,10 +440,10 @@ def get_gpu_arg(self, requested_gpus: str) -> str: def get_cpu_arg(self) -> str: """Get the CPU arguments. - + Returns: str: The CPU arguments. - + Raises: RuntimeError: An error occurred while getting the CPU arguments. """ @@ -455,13 +457,13 @@ def get_cpu_arg(self) -> str: def get_env_arg(self, run_env: typing.Dict) -> str: """Get the environment arguments. - + Args: run_env: The run environment. - + Returns: str: The environment arguments. - + Raises: RuntimeError: An error occurred while getting the environment arguments. """ @@ -483,13 +485,13 @@ def get_env_arg(self, run_env: typing.Dict) -> str: def get_mount_arg(self, mount_datapaths: typing.List) -> str: """Get the mount arguments. - + Args: mount_datapaths: The mount data paths. - + Returns: str: The mount arguments. - + Raises: RuntimeError: An error occurred while getting the mount arguments. """ @@ -571,7 +573,7 @@ def run_model_impl( use_cache_str = "" if self.args.clean_docker_cache: - use_cache_str = "--no-cache" + use_cache_str = "--no-cache" # build docker container print(f"Building Docker image...") @@ -616,8 +618,8 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") - print(f"BASE DOCKER SHA is {run_details.docker_sha}") + run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.base_docker + " | cut -d '@' -f 2") + print(f"BASE DOCKER SHA is {run_details.docker_sha}") else: container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_") @@ -653,7 +655,7 @@ def run_model_impl( # get docker run options docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' " # Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number. - docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " + docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " # gather data # TODO: probably can use context.ctx instead of another dictionary like run_env here @@ -690,8 +692,9 @@ def run_model_impl( # Must set env vars and mounts at the end docker_options += self.get_env_arg(run_env) docker_options += self.get_mount_arg(mount_datapaths) + docker_options += f" {run_details.additional_docker_run_options}" - print(docker_options) + print(docker_options) # get machine name run_details.machine_name = self.console.sh("hostname") @@ -720,7 +723,7 @@ def run_model_impl( elif gpu_vendor.find("NVIDIA") != -1: smi = model_docker.sh("/usr/bin/nvidia-smi || true") else: - raise RuntimeError("Unable to determine gpu vendor.") + raise RuntimeError("Unable to determine gpu vendor.") # clean up previous model run model_dir = "run_directory" @@ -755,7 +758,7 @@ def run_model_impl( else: # http or https model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \ "; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \ - info['url'], timeout=240, secret="git clone " + info['url'] ) + info['url'], timeout=240, secret="git clone " + info['url'] ) else: model_docker.sh("git clone " + info["url"], timeout=240) @@ -795,7 +798,7 @@ def run_model_impl( commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true ") print("======================================================") print("MODEL REPO COMMIT: ", commit ) - print("======================================================") + print("======================================================") # copy scripts to model directory model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/") @@ -827,7 +830,7 @@ def run_model_impl( print(f"Build Info::{selected_data_provider}") # keep model_dir as universally rw - model_docker.sh("chmod -R a+rw " + model_dir) + model_docker.sh("chmod -R a+rw " + model_dir) # run model test_start_time = time.time() @@ -875,7 +878,7 @@ def run_model_impl( print("keep_alive is specified; model_dir(" + model_dir + ") is not removed") # explicitly delete model docker to stop the container, without waiting for the in-built garbage collector - del model_docker + del model_docker def run_model(self, model_info: typing.Dict) -> bool: """Run model on container. @@ -899,6 +902,7 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.training_precision = model_info["training_precision"] run_details.args = model_info["args"] run_details.tags = model_info["tags"] + run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "") # gets pipeline variable from jenkinsfile, default value is none run_details.pipeline = os.environ.get("pipeline") # Taking gpu arch from context assumes the host image and container have the same gpu arch. @@ -906,12 +910,12 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] # Check if model is deprecated - if model_info.get("is_deprecated", False): - print(f"WARNING: Model {model_info['name']} has been deprecated.") - if self.args.ignore_deprecated_flag: - print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.") - else: - print(f"WARNING: Skipping execution. No bypass flags mentioned.") + if model_info.get("is_deprecated", False): + print(f"WARNING: Model {model_info['name']} has been deprecated.") + if self.args.ignore_deprecated_flag: + print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.") + else: + print(f"WARNING: Skipping execution. No bypass flags mentioned.") return True # exit early # check if model is supported on current gpu architecture, if not skip. @@ -1026,7 +1030,7 @@ def run_model(self, model_info: typing.Dict) -> bool: if col == '': run_details.performance = None print("Error: Performance metric is empty in multiple results file.") - break + break else: perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*" run_details.performance = self.console.sh("cat " + log_file_path + @@ -1058,7 +1062,7 @@ def run_model(self, model_info: typing.Dict) -> bool: perf_csv=self.args.output, ) - self.return_status &= (run_details.status == 'SUCCESS') + self.return_status &= (run_details.status == 'SUCCESS') except Exception as e: self.return_status = False @@ -1072,7 +1076,7 @@ def run_model(self, model_info: typing.Dict) -> bool: update_perf_csv( exception_result="perf_entry.json", perf_csv=self.args.output, - ) + ) except Exception as e: self.return_status = False @@ -1086,7 +1090,7 @@ def run_model(self, model_info: typing.Dict) -> bool: update_perf_csv( exception_result="perf_entry.json", perf_csv=self.args.output, - ) + ) return self.return_status @@ -1112,7 +1116,7 @@ def run(self) -> bool: elif host_os.find("HOST_SLES") != -1: print(self.console.sh("zypper info rocm-libs")) elif host_os.find("HOST_AZURE") != -1: - print(self.console.sh("tdnf info rocm-libs")) + print(self.console.sh("tdnf info rocm-libs")) else: print("ERROR: Unable to detect host OS.") self.return_status = False @@ -1135,12 +1139,12 @@ def run(self) -> bool: self.copy_scripts() discover_models = DiscoverModels(args=self.args) - models = discover_models.run() + models = discover_models.run() # create performance csv if not os.path.exists(self.args.output): file_print( - "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number", + "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options", filename=self.args.output, mode="w", )