From a6c028714053c6069c08f164c128c4bf70d953eb Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 10:53:12 -0700 Subject: [PATCH 01/19] add some logging Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 80 ++++++++++++++++--------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index dd528c4e..b1c5457a 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -258,16 +258,16 @@ def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str: return build_args def apply_tools( - self, - pre_encapsulate_post_scripts: typing.Dict, + self, + pre_encapsulate_post_scripts: typing.Dict, run_env: typing.Dict ) -> None: """Apply tools to the model. - + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. run_env: The run environment. - + Raises: Exception: An error occurred while applying tools to the model. """ @@ -311,22 +311,22 @@ def apply_tools( ) def gather_system_env_details( - self, - pre_encapsulate_post_scripts: typing.Dict, + self, + pre_encapsulate_post_scripts: typing.Dict, model_name: str ) -> None: """Gather system environment details. - + Args: pre_encapsulate_post_scripts: The pre, encapsulate and post scripts. model_name: The model name. Returns: None - + Raises: Exception: An error occurred while gathering system environment details. - + Note: This function is used to gather system environment details. """ @@ -334,7 +334,7 @@ def gather_system_env_details( pre_env_details = {} pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh" pre_env_details["args"] = model_name.replace("/", "_") + "_env" - pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) + pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details) print(f"pre encap post scripts: {pre_encapsulate_post_scripts}") def copy_scripts(self) -> None: @@ -367,18 +367,18 @@ def cleanup(self) -> None: self.console.sh("rm -rf scripts/common/post_scripts") if os.path.exists("scripts/common/tools"): # remove the scripts/common/tools directory - self.console.sh("rm -rf scripts/common/tools") - print(f"scripts/common directory has been cleaned up.") + self.console.sh("rm -rf scripts/common/tools") + print(f"scripts/common directory has been cleaned up.") def get_gpu_arg(self, requested_gpus: str) -> str: """Get the GPU arguments. - + Args: requested_gpus: The requested GPUs. - + Returns: str: The GPU arguments. - + Raises: RuntimeError: An error occurred while getting the GPU arguments. """ @@ -438,10 +438,10 @@ def get_gpu_arg(self, requested_gpus: str) -> str: def get_cpu_arg(self) -> str: """Get the CPU arguments. - + Returns: str: The CPU arguments. - + Raises: RuntimeError: An error occurred while getting the CPU arguments. """ @@ -455,13 +455,13 @@ def get_cpu_arg(self) -> str: def get_env_arg(self, run_env: typing.Dict) -> str: """Get the environment arguments. - + Args: run_env: The run environment. - + Returns: str: The environment arguments. - + Raises: RuntimeError: An error occurred while getting the environment arguments. """ @@ -483,13 +483,13 @@ def get_env_arg(self, run_env: typing.Dict) -> str: def get_mount_arg(self, mount_datapaths: typing.List) -> str: """Get the mount arguments. - + Args: mount_datapaths: The mount data paths. - + Returns: str: The mount arguments. - + Raises: RuntimeError: An error occurred while getting the mount arguments. """ @@ -571,7 +571,7 @@ def run_model_impl( use_cache_str = "" if self.args.clean_docker_cache: - use_cache_str = "--no-cache" + use_cache_str = "--no-cache" # build docker container print(f"Building Docker image...") @@ -617,7 +617,7 @@ def run_model_impl( # print base docker image digest run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") - print(f"BASE DOCKER SHA is {run_details.docker_sha}") + print(f"BASE DOCKER SHA is {run_details.docker_sha}") else: container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_") @@ -653,7 +653,7 @@ def run_model_impl( # get docker run options docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' " # Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number. - docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " + docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' " # gather data # TODO: probably can use context.ctx instead of another dictionary like run_env here @@ -691,7 +691,7 @@ def run_model_impl( docker_options += self.get_env_arg(run_env) docker_options += self.get_mount_arg(mount_datapaths) - print(docker_options) + print(docker_options) # get machine name run_details.machine_name = self.console.sh("hostname") @@ -720,7 +720,7 @@ def run_model_impl( elif gpu_vendor.find("NVIDIA") != -1: smi = model_docker.sh("/usr/bin/nvidia-smi || true") else: - raise RuntimeError("Unable to determine gpu vendor.") + raise RuntimeError("Unable to determine gpu vendor.") # clean up previous model run model_dir = "run_directory" @@ -755,7 +755,7 @@ def run_model_impl( else: # http or https model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \ "; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \ - info['url'], timeout=240, secret="git clone " + info['url'] ) + info['url'], timeout=240, secret="git clone " + info['url'] ) else: model_docker.sh("git clone " + info["url"], timeout=240) @@ -795,7 +795,7 @@ def run_model_impl( commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true ") print("======================================================") print("MODEL REPO COMMIT: ", commit ) - print("======================================================") + print("======================================================") # copy scripts to model directory model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/") @@ -827,7 +827,7 @@ def run_model_impl( print(f"Build Info::{selected_data_provider}") # keep model_dir as universally rw - model_docker.sh("chmod -R a+rw " + model_dir) + model_docker.sh("chmod -R a+rw " + model_dir) # run model test_start_time = time.time() @@ -875,7 +875,7 @@ def run_model_impl( print("keep_alive is specified; model_dir(" + model_dir + ") is not removed") # explicitly delete model docker to stop the container, without waiting for the in-built garbage collector - del model_docker + del model_docker def run_model(self, model_info: typing.Dict) -> bool: """Run model on container. @@ -1005,6 +1005,10 @@ def run_model(self, model_info: typing.Dict) -> bool: if multiple_results: run_details.performance = multiple_results + self.console.sh("pwd") + self.console.sh("ls -l") + self.console.sh(f"cat {multiple_results}") + # check the file of multiple results, check the columns of 'model,performance,metric' with open(multiple_results, 'r') as f: header = f.readline().strip().split(',') @@ -1017,7 +1021,7 @@ def run_model(self, model_info: typing.Dict) -> bool: if col == '': run_details.performance = None print("Error: Performance metric is empty in multiple results file.") - break + break else: perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*" run_details.performance = self.console.sh("cat " + log_file_path + @@ -1049,7 +1053,7 @@ def run_model(self, model_info: typing.Dict) -> bool: perf_csv=self.args.output, ) - self.return_status &= (run_details.status == 'SUCCESS') + self.return_status &= (run_details.status == 'SUCCESS') except Exception as e: self.return_status = False @@ -1063,7 +1067,7 @@ def run_model(self, model_info: typing.Dict) -> bool: update_perf_csv( exception_result="perf_entry.json", perf_csv=self.args.output, - ) + ) except Exception as e: self.return_status = False @@ -1077,7 +1081,7 @@ def run_model(self, model_info: typing.Dict) -> bool: update_perf_csv( exception_result="perf_entry.json", perf_csv=self.args.output, - ) + ) return self.return_status @@ -1103,7 +1107,7 @@ def run(self) -> bool: elif host_os.find("HOST_SLES") != -1: print(self.console.sh("zypper info rocm-libs")) elif host_os.find("HOST_AZURE") != -1: - print(self.console.sh("tdnf info rocm-libs")) + print(self.console.sh("tdnf info rocm-libs")) else: print("ERROR: Unable to detect host OS.") self.return_status = False @@ -1126,7 +1130,7 @@ def run(self) -> bool: self.copy_scripts() discover_models = DiscoverModels(args=self.args) - models = discover_models.run() + models = discover_models.run() # create performance csv if not os.path.exists(self.args.output): From 09636968390f16f5379230a1ad045de8189b10cd Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 12:12:33 -0700 Subject: [PATCH 02/19] ls /myworkspace instead Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index b1c5457a..226ea783 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1006,7 +1006,7 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.performance = multiple_results self.console.sh("pwd") - self.console.sh("ls -l") + self.console.sh("ls -l /myworkspace") self.console.sh(f"cat {multiple_results}") # check the file of multiple results, check the columns of 'model,performance,metric' From 77ad2831f79b500d61316162a39d0229bf2e1f64 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 12:26:15 -0700 Subject: [PATCH 03/19] log the content of scripts_path Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 226ea783..3d6c65a9 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -341,6 +341,7 @@ def copy_scripts(self) -> None: """Copy scripts to the model directory.""" scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts") print(f"Package path: {scripts_path}") + self.console.sh(f"ls -l {scripts_path}") # copy the scripts to the model directory self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/") print(f"Scripts copied to {os.getcwd()}/scripts") From 63a70bf6230705f64d96253364bcac9e40616b53 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 13:02:03 -0700 Subject: [PATCH 04/19] fix cp missing common dir Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 3d6c65a9..312f3306 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -341,9 +341,8 @@ def copy_scripts(self) -> None: """Copy scripts to the model directory.""" scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts") print(f"Package path: {scripts_path}") - self.console.sh(f"ls -l {scripts_path}") # copy the scripts to the model directory - self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/") + self.console.sh(f"cp -vLR --preserve=all {scripts_path} scripts/") print(f"Scripts copied to {os.getcwd()}/scripts") def cleanup(self) -> None: From 9bedf2cb3ce8dfb4bede2c715ae3de0347658ad5 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 13:12:25 -0700 Subject: [PATCH 05/19] ls -l pwd instead Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 312f3306..edf07b4f 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1006,7 +1006,7 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.performance = multiple_results self.console.sh("pwd") - self.console.sh("ls -l /myworkspace") + self.console.sh("ls -l") self.console.sh(f"cat {multiple_results}") # check the file of multiple results, check the columns of 'model,performance,metric' From 2384b2f253af9eb502136c19de5b7b1d88fbb5ba Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 14:43:24 -0700 Subject: [PATCH 06/19] install jq before using it Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index edf07b4f..09bea62e 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -616,6 +616,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest + self.console.sh("apt-get update && apt-get install -y jq") run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") print(f"BASE DOCKER SHA is {run_details.docker_sha}") From 39ddf7c6e9fdcb00f013aa6e15df413c04d88092 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 17:02:11 -0700 Subject: [PATCH 07/19] install apt-transport-https first Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 09bea62e..50bb34a6 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -616,7 +616,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - self.console.sh("apt-get update && apt-get install -y jq") + self.console.sh("apt-get install -y apt-transport-https && apt-get update && apt-get install -y jq") run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") print(f"BASE DOCKER SHA is {run_details.docker_sha}") From 31cf4d11936936f70c4c62341e448562fb89f38b Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 17:06:17 -0700 Subject: [PATCH 08/19] test using -y option Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 50bb34a6..29555337 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -616,7 +616,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - self.console.sh("apt-get install -y apt-transport-https && apt-get update && apt-get install -y jq") + self.console.sh("apt-get -y update && apt-get install -y jq") run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") print(f"BASE DOCKER SHA is {run_details.docker_sha}") From f3d86dd14760367edecb7c249255b9cd74e7b7c3 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 9 Jun 2025 17:21:06 -0700 Subject: [PATCH 09/19] use sudo Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 29555337..fb01a953 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -616,7 +616,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - self.console.sh("apt-get -y update && apt-get install -y jq") + self.console.sh("sudo apt-get -y update && sudo apt-get install -y jq") run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") print(f"BASE DOCKER SHA is {run_details.docker_sha}") From 29a315d340fbb1464efb82740b5948721e2686b4 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Tue, 10 Jun 2025 08:47:24 -0700 Subject: [PATCH 10/19] use . as dest: Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index fb01a953..2c61c4ce 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -342,7 +342,7 @@ def copy_scripts(self) -> None: scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts") print(f"Package path: {scripts_path}") # copy the scripts to the model directory - self.console.sh(f"cp -vLR --preserve=all {scripts_path} scripts/") + self.console.sh(f"cp -vLR --preserve=all {scripts_path} .") print(f"Scripts copied to {os.getcwd()}/scripts") def cleanup(self) -> None: From 75a9735994429da9deca163ab94e0bfb1e1d59c7 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Tue, 10 Jun 2025 10:09:43 -0700 Subject: [PATCH 11/19] reset gpu before start Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 2c61c4ce..ef83b15a 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -717,6 +717,8 @@ def run_model_impl( # echo gpu smi info if gpu_vendor.find("AMD") != -1: + for i in range(int(info["n_gpus"])): + model_docker.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true") smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true") elif gpu_vendor.find("NVIDIA") != -1: smi = model_docker.sh("/usr/bin/nvidia-smi || true") From 9adc7010a80d791041f3cc48346c0cff35c9c959 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Tue, 10 Jun 2025 10:20:15 -0700 Subject: [PATCH 12/19] reset gpu again Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index ef83b15a..03e92c5b 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -201,6 +201,8 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None: gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] # show gpu info if gpu_vendor.find("AMD") != -1: + for i in range(int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"])): + self.console.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true") self.console.sh("/opt/rocm/bin/rocm-smi || true") elif gpu_vendor.find("NVIDIA") != -1: self.console.sh("nvidia-smi -L || true") From 18e374d4a8f324e28aa6e2cacef30e2a05992e51 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Tue, 10 Jun 2025 11:20:13 -0700 Subject: [PATCH 13/19] remove gpu resets Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 03e92c5b..2c61c4ce 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -201,8 +201,6 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None: gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] # show gpu info if gpu_vendor.find("AMD") != -1: - for i in range(int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"])): - self.console.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true") self.console.sh("/opt/rocm/bin/rocm-smi || true") elif gpu_vendor.find("NVIDIA") != -1: self.console.sh("nvidia-smi -L || true") @@ -719,8 +717,6 @@ def run_model_impl( # echo gpu smi info if gpu_vendor.find("AMD") != -1: - for i in range(int(info["n_gpus"])): - model_docker.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true") smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true") elif gpu_vendor.find("NVIDIA") != -1: smi = model_docker.sh("/usr/bin/nvidia-smi || true") From 852f3f9e5363a72fcfa3af37afa1c6a57515b40f Mon Sep 17 00:00:00 2001 From: Gene Su Date: Tue, 10 Jun 2025 11:49:16 -0700 Subject: [PATCH 14/19] remove debug logging Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 2c61c4ce..62a0d403 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1006,10 +1006,6 @@ def run_model(self, model_info: typing.Dict) -> bool: if multiple_results: run_details.performance = multiple_results - self.console.sh("pwd") - self.console.sh("ls -l") - self.console.sh(f"cat {multiple_results}") - # check the file of multiple results, check the columns of 'model,performance,metric' with open(multiple_results, 'r') as f: header = f.readline().strip().split(',') From b04122b1ec543da83e552768d05fffffaf8adc3e Mon Sep 17 00:00:00 2001 From: Gene Su Date: Fri, 13 Jun 2025 08:59:46 -0700 Subject: [PATCH 15/19] add additional docker run option Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 85a7cc82..be30fa52 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -82,6 +82,7 @@ class RunDetails: data_size (str): The size of the data. data_download_duration (str): The duration of data download. build_number (str): The CI build number. + additional_docker_run_options (str): The additional options used for docker run. """ # Avoiding @property for ease of code, add if needed. @@ -110,6 +111,7 @@ def __init__(self): self.data_size = "" self.data_download_duration = "" self.build_number = "" + self.additional_docker_run_options = "" def print_perf(self): """Print the performance results of a model. @@ -691,6 +693,7 @@ def run_model_impl( # Must set env vars and mounts at the end docker_options += self.get_env_arg(run_env) docker_options += self.get_mount_arg(mount_datapaths) + docker_options += f" {run_details.additional_docker_run_options}" print(docker_options) @@ -900,6 +903,7 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.training_precision = model_info["training_precision"] run_details.args = model_info["args"] run_details.tags = model_info["tags"] + run_details.additional_docker_run_options = model_info["additional_docker_run_options"] # gets pipeline variable from jenkinsfile, default value is none run_details.pipeline = os.environ.get("pipeline") # Taking gpu arch from context assumes the host image and container have the same gpu arch. From 5697fce263b85d1d76a4da9854e431810644abae Mon Sep 17 00:00:00 2001 From: Gene Su Date: Fri, 13 Jun 2025 14:54:13 -0700 Subject: [PATCH 16/19] make additional_docker_run_options optional Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index be30fa52..03cdd3dd 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -903,7 +903,7 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.training_precision = model_info["training_precision"] run_details.args = model_info["args"] run_details.tags = model_info["tags"] - run_details.additional_docker_run_options = model_info["additional_docker_run_options"] + run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "") # gets pipeline variable from jenkinsfile, default value is none run_details.pipeline = os.environ.get("pipeline") # Taking gpu arch from context assumes the host image and container have the same gpu arch. @@ -911,12 +911,12 @@ def run_model(self, model_info: typing.Dict) -> bool: run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] # Check if model is deprecated - if model_info.get("is_deprecated", False): - print(f"WARNING: Model {model_info['name']} has been deprecated.") - if self.args.ignore_deprecated_flag: - print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.") - else: - print(f"WARNING: Skipping execution. No bypass flags mentioned.") + if model_info.get("is_deprecated", False): + print(f"WARNING: Model {model_info['name']} has been deprecated.") + if self.args.ignore_deprecated_flag: + print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.") + else: + print(f"WARNING: Skipping execution. No bypass flags mentioned.") return True # exit early # check if model is supported on current gpu architecture, if not skip. From 48f8940c89f2a412ddc3dff6ef08879936f72b0d Mon Sep 17 00:00:00 2001 From: Gene Su Date: Mon, 16 Jun 2025 12:49:41 -0700 Subject: [PATCH 17/19] add additional_docker_run_options to the output file Signed-off-by: Gene Su --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 03cdd3dd..83a0a6a1 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -1145,7 +1145,7 @@ def run(self) -> bool: # create performance csv if not os.path.exists(self.args.output): file_print( - "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number", + "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options", filename=self.args.output, mode="w", ) From b0ad39594dbee692dfdaae9373564584cf4dd93d Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 20 Jun 2025 18:47:39 -0500 Subject: [PATCH 18/19] drop jq and use docker inspect instead --- src/madengine/tools/run_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 83a0a6a1..fe26aaa1 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -618,8 +618,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - self.console.sh("sudo apt-get -y update && sudo apt-get install -y jq") - run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ") + run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.docker_image + " | cut -d '@' -f 2") print(f"BASE DOCKER SHA is {run_details.docker_sha}") else: From 0f86795e82c6c143abfc8710444d4c60412bbd26 Mon Sep 17 00:00:00 2001 From: Rohan138 Date: Fri, 20 Jun 2025 18:53:57 -0500 Subject: [PATCH 19/19] drop jq and use docker inspect instead --- src/madengine/tools/run_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index fe26aaa1..102a8542 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -618,7 +618,7 @@ def run_model_impl( print(f"BASE DOCKER is {run_details.base_docker}") # print base docker image digest - run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.docker_image + " | cut -d '@' -f 2") + run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.base_docker + " | cut -d '@' -f 2") print(f"BASE DOCKER SHA is {run_details.docker_sha}") else: