From a6c028714053c6069c08f164c128c4bf70d953eb Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 10:53:12 -0700
Subject: [PATCH 01/19] add some logging

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 80 ++++++++++++++++---------------
 1 file changed, 42 insertions(+), 38 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index dd528c4e..b1c5457a 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -258,16 +258,16 @@ def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str:
         return build_args
 
     def apply_tools(
-            self, 
-            pre_encapsulate_post_scripts: typing.Dict, 
+            self,
+            pre_encapsulate_post_scripts: typing.Dict,
             run_env: typing.Dict
         ) -> None:
         """Apply tools to the model.
-        
+
         Args:
             pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
             run_env: The run environment.
-            
+
         Raises:
             Exception: An error occurred while applying tools to the model.
         """
@@ -311,22 +311,22 @@ def apply_tools(
                 )
 
     def gather_system_env_details(
-            self, 
-            pre_encapsulate_post_scripts: typing.Dict, 
+            self,
+            pre_encapsulate_post_scripts: typing.Dict,
             model_name: str
         ) -> None:
         """Gather system environment details.
-        
+
         Args:
             pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
             model_name: The model name.
 
         Returns:
             None
-        
+
         Raises:
             Exception: An error occurred while gathering system environment details.
-        
+
         Note:
             This function is used to gather system environment details.
         """
@@ -334,7 +334,7 @@ def gather_system_env_details(
         pre_env_details = {}
         pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh"
         pre_env_details["args"] = model_name.replace("/", "_") + "_env"
-        pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details)        
+        pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details)
         print(f"pre encap post scripts: {pre_encapsulate_post_scripts}")
 
     def copy_scripts(self) -> None:
@@ -367,18 +367,18 @@ def cleanup(self) -> None:
                 self.console.sh("rm -rf scripts/common/post_scripts")
             if os.path.exists("scripts/common/tools"):
                 # remove the scripts/common/tools directory
-                self.console.sh("rm -rf scripts/common/tools")                
-            print(f"scripts/common directory has been cleaned up.")    
+                self.console.sh("rm -rf scripts/common/tools")
+            print(f"scripts/common directory has been cleaned up.")
 
     def get_gpu_arg(self, requested_gpus: str) -> str:
         """Get the GPU arguments.
-        
+
         Args:
             requested_gpus: The requested GPUs.
-        
+
         Returns:
             str: The GPU arguments.
-        
+
         Raises:
             RuntimeError: An error occurred while getting the GPU arguments.
         """
@@ -438,10 +438,10 @@ def get_gpu_arg(self, requested_gpus: str) -> str:
 
     def get_cpu_arg(self) -> str:
         """Get the CPU arguments.
-        
+
         Returns:
             str: The CPU arguments.
-        
+
         Raises:
             RuntimeError: An error occurred while getting the CPU arguments.
         """
@@ -455,13 +455,13 @@ def get_cpu_arg(self) -> str:
 
     def get_env_arg(self, run_env: typing.Dict) -> str:
         """Get the environment arguments.
-        
+
         Args:
             run_env: The run environment.
-        
+
         Returns:
             str: The environment arguments.
-        
+
         Raises:
             RuntimeError: An error occurred while getting the environment arguments.
         """
@@ -483,13 +483,13 @@ def get_env_arg(self, run_env: typing.Dict) -> str:
 
     def get_mount_arg(self, mount_datapaths: typing.List) -> str:
         """Get the mount arguments.
-        
+
         Args:
             mount_datapaths: The mount data paths.
-        
+
         Returns:
             str: The mount arguments.
-            
+
         Raises:
             RuntimeError: An error occurred while getting the mount arguments.
         """
@@ -571,7 +571,7 @@ def run_model_impl(
 
             use_cache_str = ""
             if self.args.clean_docker_cache:
-                use_cache_str = "--no-cache"       
+                use_cache_str = "--no-cache"
 
             # build docker container
             print(f"Building Docker image...")
@@ -617,7 +617,7 @@ def run_model_impl(
 
             # print base docker image digest
             run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
-            print(f"BASE DOCKER SHA is {run_details.docker_sha}")     
+            print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 
         else:
             container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_")
@@ -653,7 +653,7 @@ def run_model_impl(
         # get docker run options
         docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' "
         # Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number.
-        docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' "         
+        docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' "
 
         # gather data
         # TODO: probably can use context.ctx instead of another dictionary like run_env here
@@ -691,7 +691,7 @@ def run_model_impl(
         docker_options += self.get_env_arg(run_env)
         docker_options += self.get_mount_arg(mount_datapaths)
 
-        print(docker_options)        
+        print(docker_options)
 
         # get machine name
         run_details.machine_name = self.console.sh("hostname")
@@ -720,7 +720,7 @@ def run_model_impl(
             elif gpu_vendor.find("NVIDIA") != -1:
                 smi = model_docker.sh("/usr/bin/nvidia-smi || true")
             else:
-                raise RuntimeError("Unable to determine gpu vendor.")    
+                raise RuntimeError("Unable to determine gpu vendor.")
 
             # clean up previous model run
             model_dir = "run_directory"
@@ -755,7 +755,7 @@ def run_model_impl(
                     else:   # http or https
                         model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \
                                     "; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \
-                                    info['url'], timeout=240, secret="git clone " + info['url'] )                    
+                                    info['url'], timeout=240, secret="git clone " + info['url'] )
                 else:
                     model_docker.sh("git clone " + info["url"], timeout=240)
 
@@ -795,7 +795,7 @@ def run_model_impl(
             commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true  ")
             print("======================================================")
             print("MODEL REPO COMMIT: ", commit )
-            print("======================================================")            
+            print("======================================================")
 
             # copy scripts to model directory
             model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/")
@@ -827,7 +827,7 @@ def run_model_impl(
             print(f"Build Info::{selected_data_provider}")
 
             # keep model_dir as universally rw
-            model_docker.sh("chmod -R a+rw " + model_dir) 
+            model_docker.sh("chmod -R a+rw " + model_dir)
 
             # run model
             test_start_time = time.time()
@@ -875,7 +875,7 @@ def run_model_impl(
                 print("keep_alive is specified; model_dir(" + model_dir + ") is not removed")
 
         # explicitly delete model docker to stop the container, without waiting for the in-built garbage collector
-        del model_docker                           
+        del model_docker
 
     def run_model(self, model_info: typing.Dict) -> bool:
         """Run model on container.
@@ -1005,6 +1005,10 @@ def run_model(self, model_info: typing.Dict) -> bool:
                         if multiple_results:
                             run_details.performance = multiple_results
 
+                            self.console.sh("pwd")
+                            self.console.sh("ls -l")
+                            self.console.sh(f"cat {multiple_results}")
+
                             # check the file of multiple results, check the columns of 'model,performance,metric'
                             with open(multiple_results, 'r') as f:
                                 header = f.readline().strip().split(',')
@@ -1017,7 +1021,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                                         if col == '':
                                             run_details.performance = None
                                             print("Error: Performance metric is empty in multiple results file.")
-                                            break 
+                                            break
                         else:
                             perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*"
                             run_details.performance = self.console.sh("cat " + log_file_path +
@@ -1049,7 +1053,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                                 perf_csv=self.args.output,
                             )
 
-                        self.return_status &= (run_details.status == 'SUCCESS')                    
+                        self.return_status &= (run_details.status == 'SUCCESS')
 
                     except Exception as e:
                         self.return_status = False
@@ -1063,7 +1067,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                         update_perf_csv(
                             exception_result="perf_entry.json",
                             perf_csv=self.args.output,
-                        )     
+                        )
 
             except Exception as e:
                 self.return_status = False
@@ -1077,7 +1081,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                 update_perf_csv(
                     exception_result="perf_entry.json",
                     perf_csv=self.args.output,
-                )                                     
+                )
 
         return self.return_status
 
@@ -1103,7 +1107,7 @@ def run(self) -> bool:
         elif host_os.find("HOST_SLES") != -1:
             print(self.console.sh("zypper info rocm-libs"))
         elif host_os.find("HOST_AZURE") != -1:
-            print(self.console.sh("tdnf info rocm-libs"))            
+            print(self.console.sh("tdnf info rocm-libs"))
         else:
             print("ERROR: Unable to detect host OS.")
             self.return_status = False
@@ -1126,7 +1130,7 @@ def run(self) -> bool:
         self.copy_scripts()
 
         discover_models = DiscoverModels(args=self.args)
-        models = discover_models.run()     
+        models = discover_models.run()
 
         # create performance csv
         if not os.path.exists(self.args.output):

From 09636968390f16f5379230a1ad045de8189b10cd Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 12:12:33 -0700
Subject: [PATCH 02/19] ls /myworkspace instead

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index b1c5457a..226ea783 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -1006,7 +1006,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                             run_details.performance = multiple_results
 
                             self.console.sh("pwd")
-                            self.console.sh("ls -l")
+                            self.console.sh("ls -l /myworkspace")
                             self.console.sh(f"cat {multiple_results}")
 
                             # check the file of multiple results, check the columns of 'model,performance,metric'

From 77ad2831f79b500d61316162a39d0229bf2e1f64 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 12:26:15 -0700
Subject: [PATCH 03/19] log the content of scripts_path

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 226ea783..3d6c65a9 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -341,6 +341,7 @@ def copy_scripts(self) -> None:
         """Copy scripts to the model directory."""
         scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts")
         print(f"Package path: {scripts_path}")
+        self.console.sh(f"ls -l {scripts_path}")
         # copy the scripts to the model directory
         self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/")
         print(f"Scripts copied to {os.getcwd()}/scripts")

From 63a70bf6230705f64d96253364bcac9e40616b53 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 13:02:03 -0700
Subject: [PATCH 04/19] fix cp missing common dir

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 3d6c65a9..312f3306 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -341,9 +341,8 @@ def copy_scripts(self) -> None:
         """Copy scripts to the model directory."""
         scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts")
         print(f"Package path: {scripts_path}")
-        self.console.sh(f"ls -l {scripts_path}")
         # copy the scripts to the model directory
-        self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/")
+        self.console.sh(f"cp -vLR --preserve=all {scripts_path} scripts/")
         print(f"Scripts copied to {os.getcwd()}/scripts")
 
     def cleanup(self) -> None:

From 9bedf2cb3ce8dfb4bede2c715ae3de0347658ad5 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 13:12:25 -0700
Subject: [PATCH 05/19] ls -l pwd instead

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 312f3306..edf07b4f 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -1006,7 +1006,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
                             run_details.performance = multiple_results
 
                             self.console.sh("pwd")
-                            self.console.sh("ls -l /myworkspace")
+                            self.console.sh("ls -l")
                             self.console.sh(f"cat {multiple_results}")
 
                             # check the file of multiple results, check the columns of 'model,performance,metric'

From 2384b2f253af9eb502136c19de5b7b1d88fbb5ba Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 14:43:24 -0700
Subject: [PATCH 06/19] install jq before using it

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index edf07b4f..09bea62e 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -616,6 +616,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
+            self.console.sh("apt-get update && apt-get install -y jq")
             run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 

From 39ddf7c6e9fdcb00f013aa6e15df413c04d88092 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 17:02:11 -0700
Subject: [PATCH 07/19] install apt-transport-https first

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 09bea62e..50bb34a6 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -616,7 +616,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
-            self.console.sh("apt-get update && apt-get install -y jq")
+            self.console.sh("apt-get install -y apt-transport-https && apt-get update && apt-get install -y jq")
             run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 

From 31cf4d11936936f70c4c62341e448562fb89f38b Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 17:06:17 -0700
Subject: [PATCH 08/19] test using -y option

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 50bb34a6..29555337 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -616,7 +616,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
-            self.console.sh("apt-get install -y apt-transport-https && apt-get update && apt-get install -y jq")
+            self.console.sh("apt-get -y update && apt-get install -y jq")
             run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 

From f3d86dd14760367edecb7c249255b9cd74e7b7c3 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 9 Jun 2025 17:21:06 -0700
Subject: [PATCH 09/19] use sudo

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 29555337..fb01a953 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -616,7 +616,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
-            self.console.sh("apt-get -y update && apt-get install -y jq")
+            self.console.sh("sudo apt-get -y update && sudo apt-get install -y jq")
             run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 

From 29a315d340fbb1464efb82740b5948721e2686b4 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Tue, 10 Jun 2025 08:47:24 -0700
Subject: [PATCH 10/19] use . as cp destination

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index fb01a953..2c61c4ce 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -342,7 +342,7 @@ def copy_scripts(self) -> None:
         scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts")
         print(f"Package path: {scripts_path}")
         # copy the scripts to the model directory
-        self.console.sh(f"cp -vLR --preserve=all {scripts_path} scripts/")
+        self.console.sh(f"cp -vLR --preserve=all {scripts_path} .")
         print(f"Scripts copied to {os.getcwd()}/scripts")
 
     def cleanup(self) -> None:

From 75a9735994429da9deca163ab94e0bfb1e1d59c7 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Tue, 10 Jun 2025 10:09:43 -0700
Subject: [PATCH 11/19] reset gpu before start

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 2c61c4ce..ef83b15a 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -717,6 +717,8 @@ def run_model_impl(
 
             # echo gpu smi info
             if gpu_vendor.find("AMD") != -1:
+                for i in range(int(info["n_gpus"])):
+                    model_docker.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true")
                 smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true")
             elif gpu_vendor.find("NVIDIA") != -1:
                 smi = model_docker.sh("/usr/bin/nvidia-smi || true")

From 9adc7010a80d791041f3cc48346c0cff35c9c959 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Tue, 10 Jun 2025 10:20:15 -0700
Subject: [PATCH 12/19] reset gpu again

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index ef83b15a..03e92c5b 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -201,6 +201,8 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None:
         gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"]
         # show gpu info
         if gpu_vendor.find("AMD") != -1:
+            for i in range(int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"])):
+                self.console.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true")
             self.console.sh("/opt/rocm/bin/rocm-smi || true")
         elif gpu_vendor.find("NVIDIA") != -1:
             self.console.sh("nvidia-smi -L || true")

From 18e374d4a8f324e28aa6e2cacef30e2a05992e51 Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Tue, 10 Jun 2025 11:20:13 -0700
Subject: [PATCH 13/19] remove gpu resets

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 03e92c5b..2c61c4ce 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -201,8 +201,6 @@ def clean_up_docker_container(self, is_cleaned: bool = False) -> None:
         gpu_vendor = self.context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"]
         # show gpu info
         if gpu_vendor.find("AMD") != -1:
-            for i in range(int(self.context.ctx["docker_env_vars"]["MAD_SYSTEM_NGPUS"])):
-                self.console.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true")
             self.console.sh("/opt/rocm/bin/rocm-smi || true")
         elif gpu_vendor.find("NVIDIA") != -1:
             self.console.sh("nvidia-smi -L || true")
@@ -719,8 +717,6 @@ def run_model_impl(
 
             # echo gpu smi info
             if gpu_vendor.find("AMD") != -1:
-                for i in range(int(info["n_gpus"])):
-                    model_docker.sh(f"sudo /opt/rocm/bin/rocm-smi --gpureset -d {i} || true")
                 smi = model_docker.sh("/opt/rocm/bin/rocm-smi || true")
             elif gpu_vendor.find("NVIDIA") != -1:
                 smi = model_docker.sh("/usr/bin/nvidia-smi || true")

From 852f3f9e5363a72fcfa3af37afa1c6a57515b40f Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Tue, 10 Jun 2025 11:49:16 -0700
Subject: [PATCH 14/19] remove debug logging

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 2c61c4ce..62a0d403 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -1006,10 +1006,6 @@ def run_model(self, model_info: typing.Dict) -> bool:
                         if multiple_results:
                             run_details.performance = multiple_results
 
-                            self.console.sh("pwd")
-                            self.console.sh("ls -l")
-                            self.console.sh(f"cat {multiple_results}")
-
                             # check the file of multiple results, check the columns of 'model,performance,metric'
                             with open(multiple_results, 'r') as f:
                                 header = f.readline().strip().split(',')

From b04122b1ec543da83e552768d05fffffaf8adc3e Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Fri, 13 Jun 2025 08:59:46 -0700
Subject: [PATCH 15/19] add additional docker run option

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 85a7cc82..be30fa52 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -82,6 +82,7 @@ class RunDetails:
         data_size (str): The size of the data.
         data_download_duration (str): The duration of data download.
         build_number (str): The CI build number.
+        additional_docker_run_options (str): The additional options used for docker run.
     """
 
     # Avoiding @property for ease of code, add if needed.
@@ -110,6 +111,7 @@ def __init__(self):
         self.data_size = ""
         self.data_download_duration = ""
         self.build_number = ""
+        self.additional_docker_run_options = ""
 
     def print_perf(self):
         """Print the performance results of a model.
@@ -691,6 +693,7 @@ def run_model_impl(
         # Must set env vars and mounts at the end
         docker_options += self.get_env_arg(run_env)
         docker_options += self.get_mount_arg(mount_datapaths)
+        docker_options += f" {run_details.additional_docker_run_options}"
 
         print(docker_options)
 
@@ -900,6 +903,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
         run_details.training_precision = model_info["training_precision"]
         run_details.args = model_info["args"]
         run_details.tags = model_info["tags"]
+        run_details.additional_docker_run_options = model_info["additional_docker_run_options"]
         # gets pipeline variable from jenkinsfile, default value is none
         run_details.pipeline = os.environ.get("pipeline")
         # Taking gpu arch from context assumes the host image and container have the same gpu arch.

From 5697fce263b85d1d76a4da9854e431810644abae Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Fri, 13 Jun 2025 14:54:13 -0700
Subject: [PATCH 16/19] make additional_docker_run_options optional

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index be30fa52..03cdd3dd 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -903,7 +903,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
         run_details.training_precision = model_info["training_precision"]
         run_details.args = model_info["args"]
         run_details.tags = model_info["tags"]
-        run_details.additional_docker_run_options = model_info["additional_docker_run_options"]
+        run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "")
         # gets pipeline variable from jenkinsfile, default value is none
         run_details.pipeline = os.environ.get("pipeline")
         # Taking gpu arch from context assumes the host image and container have the same gpu arch.
@@ -911,12 +911,12 @@ def run_model(self, model_info: typing.Dict) -> bool:
         run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"]
 
         # Check if model is deprecated
-        if model_info.get("is_deprecated", False): 
-            print(f"WARNING: Model {model_info['name']} has been deprecated.") 
-            if self.args.ignore_deprecated_flag:  
-                print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.")  
-            else:  
-                print(f"WARNING: Skipping execution. No bypass flags mentioned.")  
+        if model_info.get("is_deprecated", False):
+            print(f"WARNING: Model {model_info['name']} has been deprecated.")
+            if self.args.ignore_deprecated_flag:
+                print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.")
+            else:
+                print(f"WARNING: Skipping execution. No bypass flags mentioned.")
                 return True  # exit early
 
         # check if model is supported on current gpu architecture, if not skip.

From 48f8940c89f2a412ddc3dff6ef08879936f72b0d Mon Sep 17 00:00:00 2001
From: Gene Su <Gene.Su@amd.com>
Date: Mon, 16 Jun 2025 12:49:41 -0700
Subject: [PATCH 17/19] add additional_docker_run_options to the output file

Signed-off-by: Gene Su <Gene.Su@amd.com>
---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 03cdd3dd..83a0a6a1 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -1145,7 +1145,7 @@ def run(self) -> bool:
         # create performance csv
         if not os.path.exists(self.args.output):
             file_print(
-                "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number",
+                "model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options",
                 filename=self.args.output,
                 mode="w",
             )

From b0ad39594dbee692dfdaae9373564584cf4dd93d Mon Sep 17 00:00:00 2001
From: Rohan138 <rohanpotdar138@gmail.com>
Date: Fri, 20 Jun 2025 18:47:39 -0500
Subject: [PATCH 18/19] drop jq and use docker inspect instead

---
 src/madengine/tools/run_models.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 83a0a6a1..fe26aaa1 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -618,8 +618,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
-            self.console.sh("sudo apt-get -y update && sudo apt-get install -y jq")
-            run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
+            run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.docker_image + " | cut -d '@' -f 2")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 
         else:

From 0f86795e82c6c143abfc8710444d4c60412bbd26 Mon Sep 17 00:00:00 2001
From: Rohan138 <rohanpotdar138@gmail.com>
Date: Fri, 20 Jun 2025 18:53:57 -0500
Subject: [PATCH 19/19] docker inspect base_docker instead of docker_image

---
 src/madengine/tools/run_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index fe26aaa1..102a8542 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -618,7 +618,7 @@ def run_model_impl(
             print(f"BASE DOCKER is {run_details.base_docker}")
 
             # print base docker image digest
-            run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.docker_image + " | cut -d '@' -f 2")
+            run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.base_docker + " | cut -d '@' -f 2")
             print(f"BASE DOCKER SHA is {run_details.docker_sha}")
 
         else: