98 changes: 51 additions & 47 deletions src/madengine/tools/run_models.py
@@ -82,6 +82,7 @@ class RunDetails:
data_size (str): The size of the data.
data_download_duration (str): The duration of data download.
build_number (str): The CI build number.
additional_docker_run_options (str): The additional options used for docker run.
"""

# Avoiding @property for ease of code, add if needed.
@@ -110,6 +111,7 @@ def __init__(self):
self.data_size = ""
self.data_download_duration = ""
self.build_number = ""
self.additional_docker_run_options = ""

def print_perf(self):
"""Print the performance results of a model.
@@ -258,16 +260,16 @@ def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str:
return build_args

def apply_tools(
self,
pre_encapsulate_post_scripts: typing.Dict,
run_env: typing.Dict
) -> None:
"""Apply tools to the model.

Args:
pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
run_env: The run environment.

Raises:
Exception: An error occurred while applying tools to the model.
"""
@@ -311,38 +313,38 @@ def apply_tools(
)

def gather_system_env_details(
self,
pre_encapsulate_post_scripts: typing.Dict,
model_name: str
) -> None:
"""Gather system environment details.

Args:
pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
model_name: The model name.

Returns:
None

Raises:
Exception: An error occurred while gathering system environment details.

Note:
This function is used to gather system environment details.
"""
# initialize pre_env_details
pre_env_details = {}
pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh"
pre_env_details["args"] = model_name.replace("/", "_") + "_env"
pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details)
print(f"pre encap post scripts: {pre_encapsulate_post_scripts}")

def copy_scripts(self) -> None:
"""Copy scripts to the model directory."""
scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts")
print(f"Package path: {scripts_path}")
# copy the scripts to the model directory
self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/")
self.console.sh(f"cp -vLR --preserve=all {scripts_path} .")
print(f"Scripts copied to {os.getcwd()}/scripts")

def cleanup(self) -> None:
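The `copy_scripts` change above switches from copying the contents of the packaged `scripts` directory into an existing `scripts/` folder to copying the directory itself into the current working directory. A minimal standalone sketch of the same idea using `shutil` instead of shelling out to `cp`; the helper name and arguments are illustrative, not part of the tool:

```python
import os
import shutil

def copy_packaged_scripts(package_root: str, dest_dir: str = ".") -> str:
    """Copy the packaged `scripts` tree into dest_dir as `dest_dir/scripts`.

    Illustrative only; run_models.py itself shells out to
    `cp -vLR --preserve=all <package scripts dir> .`.
    """
    src = os.path.join(package_root, "scripts")
    dst = os.path.join(dest_dir, "scripts")
    # dirs_exist_ok merges into an existing scripts/ directory instead of failing.
    shutil.copytree(src, dst, dirs_exist_ok=True)
    return dst
```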
@@ -367,18 +369,18 @@ def cleanup(self) -> None:
self.console.sh("rm -rf scripts/common/post_scripts")
if os.path.exists("scripts/common/tools"):
# remove the scripts/common/tools directory
self.console.sh("rm -rf scripts/common/tools")
print(f"scripts/common directory has been cleaned up.")
self.console.sh("rm -rf scripts/common/tools")
print(f"scripts/common directory has been cleaned up.")

def get_gpu_arg(self, requested_gpus: str) -> str:
"""Get the GPU arguments.

Args:
requested_gpus: The requested GPUs.

Returns:
str: The GPU arguments.

Raises:
RuntimeError: An error occurred while getting the GPU arguments.
"""
@@ -438,10 +440,10 @@ def get_gpu_arg(self, requested_gpus: str) -> str:

def get_cpu_arg(self) -> str:
"""Get the CPU arguments.

Returns:
str: The CPU arguments.

Raises:
RuntimeError: An error occurred while getting the CPU arguments.
"""
@@ -455,13 +457,13 @@ def get_cpu_arg(self) -> str:

def get_env_arg(self, run_env: typing.Dict) -> str:
"""Get the environment arguments.

Args:
run_env: The run environment.

Returns:
str: The environment arguments.

Raises:
RuntimeError: An error occurred while getting the environment arguments.
"""
@@ -483,13 +485,13 @@ def get_env_arg(self, run_env: typing.Dict) -> str:

def get_mount_arg(self, mount_datapaths: typing.List) -> str:
"""Get the mount arguments.

Args:
mount_datapaths: The mount data paths.

Returns:
str: The mount arguments.

Raises:
RuntimeError: An error occurred while getting the mount arguments.
"""
@@ -571,7 +573,7 @@ def run_model_impl(

use_cache_str = ""
if self.args.clean_docker_cache:
use_cache_str = "--no-cache"

# build docker container
print(f"Building Docker image...")
@@ -616,8 +618,8 @@ def run_model_impl(
print(f"BASE DOCKER is {run_details.base_docker}")

# print base docker image digest
run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
print(f"BASE DOCKER SHA is {run_details.docker_sha}")
run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.base_docker + " | cut -d '@' -f 2")
print(f"BASE DOCKER SHA is {run_details.docker_sha}")

else:
container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_")
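The digest lookup above moves from querying the registry (`docker manifest inspect ... | jq`) to reading the locally pulled image's `RepoDigests` entry and keeping the part after `@`. A rough standalone equivalent in Python, assuming the image has already been pulled so `RepoDigests` is populated (the helper name is hypothetical):

```python
import subprocess

def get_image_digest(image: str) -> str:
    """Return the sha256 digest of a locally available image via RepoDigests."""
    repo_digest = subprocess.run(
        ["docker", "inspect", "--format", "{{index .RepoDigests 0}}", image],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # RepoDigests entries look like "repo/name@sha256:<hex>"; keep the digest part.
    return repo_digest.split("@", 1)[1]
```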
@@ -653,7 +655,7 @@ def run_model_impl(
# get docker run options
docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' "
# Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number.
docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' "

# gather data
# TODO: probably can use context.ctx instead of another dictionary like run_env here
@@ -690,8 +692,9 @@ def run_model_impl(
# Must set env vars and mounts at the end
docker_options += self.get_env_arg(run_env)
docker_options += self.get_mount_arg(mount_datapaths)
docker_options += f" {run_details.additional_docker_run_options}"

print(docker_options)

# get machine name
run_details.machine_name = self.console.sh("hostname")
@@ -720,7 +723,7 @@ def run_model_impl(
elif gpu_vendor.find("NVIDIA") != -1:
smi = model_docker.sh("/usr/bin/nvidia-smi || true")
else:
raise RuntimeError("Unable to determine gpu vendor.")

# clean up previous model run
model_dir = "run_directory"
@@ -755,7 +758,7 @@ def run_model_impl(
else: # http or https
model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \
"; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \
info['url'], timeout=240, secret="git clone " + info['url'] )
else:
model_docker.sh("git clone " + info["url"], timeout=240)

@@ -795,7 +798,7 @@ def run_model_impl(
commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true ")
print("======================================================")
print("MODEL REPO COMMIT: ", commit )
print("======================================================")
print("======================================================")

# copy scripts to model directory
model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/")
@@ -827,7 +830,7 @@ def run_model_impl(
print(f"Build Info::{selected_data_provider}")

# keep model_dir as universally rw
model_docker.sh("chmod -R a+rw " + model_dir)
model_docker.sh("chmod -R a+rw " + model_dir)

# run model
test_start_time = time.time()
@@ -875,7 +878,7 @@ def run_model_impl(
print("keep_alive is specified; model_dir(" + model_dir + ") is not removed")

# explicitly delete model docker to stop the container, without waiting for the in-built garbage collector
del model_docker

def run_model(self, model_info: typing.Dict) -> bool:
"""Run model on container.
@@ -899,19 +902,20 @@ def run_model(self, model_info: typing.Dict) -> bool:
run_details.training_precision = model_info["training_precision"]
run_details.args = model_info["args"]
run_details.tags = model_info["tags"]
run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "")
# gets pipeline variable from jenkinsfile, default value is none
run_details.pipeline = os.environ.get("pipeline")
# Taking gpu arch from context assumes the host image and container have the same gpu arch.
# Environment variable updates for MAD Public CI
run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"]

# Check if model is deprecated
if model_info.get("is_deprecated", False):
print(f"WARNING: Model {model_info['name']} has been deprecated.")
if self.args.ignore_deprecated_flag:
print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.")
else:
print(f"WARNING: Skipping execution. No bypass flags mentioned.")
return True # exit early

# check if model is supported on current gpu architecture, if not skip.
@@ -1026,7 +1030,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
if col == '':
run_details.performance = None
print("Error: Performance metric is empty in multiple results file.")
break
else:
perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*"
run_details.performance = self.console.sh("cat " + log_file_path +
@@ -1058,7 +1062,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
perf_csv=self.args.output,
)

self.return_status &= (run_details.status == 'SUCCESS')
self.return_status &= (run_details.status == 'SUCCESS')

except Exception as e:
self.return_status = False
@@ -1072,7 +1076,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
update_perf_csv(
exception_result="perf_entry.json",
perf_csv=self.args.output,
)

except Exception as e:
self.return_status = False
@@ -1086,7 +1090,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
update_perf_csv(
exception_result="perf_entry.json",
perf_csv=self.args.output,
)

return self.return_status

@@ -1112,7 +1116,7 @@ def run(self) -> bool:
elif host_os.find("HOST_SLES") != -1:
print(self.console.sh("zypper info rocm-libs"))
elif host_os.find("HOST_AZURE") != -1:
print(self.console.sh("tdnf info rocm-libs"))
print(self.console.sh("tdnf info rocm-libs"))
else:
print("ERROR: Unable to detect host OS.")
self.return_status = False
@@ -1135,12 +1139,12 @@ def run(self) -> bool:
self.copy_scripts()

discover_models = DiscoverModels(args=self.args)
models = discover_models.run()

# create performance csv
if not os.path.exists(self.args.output):
file_print(
"model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number",
"model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options",
filename=self.args.output,
mode="w",
)
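Taken together, the new `additional_docker_run_options` field flows from the model's entry in the models configuration into `RunDetails`, and is appended to the `docker run` options after the environment and mount arguments. A minimal sketch of that flow under the assumption of a dict-shaped model entry; the `--shm-size`/`--ipc` value is purely illustrative:

```python
# Hypothetical model entry; only "additional_docker_run_options" is the new field.
model_info = {
    "name": "example_model",
    "additional_docker_run_options": "--shm-size=16g --ipc=host",
}

# Mirrors run_model(): default to an empty string when the field is absent.
additional_docker_run_options = model_info.get("additional_docker_run_options", "")

# Mirrors run_model_impl(): env and mount args come first, extra options are appended last.
docker_options = "--env MAD_MODEL_NAME='" + model_info["name"] + "' "
docker_options += f" {additional_docker_run_options}"

print(docker_options)
```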