98 changes: 51 additions & 47 deletions src/madengine/tools/run_models.py
@@ -82,6 +82,7 @@ class RunDetails:
data_size (str): The size of the data.
data_download_duration (str): The duration of data download.
build_number (str): The CI build number.
additional_docker_run_options (str): The additional options used for docker run.
"""

# Avoiding @property for ease of code, add if needed.
@@ -110,6 +111,7 @@ def __init__(self):
self.data_size = ""
self.data_download_duration = ""
self.build_number = ""
self.additional_docker_run_options = ""

def print_perf(self):
"""Print the performance results of a model.
@@ -258,16 +260,16 @@ def get_build_arg(self, run_build_arg: typing.Dict = {}) -> str:
return build_args

def apply_tools(
self,
pre_encapsulate_post_scripts: typing.Dict,
run_env: typing.Dict
) -> None:
"""Apply tools to the model.

Args:
pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
run_env: The run environment.

Raises:
Exception: An error occurred while applying tools to the model.
"""
@@ -311,38 +313,38 @@ def apply_tools(
)

def gather_system_env_details(
self,
pre_encapsulate_post_scripts: typing.Dict,
model_name: str
) -> None:
"""Gather system environment details.

Args:
pre_encapsulate_post_scripts: The pre, encapsulate and post scripts.
model_name: The model name.

Returns:
None

Raises:
Exception: An error occurred while gathering system environment details.

Note:
This function is used to gather system environment details.
"""
# initialize pre_env_details
pre_env_details = {}
pre_env_details["path"] = "scripts/common/pre_scripts/run_rocenv_tool.sh"
pre_env_details["args"] = model_name.replace("/", "_") + "_env"
pre_encapsulate_post_scripts["pre_scripts"].append(pre_env_details)
print(f"pre encap post scripts: {pre_encapsulate_post_scripts}")

def copy_scripts(self) -> None:
"""Copy scripts to the model directory."""
scripts_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "scripts")
print(f"Package path: {scripts_path}")
# copy the scripts to the model directory
self.console.sh(f"cp -vLR --preserve=all {scripts_path}/* scripts/")
self.console.sh(f"cp -vLR --preserve=all {scripts_path} .")
print(f"Scripts copied to {os.getcwd()}/scripts")

def cleanup(self) -> None:
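The `copy_scripts` change above switches from copying the contents of the packaged `scripts` directory into an existing `scripts/` folder to copying the directory itself into the current working directory. A minimal standalone sketch of the same idea using `shutil` instead of shelling out to `cp`; the helper name and arguments are illustrative, not part of the tool:

```python
import os
import shutil

def copy_packaged_scripts(package_root: str, dest_dir: str = ".") -> str:
    """Copy the packaged `scripts` tree into dest_dir as `dest_dir/scripts`.

    Illustrative only; run_models.py itself shells out to
    `cp -vLR --preserve=all <package scripts dir> .`.
    """
    src = os.path.join(package_root, "scripts")
    dst = os.path.join(dest_dir, "scripts")
    # dirs_exist_ok merges into an existing scripts/ directory instead of failing.
    shutil.copytree(src, dst, dirs_exist_ok=True)
    return dst
```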
@@ -367,18 +369,18 @@ def cleanup(self) -> None:
self.console.sh("rm -rf scripts/common/post_scripts")
if os.path.exists("scripts/common/tools"):
# remove the scripts/common/tools directory
self.console.sh("rm -rf scripts/common/tools")
print(f"scripts/common directory has been cleaned up.")
self.console.sh("rm -rf scripts/common/tools")
print(f"scripts/common directory has been cleaned up.")

def get_gpu_arg(self, requested_gpus: str) -> str:
"""Get the GPU arguments.

Args:
requested_gpus: The requested GPUs.

Returns:
str: The GPU arguments.

Raises:
RuntimeError: An error occurred while getting the GPU arguments.
"""
@@ -438,10 +440,10 @@ def get_gpu_arg(self, requested_gpus: str) -> str:

def get_cpu_arg(self) -> str:
"""Get the CPU arguments.

Returns:
str: The CPU arguments.

Raises:
RuntimeError: An error occurred while getting the CPU arguments.
"""
@@ -455,13 +457,13 @@ def get_cpu_arg(self) -> str:

def get_env_arg(self, run_env: typing.Dict) -> str:
"""Get the environment arguments.

Args:
run_env: The run environment.

Returns:
str: The environment arguments.

Raises:
RuntimeError: An error occurred while getting the environment arguments.
"""
@@ -483,13 +485,13 @@ def get_env_arg(self, run_env: typing.Dict) -> str:

def get_mount_arg(self, mount_datapaths: typing.List) -> str:
"""Get the mount arguments.

Args:
mount_datapaths: The mount data paths.

Returns:
str: The mount arguments.

Raises:
RuntimeError: An error occurred while getting the mount arguments.
"""
@@ -571,7 +573,7 @@ def run_model_impl(

use_cache_str = ""
if self.args.clean_docker_cache:
use_cache_str = "--no-cache"

# build docker container
print(f"Building Docker image...")
@@ -616,8 +618,8 @@ def run_model_impl(
print(f"BASE DOCKER is {run_details.base_docker}")

# print base docker image digest
run_details.docker_sha = self.console.sh("docker manifest inspect " + run_details.base_docker + " -v | jq '.Descriptor.digest' | sed 's/\"//g' ")
print(f"BASE DOCKER SHA is {run_details.docker_sha}")
run_details.docker_sha = self.console.sh("docker inspect --format='{{index .RepoDigests 0}}' " + run_details.base_docker + " | cut -d '@' -f 2")
print(f"BASE DOCKER SHA is {run_details.docker_sha}")

else:
container_name = "container_" + self.context.ctx["MAD_CONTAINER_IMAGE"].replace("/", "_").replace(":", "_")
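The digest lookup above moves from querying the registry (`docker manifest inspect ... | jq`) to reading the locally pulled image's `RepoDigests` entry and keeping the part after `@`. A rough standalone equivalent in Python, assuming the image has already been pulled so `RepoDigests` is populated (the helper name is hypothetical):

```python
import subprocess

def get_image_digest(image: str) -> str:
    """Return the sha256 digest of a locally available image via RepoDigests."""
    repo_digest = subprocess.run(
        ["docker", "inspect", "--format", "{{index .RepoDigests 0}}", image],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # RepoDigests entries look like "repo/name@sha256:<hex>"; keep the digest part.
    return repo_digest.split("@", 1)[1]
```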
@@ -653,7 +655,7 @@ def run_model_impl(
# get docker run options
docker_options += "--env MAD_MODEL_NAME='" + info["name"] + "' "
# Since we are doing Jenkins level environment collection in the docker container, pass in the jenkins build number.
docker_options += f"--env JENKINS_BUILD_NUMBER='{os.environ.get('BUILD_NUMBER','0')}' "

# gather data
# TODO: probably can use context.ctx instead of another dictionary like run_env here
@@ -690,8 +692,9 @@ def run_model_impl(
# Must set env vars and mounts at the end
docker_options += self.get_env_arg(run_env)
docker_options += self.get_mount_arg(mount_datapaths)
docker_options += f" {run_details.additional_docker_run_options}"

print(docker_options)

# get machine name
run_details.machine_name = self.console.sh("hostname")
@@ -720,7 +723,7 @@ def run_model_impl(
elif gpu_vendor.find("NVIDIA") != -1:
smi = model_docker.sh("/usr/bin/nvidia-smi || true")
else:
raise RuntimeError("Unable to determine gpu vendor.")

# clean up previous model run
model_dir = "run_directory"
@@ -755,7 +758,7 @@ def run_model_impl(
else: # http or https
model_docker.sh("git clone -c credential.helper='!f() { echo username=" + self.creds[ info["cred"] ]["username"] + \
"; echo password=" + self.creds[ info["cred"] ]["password"] + "; };f' " + \
info['url'], timeout=240, secret="git clone " + info['url'] )
else:
model_docker.sh("git clone " + info["url"], timeout=240)

@@ -795,7 +798,7 @@ def run_model_impl(
commit = model_docker.sh("cd "+ dir_path +"; git rev-parse HEAD || true ")
print("======================================================")
print("MODEL REPO COMMIT: ", commit )
print("======================================================")
print("======================================================")

# copy scripts to model directory
model_docker.sh("cp -vLR --preserve=all "+ dir_path +"/. "+ model_dir +"/")
@@ -827,7 +830,7 @@ def run_model_impl(
print(f"Build Info::{selected_data_provider}")

# keep model_dir as universally rw
model_docker.sh("chmod -R a+rw " + model_dir)
model_docker.sh("chmod -R a+rw " + model_dir)

# run model
test_start_time = time.time()
@@ -875,7 +878,7 @@ def run_model_impl(
print("keep_alive is specified; model_dir(" + model_dir + ") is not removed")

# explicitly delete model docker to stop the container, without waiting for the in-built garbage collector
del model_docker

def run_model(self, model_info: typing.Dict) -> bool:
"""Run model on container.
@@ -899,19 +902,20 @@ def run_model(self, model_info: typing.Dict) -> bool:
run_details.training_precision = model_info["training_precision"]
run_details.args = model_info["args"]
run_details.tags = model_info["tags"]
run_details.additional_docker_run_options = model_info.get("additional_docker_run_options", "")
# gets pipeline variable from jenkinsfile, default value is none
run_details.pipeline = os.environ.get("pipeline")
# Taking gpu arch from context assumes the host image and container have the same gpu arch.
# Environment variable updates for MAD Public CI
run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"]

# Check if model is deprecated
if model_info.get("is_deprecated", False):
print(f"WARNING: Model {model_info['name']} has been deprecated.")
if self.args.ignore_deprecated_flag:
print(f"WARNING: Running deprecated model {model_info['name']} due to --ignore-deprecated-flag.")
else:
print(f"WARNING: Skipping execution. No bypass flags mentioned.")
return True # exit early

# check if model is supported on current gpu architecture, if not skip.
@@ -1026,7 +1030,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
if col == '':
run_details.performance = None
print("Error: Performance metric is empty in multiple results file.")
break
else:
perf_regex = ".*performance:\\s*\\([+|-]\?[0-9]*[.]\\?[0-9]*\(e[+|-]\?[0-9]\+\)\?\\)\\s*.*\\s*"
run_details.performance = self.console.sh("cat " + log_file_path +
@@ -1058,7 +1062,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
perf_csv=self.args.output,
)

self.return_status &= (run_details.status == 'SUCCESS')
self.return_status &= (run_details.status == 'SUCCESS')

except Exception as e:
self.return_status = False
@@ -1072,7 +1076,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
update_perf_csv(
exception_result="perf_entry.json",
perf_csv=self.args.output,
)

except Exception as e:
self.return_status = False
@@ -1086,7 +1090,7 @@ def run_model(self, model_info: typing.Dict) -> bool:
update_perf_csv(
exception_result="perf_entry.json",
perf_csv=self.args.output,
)

return self.return_status

@@ -1112,7 +1116,7 @@ def run(self) -> bool:
elif host_os.find("HOST_SLES") != -1:
print(self.console.sh("zypper info rocm-libs"))
elif host_os.find("HOST_AZURE") != -1:
print(self.console.sh("tdnf info rocm-libs"))
print(self.console.sh("tdnf info rocm-libs"))
else:
print("ERROR: Unable to detect host OS.")
self.return_status = False
@@ -1135,12 +1139,12 @@ def run(self) -> bool:
self.copy_scripts()

discover_models = DiscoverModels(args=self.args)
models = discover_models.run()

# create performance csv
if not os.path.exists(self.args.output):
file_print(
"model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number",
"model, n_gpus, training_precision, pipeline, args, tags, docker_file, base_docker, docker_sha, docker_image, git_commit, machine_name, gpu_architecture, performance, metric, relative_change, status, build_duration, test_duration, dataname, data_provider_type, data_size, data_download_duration, build_number, additional_docker_run_options",
filename=self.args.output,
mode="w",
)
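Taken together, the new `additional_docker_run_options` field flows from the model's entry in the models configuration into `RunDetails`, and is appended to the `docker run` options after the environment and mount arguments. A minimal sketch of that flow under the assumption of a dict-shaped model entry; the `--shm-size`/`--ipc` value is purely illustrative:

```python
# Hypothetical model entry; only "additional_docker_run_options" is the new field.
model_info = {
    "name": "example_model",
    "additional_docker_run_options": "--shm-size=16g --ipc=host",
}

# Mirrors run_model(): default to an empty string when the field is absent.
additional_docker_run_options = model_info.get("additional_docker_run_options", "")

# Mirrors run_model_impl(): env and mount args come first, extra options are appended last.
docker_options = "--env MAD_MODEL_NAME='" + model_info["name"] + "' "
docker_options += f" {additional_docker_run_options}"

print(docker_options)
```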