diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/env_tags.json b/src/madengine/scripts/common/pre_scripts/rocEnvTool/env_tags.json new file mode 100644 index 00000000..8965fa00 --- /dev/null +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/env_tags.json @@ -0,0 +1,32 @@ +{ + "env_tags": [ + "os_information", + "cpu_information", + "gpu_information", + "rocm_smi_gpudeviceid", + "memory_information", + "rocm_information", + "rocm_repo_setup", + "rocm_packages_installed", + "rocm_env_variables", + "rocm_smi", + "ifwi_version", + "rocm_smi_showhw", + "rocm_smi_pcie", + "rocm_smi_pids", + "rocm_smi_topology", + "rocm_smi_showserial", + "rocm_smi_showperflevel", + "rocm_smi_showrasinfo", + "rocm_smi_showxgmierr", + "rocm_smi_clocks", + "rocm_smi_showcompute_partition", + "rocm_smi_nodesbwi", + "rocm_info", + "cuda_information", + "cuda_env_variables", + "cuda_packages_installed", + "pip_list", + "numa_balancing" + ] +} diff --git a/src/madengine/scripts/common/tools.json b/src/madengine/scripts/common/tools.json new file mode 100644 index 00000000..afc931e3 --- /dev/null +++ b/src/madengine/scripts/common/tools.json @@ -0,0 +1,113 @@ +{ + "tools": { + "rpd": { + "pre_scripts": [ + { + "path": "scripts/common/pre_scripts/trace.sh", + "args": "rpd" + } + ], + "cmd": "runTracer.sh", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rpd" + } + ] + }, + "rocprof": { + "pre_scripts": [], + "cmd": "rocprof", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/trace.sh", + "args": "rocprof" + } + ] + }, + "rocblas_trace": { + "env_vars": {"ROCBLAS_TRACE": "1"}, + "cmd": "python3 ../scripts/common/tools/get_library_trace.py" + }, + "hipblaslt_trace": { + "env_vars": {"HIPBLASLT_TRACE": "1"}, + "cmd": "python3 ../scripts/common/tools/get_library_trace.py" + }, + "miopen_trace": { + "env_vars": {"MIOPEN_TRACE": "1"}, + "cmd": "python3 
../scripts/common/tools/get_library_trace.py" + }, + "tensile_trace": { + "env_vars": {"TENSILE_TRACE": "1"}, + "cmd": "python3 ../scripts/common/tools/get_library_trace.py" + }, + "rccl_trace": { + "env_vars": {"RCCL_TRACE": "1"}, + "cmd": "python3 ../scripts/common/tools/get_library_trace.py" + }, + "test_tools_A": { + "pre_scripts": [ + { + "path": "scripts/common/test_echo.sh", + "args": "pre_script A" + } + ], + "cmd": "echo cmd_A && ", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/test_echo.sh", + "args": "post_script A" + } + ] + }, + "test_tools_B": { + "pre_scripts": [ + { + "path": "scripts/common/test_echo.sh", + "args": "pre_script B" + } + ], + "cmd": "echo cmd_B && ", + "env_vars": {}, + "post_scripts": [ + { + "path": "scripts/common/test_echo.sh", + "args": "post_script B" + } + ] + }, + "gpu_info_power_profiler": { + "pre_scripts": [ + { + "path": "scripts/common/pre_scripts/gpu_info_pre.sh" + } + ], + "cmd": "python3 ../scripts/common/tools/gpu_info_profiler.py", + "env_vars": {"DEVICE":"all", "SAMPLING_RATE":"0.1", "MODE":"power", "DUAL-GCD":"false"}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/gpu_info_post.sh", + "args": "gpu_info_power_profiler" + } + ] + }, + "gpu_info_vram_profiler": { + "pre_scripts": [ + { + "path": "scripts/common/pre_scripts/gpu_info_pre.sh" + } + ], + "cmd": "python3 ../scripts/common/tools/gpu_info_profiler.py", + "env_vars": {"DEVICE":"all", "SAMPLING_RATE":"0.1", "MODE":"vram", "DUAL-GCD":"false"}, + "post_scripts": [ + { + "path": "scripts/common/post_scripts/gpu_info_post.sh", + "args": "gpu_info_vram_profiler" + } + ] + } + } +} diff --git a/tests/fixtures/dummy/credential.json b/tests/fixtures/dummy/credential.json new file mode 100644 index 00000000..1b8a56df --- /dev/null +++ b/tests/fixtures/dummy/credential.json @@ -0,0 +1,21 @@ +{ + "NAS_NODES": [ + { + "NAME": "default", + "HOST": "localhost", + "PORT": "22", + "USERNAME": "admin", + "PASSWORD": "admin" + } + ], + 
"MAD_AWS_S3": { + "USERNAME": "admin", + "PASSWORD": "admin" + }, + "MAD_MINIO": { + "USERNAME": "admin-access-key", + "PASSWORD": "admin-secret-key", + "MINIO_ENDPOINT": "http://127.0.0.1:9000", + "AWS_ENDPOINT_URL_S3": "http://127.0.0.1:9000" + } +} \ No newline at end of file diff --git a/tests/fixtures/dummy/data.json b/tests/fixtures/dummy/data.json new file mode 100644 index 00000000..2c76f3df --- /dev/null +++ b/tests/fixtures/dummy/data.json @@ -0,0 +1,12 @@ +{ + "dummy_data_local": { + "local": { + "path": "/tmp" + } + }, + "dummy_data_local_fail": { + "local": { + "path": "/tmp/nonexistent" + } + } +} \ No newline at end of file diff --git a/tests/fixtures/dummy/docker/dummy.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..ee2bf723 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy.ubuntu.amd.Dockerfile @@ -0,0 +1,3 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER diff --git a/tests/fixtures/dummy/docker/dummy.ubuntu.nvidia.Dockerfile b/tests/fixtures/dummy/docker/dummy.ubuntu.nvidia.Dockerfile new file mode 100644 index 00000000..bee7bd22 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy.ubuntu.nvidia.Dockerfile @@ -0,0 +1,38 @@ +# CONTEXT {'gpu_vendor': 'NVIDIA', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 +FROM $BASE_DOCKER +USER root +ENV WORKSPACE_DIR=/workspace +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt update && apt install -y \ + unzip \ + jq \ + python3-pip \ + git \ + vim \ + wget \ + openmpi-bin libopenmpi-dev + +ENV PATH="/root/miniconda3/bin:${PATH}" +ARG PATH="/root/miniconda3/bin:${PATH}" +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + mkdir /root/.conda && \ + bash Miniconda3-latest-Linux-x86_64.sh -b && \ + rm -rf Miniconda3-latest-Linux-x86_64.sh + +RUN conda --version && \ + 
conda init +RUN pip install --upgrade pip +RUN pip install typing-extensions +RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121 + +RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null && \ + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ bionic main' | tee /etc/apt/sources.list.d/kitware.list >/dev/null && \ + apt-get update && \ + apt-get install -y cmake + +# record configuration for posterity +RUN pip list diff --git a/tests/fixtures/dummy/docker/dummy_ctxtest.ctx1.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx1.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..fc089932 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx1.ubuntu.amd.Dockerfile @@ -0,0 +1,5 @@ +# CONTEXT {'ctx_test': '1'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +ENV ctxtest=1 diff --git a/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2a.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2a.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..c5908719 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2a.ubuntu.amd.Dockerfile @@ -0,0 +1,5 @@ +# CONTEXT {'ctx_test': '2'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +ENV ctxtest=2 diff --git a/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2b.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2b.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..c5908719 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_ctxtest.ctx2b.ubuntu.amd.Dockerfile @@ -0,0 +1,5 @@ +# CONTEXT {'ctx_test': '2'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +ENV ctxtest=2 diff --git a/tests/fixtures/dummy/docker/dummy_ctxtest.noctx.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/dummy_ctxtest.noctx.ubuntu.amd.Dockerfile new file 
mode 100644 index 00000000..cb28dc45 --- /dev/null +++ b/tests/fixtures/dummy/docker/dummy_ctxtest.noctx.ubuntu.amd.Dockerfile @@ -0,0 +1,5 @@ +# CONTEXT {'ctx_test': 'None'} +ARG BASE_DOCKER=rocm/pytorch +FROM $BASE_DOCKER + +ENV ctxtest=0 diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json new file mode 100644 index 00000000..1ff21c23 --- /dev/null +++ b/tests/fixtures/dummy/models.json @@ -0,0 +1,198 @@ +[ + { + "name": "dummy", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_test_group_1", + "dummy_group_1" + ], + "args": "" + }, + { + "name": "dummy2", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_group_1", + "dummy_group_2" + ], + "args": "", + "timeout": 360 + }, + { + "name": "dummy3", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_group_2" + ], + "args": "" + }, + { + "name": "dummy_timeout", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "", + "timeout": 360 + }, + { + "name": "dummy_sleep", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_sleep.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "300", + "timeout": 120 + }, + { + "name": "dummy_ctxtest", + "dockerfile": "docker/dummy_ctxtest", + "scripts": "scripts/dummy/run_ctxtest.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" + }, + { + "name": "dummy_mountpath", + 
"dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_local.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" + }, + { + "name": "dummy_gpubind", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_gpu_bind.sh", + "n_gpus": "6", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "", + "multiple_results": "results_dummy_gpubind.csv" + }, + { + "name": "dummy_cpubind", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_cpu_bind.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "", + "multiple_results": "results_dummy_cpubind.csv" + }, + { + "name": "dummy_skip_gpu_arch", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "", + "skip_gpu_arch": "gfx908, gfx90a, gfx1100, gfx940, gfx941, gfx942, A100, H100" + }, + { + "name": "dummy_data_local", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_local.sh", + "data": "dummy_data_local", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_data_local_fail", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_data_local.sh", + "data": "dummy_data_local_fail", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_data" + ], + "args": "" + }, + { + "name": "dummy_prof", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_prof.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" + }, + { + "name": "dummy_prof_rccl", + "dockerfile": "docker/dummy", + "scripts": 
"scripts/dummy/run_nccl_trace.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "" + }, + { + "name": "dummy_multi", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_multi.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies" + ], + "args": "", + "multiple_results": "perf_dummy.csv" + } +] diff --git a/tests/fixtures/dummy/scripts/dummy2/models.json b/tests/fixtures/dummy/scripts/dummy2/models.json new file mode 100644 index 00000000..de114986 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy2/models.json @@ -0,0 +1,28 @@ +[ + { + "name": "model1", + "dockerfile": "../../docker/dummy", + "scripts": "run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_test_group_2" + ], + "args": "" + }, + { + "name": "model2", + "dockerfile": "../../docker/dummy", + "scripts": "run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "dummy_test_group_2" + ], + "args": "" + } +] \ No newline at end of file diff --git a/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..5a66c4ec --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.amd.Dockerfile @@ -0,0 +1 @@ +../../../docker/dummy.ubuntu.amd.Dockerfile \ No newline at end of file diff --git a/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.nvidia.Dockerfile b/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.nvidia.Dockerfile new file mode 100644 index 00000000..01029ed7 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy3/docker/dummy.ubuntu.nvidia.Dockerfile @@ -0,0 +1 @@ +../../../docker/dummy.ubuntu.nvidia.Dockerfile \ No newline at end of file