From 8564e42cd8418f532d1b75002462223002e154a4 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 10 Feb 2025 19:00:07 +0100 Subject: [PATCH 1/3] Account for the fact that nvidia-smi might be installed on a CPU node. The command will exist, but return a non-zero exit when run with e.g. --version because there are no GPU drivers --- EESSI-install-software.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 3a9ba175c9..25057216a3 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -271,11 +271,18 @@ fi # Install NVIDIA drivers in host_injections (if they exist) if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + nvidia-smi --version + ec=$? + if [ ${ec} -eq 0 ]; then + echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + fi fi - if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources echo ">> Using ${shared_eb_sourcepath} as shared EasyBuild source path" From a285a1cb6cdb691a20bec58760156916ce9816cd Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 10 Feb 2025 19:01:41 +0100 Subject: [PATCH 2/3] Account for the fact that nvidia-smi might be installed on a CPU node. 
The command will exist, but return a non-zero exit when run with e.g. --version because there are no GPU drivers --- bot/build.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/bot/build.sh b/bot/build.sh index 29444a32c2..d904a020e2 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -243,14 +243,28 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR} # prepare arguments to eessi_container.sh specific to build step BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") + # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found, using available GPU" - BUILD_STEP_ARGS+=("--nvidia" "all") + # Accept that this may fail + set +e + nvidia-smi --version + ec=$? + set -e + if [ ${ec} -eq 0 ]; then + echo "Command 'nvidia-smi' found, using available GPU" + BUILD_STEP_ARGS+=("--nvidia" "all") + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + BUILD_STEP_ARGS+=("--nvidia" "install") + fi else echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" BUILD_STEP_ARGS+=("--nvidia" "install") fi + # Retain location for host injections so we don't reinstall CUDA # (Always need to run the driver installation as available driver may change) if [[ ! -z ${SHARED_FS_PATH} ]]; then From e6f89cc8cd2eeaa422b3980f251fc0985c5bb19b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 10 Feb 2025 19:03:33 +0100 Subject: [PATCH 3/3] Account for the fact that nvidia-smi might be installed on a CPU node. 
The command will exist, but return a non-zero exit when run with e.g. --version because there are no GPU drivers --- bot/test.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bot/test.sh b/bot/test.sh index 464c4817a9..2b1d98c488 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -214,8 +214,19 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found, using available GPU" - TEST_STEP_ARGS+=("--nvidia" "run") + # Accept that this may fail + set +e + nvidia-smi --version + ec=$? + set -e + if [ ${ec} -eq 0 ]; then + echo "Command 'nvidia-smi' found, using available GPU" + TEST_STEP_ARGS+=("--nvidia" "run") + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + fi fi # prepare arguments to test_suite.sh (specific to test step)