From 6674bc3edbd3d2187f7b10fd40333949a5575bc8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 31 Jan 2025 20:02:53 +0100 Subject: [PATCH 01/30] Add pmt for H100 to test eessi bot on Snellius --- .../nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml new file mode 100644 index 0000000000..0bf49b79dc --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -0,0 +1,2 @@ +easyconfigs: + - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb From cfcadfdd45e809f95c98508e501e7f19546139b7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 31 Jan 2025 23:43:35 +0100 Subject: [PATCH 02/30] Add CUDA explicitly, since we also need the runtime part to be installed, and we need to accept the eula --- .../nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml index 0bf49b79dc..fbb9203fd2 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -1,2 +1,5 @@ easyconfigs: + - CUDA-12.1.1.eb: + options: + accept-eula-for: CUDA - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb From 705db14b55f9716d12f10852ffc3d1fd1745f581 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 4 Feb 2025 23:54:21 +0100 Subject: [PATCH 03/30] Need to strip single 
quotes --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 29444a32c2..c204e7684c 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -115,7 +115,7 @@ mkdir -p ${SINGULARITY_TMPDIR} # load modules if LOAD_MODULES is not empty if [[ ! -z ${LOAD_MODULES} ]]; then - for mod in $(echo ${LOAD_MODULES} | tr ',' '\n') + for mod in $(echo ${LOAD_MODULES} | tr "'" | tr ',' '\n') do echo "bot/build.sh: loading module '${mod}'" module load ${mod} From d039321b868130ca8bf7fd800a8cb1c01a4ac34b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 4 Feb 2025 23:56:02 +0100 Subject: [PATCH 04/30] Need to strip single quotes --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index c204e7684c..450fa4dbb0 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -115,7 +115,7 @@ mkdir -p ${SINGULARITY_TMPDIR} # load modules if LOAD_MODULES is not empty if [[ ! -z ${LOAD_MODULES} ]]; then - for mod in $(echo ${LOAD_MODULES} | tr "'" | tr ',' '\n') + for mod in $(echo ${LOAD_MODULES} | tr -d "'" | tr ',' '\n') do echo "bot/build.sh: loading module '${mod}'" module load ${mod} From afddc248d7b71024d077dbb833925d4bce023cab Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 00:01:13 +0100 Subject: [PATCH 05/30] It still didn't get expanded over multiple entries. Better make an explicit array out of it first --- bot/build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 450fa4dbb0..f6e7565590 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -115,7 +115,8 @@ mkdir -p ${SINGULARITY_TMPDIR} # load modules if LOAD_MODULES is not empty if [[ ! 
-z ${LOAD_MODULES} ]]; then - for mod in $(echo ${LOAD_MODULES} | tr -d "'" | tr ',' '\n') + IFS=',' read -r -a modules <<< "$(echo "${LOAD_MODULES}")" + for mod in "${modules[@]}"; do echo "bot/build.sh: loading module '${mod}'" module load ${mod} From 63efcdcc7e7f86c084086b0432290ab6f1933d15 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 02:18:26 +0100 Subject: [PATCH 06/30] Use bash array when looping over loadable modules in test.sh --- bot/test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bot/test.sh b/bot/test.sh index 464c4817a9..fd6c4de040 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -135,7 +135,8 @@ mkdir -p ${SINGULARITY_TMPDIR} # load modules if LOAD_MODULES is not empty if [[ ! -z ${LOAD_MODULES} ]]; then - for mod in $(echo ${LOAD_MODULES} | tr ',' '\n') + IFS=',' read -r -a modules <<< "$(echo "${LOAD_MODULES}")" + for mod in "${modules[@]}"; do echo "bot/test.sh: loading module '${mod}'" module load ${mod} From 0e17f3213e070659c556961861448bda551843ae Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 03:25:22 +0100 Subject: [PATCH 07/30] Extra echo's for debugging --- EESSI-install-software.sh | 3 +++ bot/build.sh | 2 ++ create_tarball.sh | 1 + 3 files changed, 6 insertions(+) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 3a9ba175c9..2dbcd16f82 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -103,6 +103,9 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/scripts/utils.sh + +echo "$TMPDIR in EESSI-install-software.sh: $TMPDIR" + # honor $TMPDIR if it is already defined, use /tmp otherwise if [ -z $TMPDIR ]; then export WORKDIR=/tmp/$USER diff --git a/bot/build.sh b/bot/build.sh index f6e7565590..e6e6a83467 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -261,6 +261,7 @@ fi # create tmp file for output of build step build_outerr=$(mktemp build.outerr.XXXX) +echo "TMPDIR when calling build software: $TMPDIR" echo "Executing 
command to build software:" echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${BUILD_STEP_ARGS[@]}" echo " -- $software_layer_dir/install_software_layer.sh \"${INSTALL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${build_outerr}" @@ -298,6 +299,7 @@ export TGZ=$(printf "eessi-%s-software-%s-%s-%d.tar.gz" ${EESSI_VERSION} ${EESSI # TODO should we make this a configurable parameter of eessi_container.sh using # /tmp as default? TMP_IN_CONTAINER=/tmp +echo "TMPDIR when calling create tarball: $TMPDIR" echo "Executing command to create tarball:" echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${TARBALL_STEP_ARGS[@]}" echo " -- $software_layer_dir/create_tarball.sh ${TMP_IN_CONTAINER} ${EESSI_VERSION} ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} \"${EESSI_ACCELERATOR_TARGET}\" /eessi_bot_job/${TGZ} 2>&1 | tee -a ${tar_outerr}" diff --git a/create_tarball.sh b/create_tarball.sh index 01f498e1ac..80d931552c 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -14,6 +14,7 @@ cpu_arch_subdir=$3 accel_subdir=$4 target_tgz=$5 +echo "$TMPDIR in create_tarball: $TMPDIR" tmpdir=`mktemp -d` echo ">> tmpdir: $tmpdir" From 30c305b5cd49defdaa85e25bc238477e847804c7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 03:29:19 +0100 Subject: [PATCH 08/30] print TMPDIR as word --- EESSI-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 2dbcd16f82..07402c17e3 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -104,7 +104,7 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/scripts/utils.sh -echo "$TMPDIR in EESSI-install-software.sh: $TMPDIR" +echo "TMPDIR in EESSI-install-software.sh: $TMPDIR" # honor $TMPDIR if it is already defined, use /tmp otherwise if [ -z $TMPDIR ]; then From 190406dad0fc1ae6c3cc0189b2dc40540886d537 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 03:37:33 +0100 Subject: [PATCH 09/30] 
Remove debugging echo's, bind-mount the TMPDIR if it is set --- EESSI-install-software.sh | 3 --- bot/build.sh | 13 +++++++++++-- create_tarball.sh | 1 - 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 07402c17e3..3a9ba175c9 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -103,9 +103,6 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/scripts/utils.sh - -echo "TMPDIR in EESSI-install-software.sh: $TMPDIR" - # honor $TMPDIR if it is already defined, use /tmp otherwise if [ -z $TMPDIR ]; then export WORKDIR=/tmp/$USER diff --git a/bot/build.sh b/bot/build.sh index e6e6a83467..198f703e6e 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -55,6 +55,17 @@ LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp") echo "bot/build.sh: LOCAL_TMP='${LOCAL_TMP}'" # TODO should local_tmp be mandatory? --> then we check here and exit if it is not provided +# Bind mount the current $TMPDIR into the container. +# A call to e.g. 
`mktemp` inside the container would try to use this path - and will fail if we don't bind-mount +if [[ -z ${TMPDIR} ]]; then + if [[ -z ${SINGULARITY_BIND} ]]; then + export SINGULARITY_BIND="${TMPDIR}" + else + export SINGULARITY_BIND="${SINGULARITY_BIND},${TMPDIR}" + fi +fi + + # check if path to copy build logs to is specified, so we can copy build logs for failing builds there BUILD_LOGS_DIR=$(cfg_get_value "site_config" "build_logs_dir") echo "bot/build.sh: BUILD_LOGS_DIR='${BUILD_LOGS_DIR}'" @@ -261,7 +272,6 @@ fi # create tmp file for output of build step build_outerr=$(mktemp build.outerr.XXXX) -echo "TMPDIR when calling build software: $TMPDIR" echo "Executing command to build software:" echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${BUILD_STEP_ARGS[@]}" echo " -- $software_layer_dir/install_software_layer.sh \"${INSTALL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${build_outerr}" @@ -299,7 +309,6 @@ export TGZ=$(printf "eessi-%s-software-%s-%s-%d.tar.gz" ${EESSI_VERSION} ${EESSI # TODO should we make this a configurable parameter of eessi_container.sh using # /tmp as default? 
TMP_IN_CONTAINER=/tmp -echo "TMPDIR when calling create tarball: $TMPDIR" echo "Executing command to create tarball:" echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${TARBALL_STEP_ARGS[@]}" echo " -- $software_layer_dir/create_tarball.sh ${TMP_IN_CONTAINER} ${EESSI_VERSION} ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} \"${EESSI_ACCELERATOR_TARGET}\" /eessi_bot_job/${TGZ} 2>&1 | tee -a ${tar_outerr}" diff --git a/create_tarball.sh b/create_tarball.sh index 80d931552c..01f498e1ac 100755 --- a/create_tarball.sh +++ b/create_tarball.sh @@ -14,7 +14,6 @@ cpu_arch_subdir=$3 accel_subdir=$4 target_tgz=$5 -echo "$TMPDIR in create_tarball: $TMPDIR" tmpdir=`mktemp -d` echo ">> tmpdir: $tmpdir" From 79bdc9b6de2c08aacf96409bab60350bf6ce2b4f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 15:34:12 +0100 Subject: [PATCH 10/30] Add debugging output --- bot/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index 198f703e6e..4e432e1fe1 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -57,6 +57,7 @@ echo "bot/build.sh: LOCAL_TMP='${LOCAL_TMP}'" # Bind mount the current $TMPDIR into the container. # A call to e.g. `mktemp` inside the container would try to use this path - and will fail if we don't bind-mount +echo "bot/build.sh: TMPDIR=${TMDPIR}| if [[ -z ${TMPDIR} ]]; then if [[ -z ${SINGULARITY_BIND} ]]; then export SINGULARITY_BIND="${TMPDIR}" @@ -93,6 +94,8 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then fi fi +echo "DEBUG: SINGULARITY_BIND AFTER APPENDING: ${SINGULARITY_BIND}" + SINGULARITY_CACHEDIR=$(cfg_get_value "site_config" "container_cachedir") echo "bot/build.sh: SINGULARITY_CACHEDIR='${SINGULARITY_CACHEDIR}'" if [[ ! 
-z ${SINGULARITY_CACHEDIR} ]]; then From 06edeb35054decc7cdd5d6170fad79a2a0ba1058 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 15:38:02 +0100 Subject: [PATCH 11/30] Fix typo --- bot/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/build.sh b/bot/build.sh index 4e432e1fe1..a270537a1d 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -57,7 +57,7 @@ echo "bot/build.sh: LOCAL_TMP='${LOCAL_TMP}'" # Bind mount the current $TMPDIR into the container. # A call to e.g. `mktemp` inside the container would try to use this path - and will fail if we don't bind-mount -echo "bot/build.sh: TMPDIR=${TMDPIR}| +echo "bot/build.sh: TMPDIR=${TMDPIR}" if [[ -z ${TMPDIR} ]]; then if [[ -z ${SINGULARITY_BIND} ]]; then export SINGULARITY_BIND="${TMPDIR}" From 16c748f87e42fbb4e068c9e36e517fe1f6db537f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 16:02:21 +0100 Subject: [PATCH 12/30] Check tmpdir early on --- bot/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index a270537a1d..96175ca4d0 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -21,6 +21,8 @@ # stop as soon as something fails set -e +echo "DEBUG: TMPDIR=$TMPDIR" + # Make sure we are referring to software-layer as working directory software_layer_dir=$(dirname $(dirname $(realpath $0))) # source utils.sh and cfg_files.sh From 6960cfda3aec3ecd08956a0dbae6347860b54eb2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 16:46:01 +0100 Subject: [PATCH 13/30] Add more debugging output --- bot/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index 96175ca4d0..25492f2b45 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -29,6 +29,8 @@ software_layer_dir=$(dirname $(dirname $(realpath $0))) source $software_layer_dir/scripts/utils.sh source $software_layer_dir/scripts/cfg_files.sh +echo "DEBUG2: TMPDIR=$TMPDIR" + # defaults export 
JOB_CFG_FILE="${JOB_CFG_FILE_OVERRIDE:=cfg/job.cfg}" HOST_ARCH=$(uname -m) @@ -43,6 +45,8 @@ cat ${JOB_CFG_FILE} echo "bot/build.sh: obtaining configuration settings from '${JOB_CFG_FILE}'" cfg_load ${JOB_CFG_FILE} +echo "DEBUG3: TMPDIR=$TMPDIR" + # if http_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $http_proxy HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy") HTTP_PROXY=${HTTP_PROXY:-${http_proxy}} From 83440477654ca7b69d8b77732ebca23a8738cafc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 19:11:18 +0100 Subject: [PATCH 14/30] More debugging output --- bot/build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index 25492f2b45..0afbecbd19 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -51,13 +51,16 @@ echo "DEBUG3: TMPDIR=$TMPDIR" HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy") HTTP_PROXY=${HTTP_PROXY:-${http_proxy}} echo "bot/build.sh: HTTP_PROXY='${HTTP_PROXY}'" +echo "DEBUG4: TMPDIR=$TMPDIR" # if https_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $https_proxy HTTPS_PROXY=$(cfg_get_value "site_config" "https_proxy") HTTPS_PROXY=${HTTPS_PROXY:-${https_proxy}} echo "bot/build.sh: HTTPS_PROXY='${HTTPS_PROXY}'" +echo "DEBUG5: TMPDIR=$TMPDIR" LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp") +echo "DEBUG6: TMPDIR=$TMPDIR" echo "bot/build.sh: LOCAL_TMP='${LOCAL_TMP}'" # TODO should local_tmp be mandatory? 
--> then we check here and exit if it is not provided From 71c2dc7874050eae1a2dc5daf92fd0e97cea4d69 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 19:47:42 +0100 Subject: [PATCH 15/30] Make sure we unconditionally set TMPDIR, and make sure we also set --storage for the tarball step, so that the TMPDIR eventually is set to this value --- bot/build.sh | 3 +++ eessi_container.sh | 57 +++++++++++++++++++++++----------------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/bot/build.sh b/bot/build.sh index 0afbecbd19..8b2dd85230 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -311,6 +311,9 @@ else TARBALL_STEP_ARGS+=("--resume" "${REMOVAL_TMPDIR}") fi +# Make sure we define storage, so that the TMPDIR is set to this in eessi_container.sh +TARBALL_STEP_ARGS+=("--storage" "${STORAGE}") + timestamp=$(date +%s) # to set EESSI_VERSION we need to source init/eessi_defaults now source $software_layer_dir/init/eessi_defaults diff --git a/eessi_container.sh b/eessi_container.sh index fc97f9877c..d92c63b83f 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -363,40 +363,41 @@ fi # 2. set up host storage/tmp if necessary # if session to be resumed from a previous one (--resume ARG) and ARG is a directory # just reuse ARG, define environment variables accordingly and skip creating a new -# tmp storage +# eessi.XXXXXXXXXXX tempdir within TMPDIR + +# But before we call mktemp, we need to potentially set or create TMPDIR +# as location for temporary data use in the following order +# a. command line argument -l|--host-storage +# b. env var TMPDIR +# c. /tmp +# note, we ensure that (a) takes precedence by setting TMPDIR to STORAGE +# if STORAGE is not empty +# note, (b) & (c) are automatically ensured by using 'mktemp -d --tmpdir' to +# create a temporary directory +if [[ ! -z ${STORAGE} ]]; then + export TMPDIR=${STORAGE} + # mktemp fails if TMPDIR does not exist, so let's create it + mkdir -p ${TMPDIR} +fi +if [[ ! 
-z ${TMPDIR} ]]; then + # TODO check if TMPDIR already exists + # mktemp fails if TMPDIR does not exist, so let's create it + mkdir -p ${TMPDIR} +fi +if [[ -z ${TMPDIR} ]]; then + # mktemp falls back to using /tmp if TMPDIR is empty + # TODO check if /tmp is writable, large enough and usable (different + # features for ro-access and rw-access) + [[ ${VERBOSE} -eq 1 ]] && echo "skipping sanity checks for /tmp" +fi + +# Now, set the EESSI_HOST_STORAGE either baed on the resumed directory, or create a new one with mktemp if [[ ! -z ${RESUME} && -d ${RESUME} ]]; then # resume from directory ${RESUME} # skip creating a new tmp directory, just set environment variables echo "Resuming from previous run using temporary storage at ${RESUME}" EESSI_HOST_STORAGE=${RESUME} else - # we need a tmp location (and possibly init it with ${RESUME} if it was not - # a directory - - # as location for temporary data use in the following order - # a. command line argument -l|--host-storage - # b. env var TMPDIR - # c. /tmp - # note, we ensure that (a) takes precedence by setting TMPDIR to STORAGE - # if STORAGE is not empty - # note, (b) & (c) are automatically ensured by using 'mktemp -d --tmpdir' to - # create a temporary directory - if [[ ! -z ${STORAGE} ]]; then - export TMPDIR=${STORAGE} - # mktemp fails if TMPDIR does not exist, so let's create it - mkdir -p ${TMPDIR} - fi - if [[ ! -z ${TMPDIR} ]]; then - # TODO check if TMPDIR already exists - # mktemp fails if TMPDIR does not exist, so let's create it - mkdir -p ${TMPDIR} - fi - if [[ -z ${TMPDIR} ]]; then - # mktemp falls back to using /tmp if TMPDIR is empty - # TODO check if /tmp is writable, large enough and usable (different - # features for ro-access and rw-access) - [[ ${VERBOSE} -eq 1 ]] && echo "skipping sanity checks for /tmp" - fi EESSI_HOST_STORAGE=$(mktemp -d --tmpdir eessi.XXXXXXXXXX) echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')." 
fi From ed650a04ca79a78b7b5d6b82600d2cc7e8c7e10b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 19:51:17 +0100 Subject: [PATCH 16/30] Debugging cleanup --- bot/build.sh | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/bot/build.sh b/bot/build.sh index 8b2dd85230..d2e211dd60 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -21,16 +21,12 @@ # stop as soon as something fails set -e -echo "DEBUG: TMPDIR=$TMPDIR" - # Make sure we are referring to software-layer as working directory software_layer_dir=$(dirname $(dirname $(realpath $0))) # source utils.sh and cfg_files.sh source $software_layer_dir/scripts/utils.sh source $software_layer_dir/scripts/cfg_files.sh -echo "DEBUG2: TMPDIR=$TMPDIR" - # defaults export JOB_CFG_FILE="${JOB_CFG_FILE_OVERRIDE:=cfg/job.cfg}" HOST_ARCH=$(uname -m) @@ -45,37 +41,20 @@ cat ${JOB_CFG_FILE} echo "bot/build.sh: obtaining configuration settings from '${JOB_CFG_FILE}'" cfg_load ${JOB_CFG_FILE} -echo "DEBUG3: TMPDIR=$TMPDIR" - # if http_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $http_proxy HTTP_PROXY=$(cfg_get_value "site_config" "http_proxy") HTTP_PROXY=${HTTP_PROXY:-${http_proxy}} echo "bot/build.sh: HTTP_PROXY='${HTTP_PROXY}'" -echo "DEBUG4: TMPDIR=$TMPDIR" # if https_proxy is defined in ${JOB_CFG_FILE} use it, if not use env var $https_proxy HTTPS_PROXY=$(cfg_get_value "site_config" "https_proxy") HTTPS_PROXY=${HTTPS_PROXY:-${https_proxy}} echo "bot/build.sh: HTTPS_PROXY='${HTTPS_PROXY}'" -echo "DEBUG5: TMPDIR=$TMPDIR" LOCAL_TMP=$(cfg_get_value "site_config" "local_tmp") -echo "DEBUG6: TMPDIR=$TMPDIR" echo "bot/build.sh: LOCAL_TMP='${LOCAL_TMP}'" # TODO should local_tmp be mandatory? --> then we check here and exit if it is not provided -# Bind mount the current $TMPDIR into the container. -# A call to e.g. 
`mktemp` inside the container would try to use this path - and will fail if we don't bind-mount -echo "bot/build.sh: TMPDIR=${TMDPIR}" -if [[ -z ${TMPDIR} ]]; then - if [[ -z ${SINGULARITY_BIND} ]]; then - export SINGULARITY_BIND="${TMPDIR}" - else - export SINGULARITY_BIND="${SINGULARITY_BIND},${TMPDIR}" - fi -fi - - # check if path to copy build logs to is specified, so we can copy build logs for failing builds there BUILD_LOGS_DIR=$(cfg_get_value "site_config" "build_logs_dir") echo "bot/build.sh: BUILD_LOGS_DIR='${BUILD_LOGS_DIR}'" @@ -103,8 +82,6 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then fi fi -echo "DEBUG: SINGULARITY_BIND AFTER APPENDING: ${SINGULARITY_BIND}" - SINGULARITY_CACHEDIR=$(cfg_get_value "site_config" "container_cachedir") echo "bot/build.sh: SINGULARITY_CACHEDIR='${SINGULARITY_CACHEDIR}'" if [[ ! -z ${SINGULARITY_CACHEDIR} ]]; then From be2fd57bc75251d9630e43b09afb9f11ae90cb33 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 21:05:13 +0100 Subject: [PATCH 17/30] Add something fast to build.. 
--- ...-2023a-CUDA.yml => eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak} | 0 .../software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml | 1 + 2 files changed, 1 insertion(+) rename easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/{eessi-2023.06-eb-4.9.4-2023a-CUDA.yml => eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak} (100%) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak similarity index 100% rename from easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml rename to easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml index 6398f014dc..a12e6a13b9 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml @@ -28,3 +28,4 @@ easyconfigs: options: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21285 from-commit: 41a2cd83f9fb017b76f0693f6a264d8acb548317 + - BCFtools-1.19-GCC-13.2.0.eb From 86b56081948911e18774af8c36ba03d71e024208 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 21:50:19 +0100 Subject: [PATCH 18/30] Make sure STORAGE gets bind-mounted --- bot/build.sh | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bot/build.sh b/bot/build.sh index d2e211dd60..aa7ccc91cf 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -90,15 +90,23 @@ if [[ ! 
-z ${SINGULARITY_CACHEDIR} ]]; then export SINGULARITY_CACHEDIR fi -echo -n "setting \$STORAGE by replacing any var in '${LOCAL_TMP}' -> " -# replace any env variable in ${LOCAL_TMP} with its -# current value (e.g., a value that is local to the job) -STORAGE=$(envsubst <<< ${LOCAL_TMP}) -echo "'${STORAGE}'" +if [[ -z "${TMPDIR}" ]]; then + echo -n "setting \$STORAGE by replacing any var in '${LOCAL_TMP}' -> " + # replace any env variable in ${LOCAL_TMP} with its + # current value (e.g., a value that is local to the job) + STORAGE=$(envsubst <<< ${LOCAL_TMP}) +else + STORAGE=${TMPDIR} +fi +echo "bot/build.sh: STORAGE='${STORAGE}'" # make sure ${STORAGE} exists mkdir -p ${STORAGE} +# Make sure ${STORAGE} gets bind-mounted +# This will make sure that any subsequent jobs that create dirs or files under STORAGE have access to it in the container +export SINGULARITY_BIND="${SINGULARITY_BIND},${STORAGE}" + # make sure the base tmp storage is unique JOB_STORAGE=$(mktemp --directory --tmpdir=${STORAGE} bot_job_tmp_XXX) echo "bot/build.sh: created unique base tmp storage directory at ${JOB_STORAGE}" From a6d963e76c583dac18ee3501773dca36468f969c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Feb 2025 22:53:22 +0100 Subject: [PATCH 19/30] Add BCFtools to the easyconfig with latest eb version --- .../software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml | 1 - .../software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml index a12e6a13b9..6398f014dc 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.2-2023b.yml @@ -28,4 +28,3 @@ easyconfigs: options: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21285 from-commit: 
41a2cd83f9fb017b76f0693f6a264d8acb548317 - - BCFtools-1.19-GCC-13.2.0.eb diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml index ee161b8b9c..5664a67e8b 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml @@ -25,3 +25,4 @@ easyconfigs: options: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21915 from-commit: 58f16c0caf8c5494c68e9eda8cbf19e9145d3cfa + - BCFtools-1.19-GCC-13.2.0.eb From e444af858d6c355f501cc84f138ae06c6197f913 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 02:44:14 +0100 Subject: [PATCH 20/30] Fix issue that pops up if the nvidia-smi command is present on non-GPU nodes. In that case, the command exists, but an nvidia-smi --version will fail. That case should be considered as a non-GPU node --- EESSI-install-software.sh | 11 +++++++++-- bot/build.sh | 13 +++++++++++-- bot/test.sh | 11 +++++++++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 3a9ba175c9..1ecc29a9cb 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -271,8 +271,15 @@ fi # Install NVIDIA drivers in host_injections (if they exist) if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + nvidia-smi --version + ec=$? + if [ ${ec} -ne 0 ]; then + echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." 
+ echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." + fi fi diff --git a/bot/build.sh b/bot/build.sh index aa7ccc91cf..1dd6494978 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -254,12 +254,21 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found, using available GPU" - BUILD_STEP_ARGS+=("--nvidia" "all") + nvidia-smi --version + ec=$? + if [ ${ec} -ne 0 ]; then + echo "Command 'nvidia-smi' found, using available GPU" + BUILD_STEP_ARGS+=("--nvidia" "all") + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." + BUILD_STEP_ARGS+=("--nvidia" "install") + fi else echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" BUILD_STEP_ARGS+=("--nvidia" "install") fi + # Retain location for host injections so we don't reinstall CUDA # (Always need to run the driver installation as available driver may change) if [[ ! -z ${SHARED_FS_PATH} ]]; then diff --git a/bot/test.sh b/bot/test.sh index fd6c4de040..95c2f2346e 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -215,8 +215,15 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then - echo "Command 'nvidia-smi' found, using available GPU" - TEST_STEP_ARGS+=("--nvidia" "run") + nvidia-smi --version + ec=$? + if [ ${ec} -ne 0 ]; then + echo "Command 'nvidia-smi' found, using available GPU" + BUILD_STEP_ARGS+=("--nvidia" "run") + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." 
+ fi fi # prepare arguments to test_suite.sh (specific to test step) From 8c3ad1d9b66479fed9435c8755b096f8f85ee0d2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 02:49:37 +0100 Subject: [PATCH 21/30] Temporarily disable set -e, because we know and accept that nvidia-smi --version might fail --- bot/build.sh | 3 +++ bot/test.sh | 3 +++ 2 files changed, 6 insertions(+) diff --git a/bot/build.sh b/bot/build.sh index 1dd6494978..336dcb3135 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -254,8 +254,11 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then + # Accept that this may fail + set +e nvidia-smi --version ec=$? + set -e if [ ${ec} -ne 0 ]; then echo "Command 'nvidia-smi' found, using available GPU" BUILD_STEP_ARGS+=("--nvidia" "all") diff --git a/bot/test.sh b/bot/test.sh index 95c2f2346e..ddb5a8f662 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -215,8 +215,11 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support if command_exists "nvidia-smi"; then + # Accept that this may fail + set +e nvidia-smi --version ec=$? + set -e if [ ${ec} -ne 0 ]; then echo "Command 'nvidia-smi' found, using available GPU" BUILD_STEP_ARGS+=("--nvidia" "run") From aaf01de83881986e4c90b125988571128667588a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 02:52:58 +0100 Subject: [PATCH 22/30] Fix the check in the if-statement --- EESSI-install-software.sh | 2 +- bot/build.sh | 2 +- bot/test.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 1ecc29a9cb..2cd8a054df 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -273,7 +273,7 @@ fi if command_exists "nvidia-smi"; then nvidia-smi --version ec=$? 
- if [ ${ec} -ne 0 ]; then + if [ ${ec} -eq 0 ]; then echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh else diff --git a/bot/build.sh b/bot/build.sh index 336dcb3135..6ce87eb0b0 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -259,7 +259,7 @@ if command_exists "nvidia-smi"; then nvidia-smi --version ec=$? set -e - if [ ${ec} -ne 0 ]; then + if [ ${ec} -eq 0 ]; then echo "Command 'nvidia-smi' found, using available GPU" BUILD_STEP_ARGS+=("--nvidia" "all") else diff --git a/bot/test.sh b/bot/test.sh index ddb5a8f662..36b76eca5f 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -220,7 +220,7 @@ if command_exists "nvidia-smi"; then nvidia-smi --version ec=$? set -e - if [ ${ec} -ne 0 ]; then + if [ ${ec} -eq 0 ]; then echo "Command 'nvidia-smi' found, using available GPU" BUILD_STEP_ARGS+=("--nvidia" "run") else From 7571be60e8cae30c4f465b256ecf10a78126386d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 02:58:06 +0100 Subject: [PATCH 23/30] Make warning more clear --- EESSI-install-software.sh | 3 ++- bot/build.sh | 3 ++- bot/test.sh | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 2cd8a054df..1e33f42e15 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -278,7 +278,8 @@ if command_exists "nvidia-smi"; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh else echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." 
fi fi diff --git a/bot/build.sh b/bot/build.sh index 6ce87eb0b0..49ad43120b 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -264,7 +264,8 @@ if command_exists "nvidia-smi"; then BUILD_STEP_ARGS+=("--nvidia" "all") else echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." BUILD_STEP_ARGS+=("--nvidia" "install") fi else diff --git a/bot/test.sh b/bot/test.sh index 36b76eca5f..7f1ab8b983 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -225,7 +225,8 @@ if command_exists "nvidia-smi"; then BUILD_STEP_ARGS+=("--nvidia" "run") else echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "IF the current node acutally does contain Nvidia GPUs, this should be considered an error." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." 
fi fi From 6d164b9b2475b885eb2aaf855d172e64bdcb29f0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 03:33:40 +0100 Subject: [PATCH 24/30] Debugging output --- run_tests.sh | 1 + test_suite.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/run_tests.sh b/run_tests.sh index f6264c3cc8..074fd65985 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -27,4 +27,5 @@ CHECKOUT_LATEST="git checkout \${VERSION}" ./run_in_compat_layer_env.sh "${TEST_CLONE} && ${LATEST_VERSION} && ${CHECKOUT_LATEST}" # Run the test suite +echo "REFRAME_SCALE_TAG in run_tests: $REFRAME_SCALE_TAG" ./test_suite.sh "$@" diff --git a/test_suite.sh b/test_suite.sh index 4121a37c2e..b51e1ced50 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -203,7 +203,8 @@ else fatal_error "Failed to extract names of tests to run: ${REFRAME_NAME_ARGS}" exit ${test_selection_exit_code} fi -# Allow people deploying the bot to overrwide this +# Allow people deploying the bot to override this +echo "REFRAME_SCALE_TAG: $REFRAME_SCALE_TAG" if [ -z "$REFRAME_SCALE_TAG" ]; then REFRAME_SCALE_TAG="--tag 1_node" fi From 682fed1401b899e05c8df0bb82f31506dea4eaaf Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 6 Feb 2025 15:22:38 +0100 Subject: [PATCH 25/30] Remove BCFtools as that was only a test build on CPU with short build time.
Add an actual CUDA package to test --- ...-2023a-CUDA.yml.bak => eessi-2023.06-eb-4.9.4-2023a-CUDA.yml} | 0 .../software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml | 1 - 2 files changed, 1 deletion(-) rename easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/{eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak => eessi-2023.06-eb-4.9.4-2023a-CUDA.yml} (100%) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml similarity index 100% rename from easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml.bak rename to easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml diff --git a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml index 5664a67e8b..ee161b8b9c 100644 --- a/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml +++ b/easystacks/software.eessi.io/2023.06/eessi-2023.06-eb-4.9.4-2023b.yml @@ -25,4 +25,3 @@ easyconfigs: options: # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21915 from-commit: 58f16c0caf8c5494c68e9eda8cbf19e9145d3cfa - - BCFtools-1.19-GCC-13.2.0.eb From 47194682a822e73fe47d3ce13e6ee457f7e93699 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 10 Feb 2025 18:55:25 +0100 Subject: [PATCH 26/30] Remove debugging prints --- run_tests.sh | 1 - test_suite.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/run_tests.sh b/run_tests.sh index 074fd65985..f6264c3cc8 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -27,5 +27,4 @@ CHECKOUT_LATEST="git checkout \${VERSION}" ./run_in_compat_layer_env.sh "${TEST_CLONE} && ${LATEST_VERSION} && ${CHECKOUT_LATEST}" # Run the test suite -echo "REFRAME_SCALE_TAG in run_tests: $REFRAME_SCALE_TAG" ./test_suite.sh "$@" diff --git 
a/test_suite.sh b/test_suite.sh index b51e1ced50..f5f3255841 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -204,7 +204,6 @@ else exit ${test_selection_exit_code} fi # Allow people deploying the bot to override this -echo "REFRAME_SCALE_TAG: $REFRAME_SCALE_TAG" if [ -z "$REFRAME_SCALE_TAG" ]; then REFRAME_SCALE_TAG="--tag 1_node" fi From 6fcaf89675615067d2615c226a171256c55f5547 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 12 Feb 2025 18:48:20 +0100 Subject: [PATCH 27/30] Build CUDA 12.1.1 and CUDA 12.4.0 --- .../nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 1 - .../nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023b-CUDA.yml | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023b-CUDA.yml diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml index fbb9203fd2..60d82d46ad 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -2,4 +2,3 @@ easyconfigs: - CUDA-12.1.1.eb: options: accept-eula-for: CUDA - - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023b-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023b-CUDA.yml new file mode 100644 index 0000000000..8a55c5aa7e --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023b-CUDA.yml @@ -0,0 +1,4 @@ +easyconfigs: +- CUDA-12.4.0.eb: + options: + accept-eula-for: CUDA From f241881f31a65aeb856a4f00876d365c35d3939c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 13 Feb 2025 15:46:00 
+0100 Subject: [PATCH 28/30] Add pmt --- .../accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml index 60d82d46ad..fbb9203fd2 100644 --- a/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml +++ b/easystacks/software.eessi.io/2023.06/accel/nvidia/zen4_h100/eessi-2023.06-eb-4.9.4-2023a-CUDA.yml @@ -2,3 +2,4 @@ easyconfigs: - CUDA-12.1.1.eb: options: accept-eula-for: CUDA + - pmt-1.2.0-GCCcore-12.3.0-CUDA-12.1.1.eb From 383ed27504a727b78834cf5671afacfaf8725078 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 2 Apr 2025 17:53:53 +0200 Subject: [PATCH 29/30] Add support for compression with zstd, which is typically much faster than gzip. We do this only if zstd is present on the system --- eessi_container.sh | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/eessi_container.sh b/eessi_container.sh index 87b859dc81..643b722716 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -402,9 +402,15 @@ else echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')." fi -# if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE} +# if ${RESUME} is a file, unpack it into ${EESSI_HOST_STORAGE} if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then - tar xf ${RESUME} -C ${EESSI_HOST_STORAGE} + if [[ "${RESUME}" == *.tgz ]]; + tar xf ${RESUME} -C ${EESSI_HOST_STORAGE} + elif [[ "${RESUME}" == *.zst && -x "$(command -v zstd)" ]]; # Add support for resuming from zstd-compressed tarballs + zstd -dc ${RESUME} | tar -xf - -C ${EESSI_HOST_STORAGE} + elif [[ "${RESUME}" == *.zst && ! 
-x "$(command -v zstd)" ]]; + fatal_error "Trying to resume from tarball ${RESUME} which was compressed using zstd, but zstd command not found" + fi echo "Resuming from previous run using temporary storage ${RESUME} unpacked into ${EESSI_HOST_STORAGE}" fi @@ -854,17 +860,31 @@ if [[ ! -z ${SAVE} ]]; then # ARCH which might have been used internally, eg, when software packages # were built ... we rather keep the script here "stupid" and leave the handling # of these aspects to where the script is used + + # Compression with zlib may be quite slow. On some systems, the pipeline takes ~20 mins for a 2 min build because of this. + # Check if zstd is present for faster compression and decompression if [[ -d ${SAVE} ]]; then # assume SAVE is name of a directory to which tarball shall be written to # name format: tmp_storage-{TIMESTAMP}.tgz ts=$(date +%s) - TGZ=${SAVE}/tmp_storage-${ts}.tgz + if [[ -x "$(command -v zstd)" ]]; + TARBALL=${SAVE}/tmp_storage-${ts}.zst + tar -cf - -C ${EESSI_TMPDIR} . | zstd > ${TARBALL} + else + TARBALL=${SAVE}/tmp_storage-${ts}.tgz + tar czf ${TARBALL} -C ${EESSI_TMPDIR} . + fi else # assume SAVE is the full path to a tarball's name - TGZ=${SAVE} + TARBALL=${SAVE} + # if zstd is present and a .zst extension is asked for, use it + if [[ "${SAVE}" == *.zst && -x "$(command -v zstd)" ]]; then + tar -cf - -C ${EESSI_TMPDIR} . | zstd > ${TARBALL} + else + tar czf ${TARBALL} -C ${EESSI_TMPDIR} + fi fi - tar czf ${TGZ} -C ${EESSI_TMPDIR} . - echo "Saved contents of tmp directory '${EESSI_TMPDIR}' to tarball '${TGZ}' (to resume session add '--resume ${TGZ}')" + echo "Saved contents of tmp directory '${EESSI_TMPDIR}' to tarball '${TARBALL}' (to resume session add '--resume ${TARBALL}')" fi # TODO clean up tmp by default? 
only retain if another option provided (--retain-tmp) From 95d7d5687abd4b12ed2638d0bee3666377d53825 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 2 Apr 2025 18:53:13 +0200 Subject: [PATCH 30/30] Add -T0, correct if-elif-then statements, add missing '.' to tar fallback --- eessi_container.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/eessi_container.sh b/eessi_container.sh index 643b722716..c8920902e4 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -404,11 +404,12 @@ fi # if ${RESUME} is a file, unpack it into ${EESSI_HOST_STORAGE} if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then - if [[ "${RESUME}" == *.tgz ]]; + if [[ "${RESUME}" == *.tgz ]]; then tar xf ${RESUME} -C ${EESSI_HOST_STORAGE} - elif [[ "${RESUME}" == *.zst && -x "$(command -v zstd)" ]]; # Add support for resuming from zstd-compressed tarballs + # Add support for resuming from zstd-compressed tarballs + elif [[ "${RESUME}" == *.zst && -x "$(command -v zstd)" ]]; then zstd -dc ${RESUME} | tar -xf - -C ${EESSI_HOST_STORAGE} - elif [[ "${RESUME}" == *.zst && ! -x "$(command -v zstd)" ]]; + elif [[ "${RESUME}" == *.zst && ! -x "$(command -v zstd)" ]]; then fatal_error "Trying to resume from tarball ${RESUME} which was compressed using zstd, but zstd command not found" fi echo "Resuming from previous run using temporary storage ${RESUME} unpacked into ${EESSI_HOST_STORAGE}" @@ -867,9 +868,9 @@ if [[ ! -z ${SAVE} ]]; then # assume SAVE is name of a directory to which tarball shall be written to - # name format: tmp_storage-{TIMESTAMP}.tgz + # name format: tmp_storage-{TIMESTAMP}.zst if zstd is available, tmp_storage-{TIMESTAMP}.tgz otherwise ts=$(date +%s) - if [[ -x "$(command -v zstd)" ]]; + if [[ -x "$(command -v zstd)" ]]; then TARBALL=${SAVE}/tmp_storage-${ts}.zst - tar -cf - -C ${EESSI_TMPDIR} . | zstd > ${TARBALL} + tar -cf - -C ${EESSI_TMPDIR} . | zstd -T0 > ${TARBALL} else TARBALL=${SAVE}/tmp_storage-${ts}.tgz tar czf ${TARBALL} -C ${EESSI_TMPDIR} . @@ -879,7 +880,7 @@ if [[ !
-z ${SAVE} ]]; then TARBALL=${SAVE} # if zstd is present and a .zst extension is asked for, use it if [[ "${SAVE}" == *.zst && -x "$(command -v zstd)" ]]; then - tar -cf - -C ${EESSI_TMPDIR} . | zstd > ${TARBALL} + tar -cf - -C ${EESSI_TMPDIR} . | zstd -T0 > ${TARBALL} else - tar czf ${TARBALL} -C ${EESSI_TMPDIR} + tar czf ${TARBALL} -C ${EESSI_TMPDIR} . fi