Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,10 @@ fi
TMPDIR=$(mktemp -d)

echo ">> Setting up environment..."
module --force purge
# For this call to be successful, it needs to be able to import archspec (which is part of EESSI).
# Thus, we execute it in a subshell where EESSI is already initialized (a bit like a bootstrap).
# The value is computed exactly once: an additional call outside such a subshell would not be able
# to import archspec and its result would be overwritten anyway.
# The assignment is kept separate from 'export' so that 'export' does not mask the exit status of
# the command substitution.
# NOTE: DETECTION_PARAMETERS is deliberately left unquoted so it is still word-split into separate
# arguments, exactly as before.
EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(source "$TOPDIR/init/bash" > /dev/null 2>&1; python3 "$TOPDIR/eessi_software_subdir.py" $DETECTION_PARAMETERS)
export EESSI_SOFTWARE_SUBDIR_OVERRIDE
echo "EESSI_SOFTWARE_SUBDIR_OVERRIDE: $EESSI_SOFTWARE_SUBDIR_OVERRIDE"

# Initialize EESSI in the current shell, after the software subdir override has been exported
source "$TOPDIR/init/bash"

Expand Down Expand Up @@ -136,40 +138,66 @@ echo "Configured reframe with the following environment variables:"
env | grep "RFM_"

# Inject correct CPU/memory properties into the ReFrame config file
echo "Collecting system-specific input for the ReFrame configuration file"
# Run lscpu once; all four values below are extracted from this single snapshot.
cpuinfo=$(lscpu)

# CPU count: the first integer that follows "CPU(s):"
cpu_count_regex='CPU\(s\):[^0-9]*([0-9]+)'
if ! [[ "${cpuinfo}" =~ ${cpu_count_regex} ]]; then
    fatal_error "Failed to get the number of CPUs for the current test hardware with lscpu."
else
    cpu_count="${BASH_REMATCH[1]}"
    echo "Detected CPU count: ${cpu_count}"
fi

# Socket count: the first integer that follows "Socket(s):"
socket_count_regex='Socket\(s\):[^0-9]*([0-9]+)'
if ! [[ "${cpuinfo}" =~ ${socket_count_regex} ]]; then
    fatal_error "Failed to get the number of sockets for the current test hardware with lscpu."
else
    socket_count="${BASH_REMATCH[1]}"
    echo "Detected socket count: ${socket_count}"
fi

# Threads per core: this pattern has an outer capture group, so the number is capture group 2
threads_per_core_regex='(Thread\(s\) per core:[^0-9]*([0-9]+))'
if ! [[ "${cpuinfo}" =~ ${threads_per_core_regex} ]]; then
    fatal_error "Failed to get the number of threads per core for the current test hardware with lscpu."
else
    threads_per_core="${BASH_REMATCH[2]}"
    echo "Detected threads per core: ${threads_per_core}"
fi

# Cores per socket: same layout as above, the number is capture group 2
cores_per_socket_regex='(Core\(s\) per socket:[^0-9]*([0-9]+))'
if ! [[ "${cpuinfo}" =~ ${cores_per_socket_regex} ]]; then
    fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu."
else
    cores_per_socket="${BASH_REMATCH[2]}"
    echo "Detected cores per socket: ${cores_per_socket}"
fi
# Determine the memory limit that applies to the current cgroup.
# The cgroup path is read from /proc/self/cpuset. If the cgroups v1 file (memory.limit_in_bytes)
# exists it is used; otherwise the cgroups v2 file (memory.max) is read.
cgroup_v1_mem_limit="/sys/fs/cgroup/memory/$(</proc/self/cpuset)/memory.limit_in_bytes"
cgroup_v2_mem_limit="/sys/fs/cgroup/$(</proc/self/cpuset)/memory.max"
if [ -f "$cgroup_v1_mem_limit" ]; then
    cgroup_mem_bytes=$(cat "$cgroup_v1_mem_limit")
else
    cgroup_mem_bytes=$(cat "$cgroup_v2_mem_limit")
    if [ "$cgroup_mem_bytes" = 'max' ]; then
        # In cgroups v2, the memory.max file may contain 'max', meaning the group can use the full system memory.
        # Here, we get the system memory from /proc/meminfo. Units are supposedly always in kB, but let's match them too.
        if cgroup_mem_kilobytes=$(grep -oP 'MemTotal:\s+\K\d+(?=\s+kB)' /proc/meminfo) && [[ -n "$cgroup_mem_kilobytes" ]]; then
            cgroup_mem_bytes=$((cgroup_mem_kilobytes*1024))
        else
            fatal_error "Failed to get memory limit from /proc/meminfo"
        fi
    fi
fi
# Validate the value itself instead of inspecting $? here: after the if/else construct above, $? only
# reflects the last command that happened to run inside it, so a failed read of memory.max could go
# unnoticed and an empty value would reach the arithmetic below.
if [[ "$cgroup_mem_bytes" =~ ^[0-9]+$ ]]; then
    # Convert to MiB
    cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024)))
else
    fatal_error "Failed to get the memory limit in bytes from the current cgroup"
fi
echo "Detected available memory: ${cgroup_mem_mib} MiB"

echo "Replacing detected system information in template ReFrame config file..."
# Paths are quoted so that a template/config location containing whitespace or glob characters
# is passed as a single argument (consistent with the quoted 'cat' below).
cp "${RFM_CONFIG_FILE_TEMPLATE}" "${RFM_CONFIG_FILES}"
# Fill in all placeholders with a single in-place sed run. The expressions are applied in the
# listed order to every line, which gives the same result as running them one after another.
sed -i \
    -e "s/__NUM_CPUS__/${cpu_count}/g" \
    -e "s/__NUM_SOCKETS__/${socket_count}/g" \
    -e "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" \
    -e "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" \
    -e "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" \
    "${RFM_CONFIG_FILES}"
# Make debugging easier by printing the final config file:
echo "Final config file (after replacements):"
cat "${RFM_CONFIG_FILES}"

# Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966
# Restricting PSM3 to these devices is enough, since we only run single node for now
PSM3_DEVICES='self,shm'
export PSM3_DEVICES
Expand Down