diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 2830754b29..d3cbfe27ed 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -35,7 +35,7 @@ while [[ $# -gt 0 ]]; do export https_proxy="$2" shift 2 ;; - -*|--*) + -*) echo "Error: Unknown option: $1" >&2 exit 1 ;; @@ -48,12 +48,12 @@ done set -- "${POSITIONAL_ARGS[@]}" -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") -source $TOPDIR/scripts/utils.sh +source "$TOPDIR"/scripts/utils.sh # honor $TMPDIR if it is already defined, use /tmp otherwise -if [ -z $TMPDIR ]; then +if [ -z "$TMPDIR" ]; then export WORKDIR=/tmp/$USER else export WORKDIR=$TMPDIR/$USER @@ -63,20 +63,16 @@ TMPDIR=$(mktemp -d) echo ">> Setting up environment..." -source $TOPDIR/init/minimal_eessi_env +source "$TOPDIR"/init/minimal_eessi_env -if [ -d $EESSI_CVMFS_REPO ]; then +if [ -d "$EESSI_CVMFS_REPO" ]; then echo_green "$EESSI_CVMFS_REPO available, OK!" else fatal_error "$EESSI_CVMFS_REPO is not available!" fi # make sure we're in Prefix environment by checking $SHELL -if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then - echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" -else - fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" -fi +check_in_prefix_shell # avoid that pyc files for EasyBuild are stored in EasyBuild installation directory export PYTHONPYCACHEPREFIX=$TMPDIR/pycache @@ -92,8 +88,10 @@ if [[ "$EASYBUILD_OPTARCH" == "GENERIC" ]]; then fi echo ">> Determining software subdirectory to use for current build host..." 
-if [ -z $EESSI_SOFTWARE_SUBDIR_OVERRIDE ]; then - export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) +if [ -z "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]; then + # shellcheck disable=SC2086 + EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "$TOPDIR"/eessi_software_subdir.py $DETECTION_PARAMETERS) + export EESSI_SOFTWARE_SUBDIR_OVERRIDE echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script" else echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" @@ -102,7 +100,7 @@ fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables +EESSI_SILENT=1 EESSI_BASIC_ENV=1 source "$TOPDIR"/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!" @@ -113,9 +111,10 @@ else fi echo ">> Initializing Lmod..." -source $EPREFIX/usr/share/Lmod/init/bash +source "$EPREFIX"/usr/share/Lmod/init/bash ml_version_out=$TMPDIR/ml.out -ml --version &> $ml_version_out +ml --version &> "$ml_version_out" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> Found Lmod ${LMOD_VERSION}" else @@ -123,14 +122,14 @@ else fi echo ">> Configuring EasyBuild..." -source $TOPDIR/configure_easybuild +source "$TOPDIR"/configure_easybuild echo ">> Setting up \$MODULEPATH..." # make sure no modules are loaded module --force purge # ignore current $MODULEPATH entirely -module unuse $MODULEPATH -module use $EASYBUILD_INSTALLPATH/modules/all +module unuse "$MODULEPATH" +module use "$EASYBUILD_INSTALLPATH"/modules/all if [[ -z ${MODULEPATH} ]]; then fatal_error "Failed to set up \$MODULEPATH?!" 
else @@ -141,7 +140,8 @@ REQ_EB_VERSION='4.5.0' echo ">> Checking for EasyBuild module..." ml_av_easybuild_out=$TMPDIR/ml_av_easybuild.out -module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} +module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> "${ml_av_easybuild_out}" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> EasyBuild module found!" else @@ -150,7 +150,7 @@ else EB_TMPDIR=${TMPDIR}/ebtmp echo ">> Temporary installation (in ${EB_TMPDIR})..." pip_install_out=${TMPDIR}/pip_install.out - pip3 install --prefix $EB_TMPDIR easybuild &> ${pip_install_out} + pip3 install --prefix "$EB_TMPDIR" easybuild &> "${pip_install_out}" # keep track of original $PATH and $PYTHONPATH values, so we can restore them ORIG_PATH=$PATH @@ -158,11 +158,12 @@ else echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..." export PATH=${EB_TMPDIR}/bin:$PATH - export PYTHONPATH=$(ls -d ${EB_TMPDIR}/lib/python*/site-packages):$PYTHONPATH + PYTHONPATH=$(ls -d "${EB_TMPDIR}"/lib/python*/site-packages):$PYTHONPATH + export PYTHONPATH eb_install_out=${TMPDIR}/eb_install.out ok_msg="Latest EasyBuild release installed, let's go!" fail_msg="Installing latest EasyBuild release failed, that's not good... (output: ${eb_install_out})" - eb --install-latest-eb-release &> ${eb_install_out} + eb --install-latest-eb-release &> "${eb_install_out}" check_exit_code $? "${ok_msg}" "${fail_msg}" # restore origin $PATH and $PYTHONPATH values @@ -173,11 +174,11 @@ else if [[ $? -eq 0 ]]; then ok_msg="EasyBuild v${REQ_EB_VERSION} installed, alright!" fail_msg="Installing EasyBuild v${REQ_EB_VERSION}, yikes! (output: ${eb_install_out})" - eb EasyBuild-${REQ_EB_VERSION}.eb >> ${eb_install_out} 2>&1 + eb EasyBuild-${REQ_EB_VERSION}.eb >> "${eb_install_out}" 2>&1 check_exit_code $? "${ok_msg}" "${fail_msg}" fi - module avail easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} + module avail easybuild/${REQ_EB_VERSION} &> "${ml_av_easybuild_out}" if [[ $? 
-eq 0 ]]; then echo_green ">> EasyBuild module installed!" else @@ -188,7 +189,8 @@ fi echo ">> Loading EasyBuild module..." module load EasyBuild/$REQ_EB_VERSION eb_show_system_info_out=${TMPDIR}/eb_show_system_info.out -$EB --show-system-info > ${eb_show_system_info_out} +$EB --show-system-info > "${eb_show_system_info_out}" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> EasyBuild seems to be working!" $EB --version | grep "${REQ_EB_VERSION}" @@ -200,7 +202,7 @@ if [[ $? -eq 0 ]]; then fi $EB --show-config else - cat ${eb_show_system_info_out} + cat "${eb_show_system_info_out}" fatal_error "EasyBuild not working?!" fi @@ -241,6 +243,7 @@ if [[ $GENERIC -eq 1 ]]; then else openblas_include_easyblocks_from_pr='' fi +# shellcheck disable=SC2086 $EB $openblas_include_easyblocks_from_pr OpenBLAS-0.3.9-GCC-9.3.0.eb --robot check_exit_code $? "${ok_msg}" "${fail_msg}" @@ -414,6 +417,7 @@ $EB CMake-3.20.1-GCCcore-10.3.0.eb --robot --include-easyblocks-from-pr 2248 $EB --from-pr 14584 Rust-1.52.1-GCCcore-10.3.0.eb --robot # use OpenBLAS easyconfig from https://github.com/easybuilders/easybuild-easyconfigs/pull/15885 # which includes a patch to fix installation on POWER +# shellcheck disable=SC2086 $EB $openblas_include_easyblocks_from_pr --from-pr 15885 OpenBLAS-0.3.15-GCC-10.3.0.eb --robot # ignore failing FlexiBLAS tests when building on POWER; # some tests are failing due to a segmentation fault due to "invalid memory reference", @@ -429,18 +433,64 @@ fi $EB SciPy-bundle-2021.05-foss-2021a.eb --robot check_exit_code $? "${ok_msg}" "${fail_msg}" +# CUDA support + +cuda_version="11.3.1" + +# Need recent version of EasyBuild +echo ">> Installing EasyBuild 4.7.0..." +ok_msg="EasyBuild v4.7.0 installed" +fail_msg="EasyBuild v4.7.0 failed to install" +$EB --from-pr 17065 --include-easyblocks-from-pr 2893 --try-amend=use_pip=1 +check_exit_code $? "${ok_msg}" "${fail_msg}" + +LMOD_IGNORE_CACHE=1 module swap EasyBuild/4.7.0 +check_exit_code $? 
"Swapped to EasyBuild/4.7.0" "Couldn't swap to EasyBuild/4.7.0" + +# install p7zip (to be able to unpack RPMs) +p7zip_ec="p7zip-17.04-GCCcore-10.3.0.eb" +echo ">> Installing $p7zip_ec..." +ok_msg="$p7zip_ec installed, off to a good (?) start!" +fail_msg="Failed to install $p7zip_ec, woopsie..." +$EB "$p7zip_ec" --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA (uses eb_hooks.py to only install runtime) +cuda_ec="CUDA-${cuda_version}.eb" +echo ">> Installing $cuda_ec..." +ok_msg="$cuda_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_ec, woopsie..." +$EB "$cuda_ec" --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# Add the host_injections CUDA so we can actually build CUDA apps +# (which unbreaks the symlinks from the runtime installation); the messages are set before running the installer so that $? below is the installer's exit status +echo ">> Re-installing CUDA $cuda_version under host_injections (to un-break symlinks in EESSI installation)..." +ok_msg="CUDA $cuda_version (re)installed under host_injections!" +fail_msg="Failed to install CUDA $cuda_version under host_injections, woopsie..." +"${TOPDIR}"/gpu_support/cuda_utils/install_cuda_host_injections.sh "${cuda_version}" +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA samples (requires EESSI support for CUDA) +cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" +echo ">> Installing $cuda_samples_ec..." +ok_msg="$cuda_samples_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_samples_ec, woopsie..." +$EB "$cuda_samples_ec" --robot --from-pr=16914 +check_exit_code $? "${ok_msg}" "${fail_msg}" + ### add packages here echo ">> Creating/updating Lmod cache..." export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua" -if [ ! -f $LMOD_RC ]; then - python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} +if [ ! -f "$LMOD_RC" ]; then + python3 "$TOPDIR"/create_lmodrc.py "${EASYBUILD_INSTALLPATH}" check_exit_code $? 
"$LMOD_RC created" "Failed to create $LMOD_RC" fi -$TOPDIR/update_lmod_cache.sh ${EPREFIX} ${EASYBUILD_INSTALLPATH} +"$TOPDIR"/update_lmod_cache.sh "${EPREFIX}" "${EASYBUILD_INSTALLPATH}" -$TOPDIR/check_missing_installations.sh +"$TOPDIR"/check_missing_installations.sh echo ">> Cleaning up ${TMPDIR}..." -rm -r ${TMPDIR} +rm -r "${TMPDIR}" diff --git a/eb_hooks.py b/eb_hooks.py index df7742f999..c7358d5f13 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -8,51 +8,40 @@ from easybuild.tools.systemtools import AARCH64, POWER, X86_64, get_cpu_architecture, get_cpu_features from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC -EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' - - -def get_eessi_envvar(eessi_envvar): - """Get an EESSI environment variable from the environment""" - - eessi_envvar_value = os.getenv(eessi_envvar) - if eessi_envvar_value is None: - raise EasyBuildError("$%s is not defined!", eessi_envvar) - - return eessi_envvar_value - - -def get_rpath_override_dirs(software_name): - # determine path to installations in software layer via $EESSI_SOFTWARE_PATH - eessi_software_path = get_eessi_envvar('EESSI_SOFTWARE_PATH') - eessi_pilot_version = get_eessi_envvar('EESSI_PILOT_VERSION') - - # construct the rpath override directory stub - rpath_injection_stub = os.path.join( - # Make sure we are looking inside the `host_injections` directory - eessi_software_path.replace(eessi_pilot_version, os.path.join('host_injections', eessi_pilot_version), 1), - # Add the subdirectory for the specific software - 'rpath_overrides', - software_name, - # We can't know the version, but this allows the use of a symlink - # to facilitate version upgrades without removing files - 'system', - ) - - # Allow for libraries in lib or lib64 - rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ('lib', 'lib64')] - - return rpath_injection_dirs +EESSI_RPATH_OVERRIDE_ATTR = "orig_rpath_override_dirs" + +CUDA_ENABLED_TOOLCHAINS = [ + "fosscuda", + "gcccuda", 
+ "gimpic", + "giolfc", + "gmklc", + "golfc", + "gomklc", + "gompic", + "goolfc", + "iccifortcuda", + "iimklc", + "iimpic", + "intelcuda", + "iomklc", + "iompic", + "nvompic", + "nvpsmpic", +] def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" # determine path to Prefix installation in compat layer via $EPREFIX - eprefix = get_eessi_envvar('EPREFIX') + eprefix = get_eessi_envvar("EPREFIX") if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + ec = inject_gpu_property(ec) + def pre_configure_hook(self, *args, **kwargs): """Main pre-configure hook: trigger custom functions based on software name.""" @@ -74,19 +63,21 @@ def pre_prepare_hook(self, *args, **kwargs): # update the relevant option (but keep the original value so we can reset it later) if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): - raise EasyBuildError("'self' already has attribute %s! Can't use pre_prepare hook.", - EESSI_RPATH_OVERRIDE_ATTR) + raise EasyBuildError( + "'self' already has attribute %s! 
Can't use pre_prepare hook.", EESSI_RPATH_OVERRIDE_ATTR + ) - setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option('rpath_override_dirs')) + setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option("rpath_override_dirs")) if getattr(self, EESSI_RPATH_OVERRIDE_ATTR): # self.EESSI_RPATH_OVERRIDE_ATTR is (already) a colon separated string, let's make it a list orig_rpath_override_dirs = [getattr(self, EESSI_RPATH_OVERRIDE_ATTR)] - rpath_override_dirs = ':'.join(orig_rpath_override_dirs + mpi_rpath_override_dirs) + rpath_override_dirs = ":".join(orig_rpath_override_dirs + mpi_rpath_override_dirs) else: - rpath_override_dirs = ':'.join(mpi_rpath_override_dirs) - update_build_option('rpath_override_dirs', rpath_override_dirs) - print_msg("Updated rpath_override_dirs (to allow overriding MPI family %s): %s", - mpi_family, rpath_override_dirs) + rpath_override_dirs = ":".join(mpi_rpath_override_dirs) + update_build_option("rpath_override_dirs", rpath_override_dirs) + print_msg( + "Updated rpath_override_dirs (to allow overriding MPI family %s): %s", mpi_family, rpath_override_dirs + ) def post_prepare_hook(self, *args, **kwargs): @@ -94,30 +85,78 @@ def post_prepare_hook(self, *args, **kwargs): if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): # Reset the value of 'rpath_override_dirs' now that we are finished with it - update_build_option('rpath_override_dirs', getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) + update_build_option("rpath_override_dirs", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) print_msg("Resetting rpath_override_dirs to original value: %s", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) delattr(self, EESSI_RPATH_OVERRIDE_ATTR) +def pre_configure_hook(self, *args, **kwargs): + """Main pre-configure hook: trigger custom functions based on software name.""" + if self.name in PRE_CONFIGURE_HOOKS: + PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) + + +def post_package_hook(self, *args, **kwargs): + """Main post-package hook: trigger custom functions based on software 
name.""" + if self.name in POST_PACKAGE_HOOKS: + POST_PACKAGE_HOOKS[self.name](self, *args, **kwargs) + + +# Functions used by hooks + + +def get_eessi_envvar(eessi_envvar): + """Get an EESSI environment variable from the environment""" + + eessi_envvar_value = os.getenv(eessi_envvar) + if eessi_envvar_value is None: + raise EasyBuildError("$%s is not defined!", eessi_envvar) + + return eessi_envvar_value + + +def get_rpath_override_dirs(software_name): + # determine path to installations in software layer via $EESSI_SOFTWARE_PATH + eessi_software_path = get_eessi_envvar("EESSI_SOFTWARE_PATH") + eessi_pilot_version = get_eessi_envvar("EESSI_PILOT_VERSION") + + # construct the rpath override directory stub + rpath_injection_stub = os.path.join( + # Make sure we are looking inside the `host_injections` directory + eessi_software_path.replace(eessi_pilot_version, os.path.join("host_injections", eessi_pilot_version), 1), + # Add the subdirectory for the specific software + "rpath_overrides", + software_name, + # We can't know the version, but this allows the use of a symlink + # to facilitate version upgrades without removing files + "system", + ) + + # Allow for libraries in lib or lib64 + rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ("lib", "lib64")] + + return rpath_injection_dirs + + def cgal_toolchainopts_precise(ec, eprefix): """Enable 'precise' rather than 'strict' toolchain option for CGAL on POWER.""" - if ec.name == 'CGAL': + if ec.name == "CGAL": if get_cpu_architecture() == POWER: # 'strict' implies '-mieee-fp', which is not supported on POWER # see https://github.com/easybuilders/easybuild-framework/issues/2077 - ec['toolchainopts']['strict'] = False - ec['toolchainopts']['precise'] = True - print_msg("Tweaked toochainopts for %s: %s", ec.name, ec['toolchainopts']) + ec["toolchainopts"]["strict"] = False + ec["toolchainopts"]["precise"] = True + print_msg("Tweaked toochainopts for %s: %s", ec.name, ec["toolchainopts"]) else: raise 
EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!") def fontconfig_add_fonts(ec, eprefix): """Inject --with-add-fonts configure option for fontconfig.""" - if ec.name == 'fontconfig': + if ec.name == "fontconfig": # make fontconfig aware of fonts included with compat layer - with_add_fonts = '--with-add-fonts=%s' % os.path.join(eprefix, 'usr', 'share', 'fonts') - ec.update('configopts', with_add_fonts) + with_add_fonts = "--with-add-fonts=%s" % os.path.join(eprefix, "usr", "share", "fonts") + ec.update("configopts", with_add_fonts) print_msg("Added '%s' configure option for %s", with_add_fonts, ec.name) else: raise EasyBuildError("fontconfig-specific hook triggered for non-fontconfig easyconfig?!") @@ -125,29 +164,23 @@ def fontconfig_add_fonts(ec, eprefix): def ucx_eprefix(ec, eprefix): """Make UCX aware of compatibility layer via additional configuration options.""" - if ec.name == 'UCX': - ec.update('configopts', '--with-sysroot=%s' % eprefix) - ec.update('configopts', '--with-rdmacm=%s' % os.path.join(eprefix, 'usr')) - print_msg("Using custom configure options for %s: %s", ec.name, ec['configopts']) + if ec.name == "UCX": + ec.update("configopts", "--with-sysroot=%s" % eprefix) + ec.update("configopts", "--with-rdmacm=%s" % os.path.join(eprefix, "usr")) + print_msg("Using custom configure options for %s: %s", ec.name, ec["configopts"]) else: raise EasyBuildError("UCX-specific hook triggered for non-UCX easyconfig?!") -def pre_configure_hook(self, *args, **kwargs): - """Main pre-configure hook: trigger custom functions based on software name.""" - if self.name in PRE_CONFIGURE_HOOKS: - PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) - - def libfabric_disable_psm3_x86_64_generic(self, *args, **kwargs): """Add --disable-psm3 to libfabric configure options when building with --optarch=GENERIC on x86_64.""" - if self.name == 'libfabric': + if self.name == "libfabric": if get_cpu_architecture() == X86_64: - generic = 
build_option('optarch') == OPTARCH_GENERIC - no_avx = 'avx' not in get_cpu_features() + generic = build_option("optarch") == OPTARCH_GENERIC + no_avx = "avx" not in get_cpu_features() if generic or no_avx: - self.cfg.update('configopts', '--disable-psm3') - print_msg("Using custom configure options for %s: %s", self.name, self.cfg['configopts']) + self.cfg.update("configopts", "--disable-psm3") + print_msg("Using custom configure options for %s: %s", self.name, self.cfg["configopts"]) else: raise EasyBuildError("libfabric-specific hook triggered for non-libfabric easyconfig?!") @@ -158,10 +191,10 @@ def metabat_preconfigure(self, *args, **kwargs): - take into account that zlib is a filtered dependency, and that there's no libz.a in the EESSI compat layer """ - if self.name == 'MetaBAT': - configopts = self.cfg['configopts'] + if self.name == "MetaBAT": + configopts = self.cfg["configopts"] regex = re.compile(r"\$EBROOTZLIB/lib/libz.a") - self.cfg['configopts'] = regex.sub('$EPREFIX/usr/lib64/libz.so', configopts) + self.cfg["configopts"] = regex.sub("$EPREFIX/usr/lib64/libz.so", configopts) else: raise EasyBuildError("MetaBAT-specific hook triggered for non-MetaBAT easyconfig?!") @@ -171,24 +204,103 @@ def wrf_preconfigure(self, *args, **kwargs): Pre-configure hook for WRF: - patch arch/configure_new.defaults so building WRF with foss toolchain works on aarch64 """ - if self.name == 'WRF': + if self.name == "WRF": if get_cpu_architecture() == AARCH64: pattern = "Linux x86_64 ppc64le, gfortran" repl = "Linux x86_64 aarch64 ppc64le, gfortran" - self.cfg.update('preconfigopts', "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) - print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg['preconfigopts']) + self.cfg.update("preconfigopts", "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) + print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg["preconfigopts"]) else: raise 
EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") +def cuda_postpackage(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, "EULA.txt") + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, they're not really needed and they only complicate things + whitelist = ['EULA'] + file_extensions = [".so", ".a", ".h", ".bc"] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split(".")[0]) + whitelist = list(set(whitelist)) + # Do some quick checks for things we should or shouldn't have in the list + if "nvcc" in whitelist: + raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) + if "libcudart" not in whitelist: + raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. 
not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split(".")[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace("versions", "host_injections") + os.remove(source) + # Using os.symlink requires the existence of the target directory, so we use os.system + system_command="ln -s %s %s" % (target, source) + if os.system(system_command) != 0: + raise EasyBuildError("Failed to create symbolic link: %s" % system_command) + + +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ( + "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])] + or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS + ): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") + key = "modluafooter" + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict[ + "builddependencies" + ] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + return ec + PARSE_HOOKS = { - 'CGAL': cgal_toolchainopts_precise, - 'fontconfig': fontconfig_add_fonts, - 'UCX': ucx_eprefix, + "CGAL": cgal_toolchainopts_precise, + "fontconfig": fontconfig_add_fonts, + "UCX": ucx_eprefix, } PRE_CONFIGURE_HOOKS = { - 'libfabric': libfabric_disable_psm3_x86_64_generic, - 'MetaBAT': metabat_preconfigure, - 'WRF': wrf_preconfigure, + 
"libfabric": libfabric_disable_psm3_x86_64_generic, + "MetaBAT": metabat_preconfigure, + "WRF": wrf_preconfigure, +} + +POST_PACKAGE_HOOKS = { + "CUDA": cuda_postpackage, } diff --git a/eessi-2021.12.yml b/eessi-2021.12.yml index 210bbb2845..dc80a010ea 100644 --- a/eessi-2021.12.yml +++ b/eessi-2021.12.yml @@ -1,4 +1,14 @@ software: + CUDA: + toolchains: + SYSTEM: + versions: '11.3.1' + CUDA-Samples: + toolchains: + GCC-10.3.0: + versions: + '11.3': + versionsuffix: -CUDA-11.3.1 code-server: toolchains: SYSTEM: @@ -30,7 +40,11 @@ software: gompi-2020a: versions: ['5.6.3'] gompi-2021a: - versions: ['5.7.1'] + versions: [ '5.7.1' ] + p7zip: + toolchains: + GCCcore-10.3.0: + versions: ['17.04'] QuantumESPRESSO: toolchains: foss-2020a: diff --git a/gpu_support/README.md b/gpu_support/README.md new file mode 100644 index 0000000000..2b075a29dd --- /dev/null +++ b/gpu_support/README.md @@ -0,0 +1,26 @@ +# How to add GPU support +The collection of scripts in this directory enables you to add GPU support to your setup. +Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free +to contribute that though!). +To enable the usage of the CUDA runtime in your setup, simply run the following script: +``` +./add_nvidia_gpu_support.sh +``` +This script will install the compatibility libraries (and only those by default!) you need to use the shipped runtime +environment of CUDA. + +If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script +execution as follows: +``` +export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh +``` +This will again install the needed compatibility libraries as well as the whole CUDA suite. + +## Prerequisites and tips +* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to + `/opt/eessi` but can be configured in your CernVMFS config file to point somewhere else). 
If you would like to make a + system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. +* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your + CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed + if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library + installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh new file mode 100755 index 0000000000..3f5fa13805 --- /dev/null +++ b/gpu_support/add_amd_gpu_support.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +cat << EOF +This is not implemented yet :( + +If you would like to contribute this support there are a few things you will +need to consider: +- We will need to change the Lmod property added to GPU software so we can + distinguish AMD and Nvidia GPUs +- Support should be implemented in user space, if this is not possible (e.g., + requires a driver update) you need to tell the user what to do +- Support needs to be _verified_ and a trigger put in place (like the existence + of a particular path) so we can tell Lmod to display the associated modules +EOF diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh new file mode 100755 index 0000000000..32fe500a5c --- /dev/null +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +# Drop into the prefix shell or pipe this script into a Prefix shell with +# $EPREFIX/startprefix <<< /path/to/this_script.sh + +TOPDIR=$(dirname "$(realpath "$0")") + +source "$TOPDIR"/../scripts/utils.sh + +install_cuda="${INSTALL_CUDA:=false}" +eessi_version="${EESSI_PILOT_VERSION:=latest}" +if [ ! 
"$eessi_version" = "latest" ]; then + eessi_version="versions/$eessi_version" +fi + +# Initialise EESSI environment +# shellcheck disable=SC1090 +EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash +# Expectation is we are in a Prefix shell (as we need certain commands), let's check +check_in_prefix_shell + +# If you want to install CUDA support on login nodes (typically without GPUs), +# set environment variable to true. This will skip all GPU-dependent checks +install_wo_gpu=false +[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true + +# verify existence of nvidia-smi or this is a waste of time +# Check if nvidia-smi exists and can be executed without error +if [[ "${install_wo_gpu}" != "true" ]]; then + if command -v nvidia-smi > /dev/null 2>&1; then + nvidia-smi > /dev/null 2>&1 + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + error="nvidia-smi was found but returned error code, exiting now...\n" + error="${error}If you do not have a GPU on this device but wish to force the installation,\n" + error="${error}please set the environment variable INSTALL_WO_GPU=true" + fatal_error "$error" + fi + echo "nvidia-smi found, continue setup." + else + error="nvidia-smi not found, exiting now...\n" + error="${error}If you do not have a GPU on this device but wish to force the installation,\n" + error="${error}please set the environment variable INSTALL_WO_GPU=true\n" + fatal_error "$error" + fi +else + echo_yellow "You requested to install CUDA without GPUs present." + echo_yellow "This means that all GPU-dependent tests/checks will be skipped!" 
+fi + +############################################################################################## +# Check that the CUDA driver version is adequate +# ( +# needs to be r450 or r470 which are LTS, other production branches are acceptable but not +# recommended, below r450 is not compatible [with an exception we will not explore,see +# https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers] +# ) +# nvidia-smi prints one line per GPU; only the last line is checked in case of multiple GPUs +if [[ "${install_wo_gpu}" != "true" ]]; then + driver_major_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) + driver_major_version="${driver_major_version%%.*}" + # Now check driver_version for compatibility + # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers + if (( driver_major_version < 450 )); then + fatal_error "Your NVIDIA driver version ($driver_major_version) is too old, please update first.." + fi +fi + +############################################################################################### +# Install CUDA +############################################################################################### + +# Now we have the EESSI context enabled let's grab the version(s) of CUDA we need to install +# (we assume here that CUDA versions are always simple version strings with semantic versions) +cuda_versions=($(ls "$EESSI_SOFTWARE_PATH"/software/CUDA/)) +latest_cuda_version="${cuda_versions[-1]}" # ls lists in ascending order, so the last entry is the newest (EESSI starts with CUDA 11, no need for <10 logic) +if [ "${install_cuda}" != false ]; then + for cuda_version in "${cuda_versions[@]}" + do + "$TOPDIR"/cuda_utils/install_cuda_host_injections.sh "${cuda_version}" + done +fi +############################################################################################### +# Prepare installation of CUDA compat libraries, i.e. 
install p7zip if it is missing +############################################################################################### +# Try installing different versions of CUDA compat libraries until the test works. +"$TOPDIR"/cuda_utils/install_cuda_compatlibs_loop.sh "${latest_cuda_version}" diff --git a/gpu_support/cuda_utils/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh new file mode 100755 index 0000000000..07194a6c74 --- /dev/null +++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +# Get arch type from EESSI environment +if [[ -z "${EESSI_CPU_FAMILY}" ]]; then + # set up basic environment variables, EasyBuild and Lmod + EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash +fi +eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" + +# build URL for CUDA libraries +# take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${eessi_cpu_family}/" +# get all versions in descending order +files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/"${eessi_cpu_family}"/bin/sort -r --version-sort ) +if [[ -z "${files// }" ]]; then + echo "Could not find any compat lib files under ${cuda_url}" >&2 # stdout carries only the URL list, so diagnostics go to stderr + exit 1 +fi +for file in $files; do echo "${cuda_url}$file"; done diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh new file mode 100755 index 0000000000..f3bad4b695 --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +# Initialise our bash functions +TOPDIR=$(dirname "$(realpath "$0")") +source "$TOPDIR"/../../scripts/utils.sh + +# Expect to be in a prefix shell so we know all our required commands exist +check_in_prefix_shell + +# Make sure the EESSI environment has been initialised 
#!/usr/bin/env bash
#
# gpu_support/cuda_utils/install_cuda_compatlibs.sh
#
# Install one release of the NVIDIA CUDA compatibility libraries below the EESSI
# host_injections area and make the 'latest' symlink point at it.
#
# Usage:     install_cuda_compatlibs.sh <libs_url> <required_cuda_version>
# Arguments: $1 - URL of the cuda-compat package to install (.rpm, .deb is supported as well)
#            $2 - full CUDA version that has to be supported, e.g. 11.3.1
# Globals:   EESSI_EPREFIX, EESSI_SOFTWARE_PATH, EPREFIX, LD_LIBRARY_PATH (all read)
# Exit code: 0 if the compat libraries that are already installed pass test_cuda.sh, or if
#            the requested package was installed; non-zero on any error

# Initialise our bash functions
TOPDIR=$(dirname "$(realpath "$0")")
source "$TOPDIR"/../../scripts/utils.sh

# Expect to be in a prefix shell so we know all our required commands exist
check_in_prefix_shell

# Make sure the EESSI environment has been initialised
check_eessi_initialised

if [[ $# -ne 2 ]]; then
    fatal_error "Usage: $0 <URL of CUDA compat libs package> <required CUDA version>"
fi
libs_url=$1
required_cuda_version=$2

# Without EESSI_EPREFIX the linker directory below would silently become '/lib'
if [[ -z "${EESSI_EPREFIX}" ]]; then
    fatal_error "This script cannot be used without having first defined EESSI_EPREFIX"
fi

current_dir=$PWD
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections}

# Check if our target CUDA is satisfied by what is installed already
# (driver CUDA is reported as major.minor, i.e., like a float)
driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | awk '/CUDA/ && NF>1 {print $NF}')
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH \
    nvidia-smi -q --display=COMPUTE | awk '/CUDA/ && NF>1 {print $NF}')
cuda_major_minor=${required_cuda_version%.*}

if [[ ${driver_cuda_version%.*} =~ ^[0-9]+$ ]]; then
    if float_greater_than "$driver_cuda_version" "$eessi_cuda_version" ; then
        echo_yellow "You need to update your CUDA compatibility libraries!"
    elif [[ ${eessi_cuda_version%.*} =~ ^[0-9]+$ ]]; then
        # Compat libraries that match the required major.minor exactly are sufficient as well,
        # so this is a ">=" and not a strict ">"
        if [[ "$eessi_cuda_version" == "$cuda_major_minor" ]] \
            || float_greater_than "$eessi_cuda_version" "$cuda_major_minor" ; then
            echo_yellow "Existing CUDA compatibility libraries in EESSI should be ok, testing..."
            if "$TOPDIR"/test_cuda.sh "${required_cuda_version}" ; then
                exit 0
            else
                echo_yellow "Seems not, continuing to install requested version..."
            fi
        fi
    else
        echo_yellow "Installing CUDA compatibility libraries"
    fi
fi

# Grab the requested compat library RPM or deb
# Download and unpack in temporary directory, easier cleanup after installation
tmpdir=$(mktemp -d) || fatal_error "Could not create a temporary directory"
# The download and the unpacked files are removed on every exit path, errors included
trap 'rm -rf -- "${tmpdir}"' EXIT
cd "$tmpdir" || fatal_error "Could not 'cd' to ${tmpdir}"
compat_file=${libs_url##*/}
if ! curl --fail --location -O "${libs_url}"; then
    fatal_error "Could not download ${libs_url}"
fi
echo "$compat_file"

# Unpack it
# rpm files are the default for all OSes
# Keep support for deb files in case it is needed in the future
file_extension=${compat_file##*.}
if [[ ${file_extension} == "rpm" ]]; then
    # Load p7zip to extract files from rpm file
    module load p7zip || fatal_error "Could not load the p7zip module"
    # Extract .cpio
    7z x "${compat_file}" || fatal_error "Could not extract ${compat_file}"
    # Extract lib* (only the extension is swapped, 'rpm' elsewhere in the name is left alone)
    7z x "${compat_file%.rpm}.cpio" || fatal_error "Could not extract ${compat_file%.rpm}.cpio"
elif [[ ${file_extension} == "deb" ]]; then
    ar x "${compat_file}" || fatal_error "Could not extract ${compat_file}"
    tar xf data.tar.* || fatal_error "Could not extract data.tar.* from ${compat_file}"
else
    echo "File extension of cuda compat lib not supported, exiting now..." >&2
    exit 1
fi

# The package is expected to unpack into exactly one usr/local/cuda-<version>/compat directory
extracted_dirs=("${tmpdir}"/usr/local/cuda-*)
if [[ ${#extracted_dirs[@]} -ne 1 || ! -d "${extracted_dirs[0]}/compat" ]]; then
    fatal_error "Expected exactly one usr/local/cuda-*/compat directory in ${compat_file}"
fi
extracted_dir=${extracted_dirs[0]}
cuda_dir=${extracted_dir##*/}

if [[ ${file_extension} == "rpm" ]]; then
    # Restore symlinks: the rpm branch expects each *.so / *.so.? entry to come out as a plain
    # file whose content is the link target, so turn every such file back into a symlink
    for lib in "${extracted_dir}"/compat/*.so "${extracted_dir}"/compat/*.so.?; do
        # skip unmatched globs and anything that already is a symlink
        [[ -f "${lib}" && ! -L "${lib}" ]] || continue
        link_target=$(cat "${lib}")
        ln -sf "${link_target}" "${lib}" || fatal_error "Could not restore symlink ${lib}"
    done
fi

# Create a general space for our NVIDIA compat drivers
if ! create_directory_structure "$host_injections_dir" ; then
    fatal_error "Cannot create/write to $host_injections_dir space, exiting now..."
fi
cd "$host_injections_dir" || fatal_error "Could not 'cd' to $host_injections_dir"
# install the compat libs
# TODO: This would prevent error messages if folder already exists, but
# could be problematic if only some files are missing in destination dir
rm -rf -- "${cuda_dir:?}"
mv -- "${extracted_dir}" . || fatal_error "Could not move ${cuda_dir} to ${host_injections_dir}"

# Add a symlink that points the latest version to the version we just installed
ln -sfn "${cuda_dir}" latest

if [ ! -e latest ] ; then
    fatal_error "Symlink to latest cuda compat lib version is broken, exiting now..."
fi

# Symlink in the path to the latest libraries
linker_lib="${host_injection_linker_dir}/lib"
if [[ ! -e "${linker_lib}" && ! -L "${linker_lib}" ]]; then
    # Create the space to host the libraries for the linker
    if ! create_directory_structure "${host_injection_linker_dir}" ; then
        fatal_error "Cannot create/write to ${host_injection_linker_dir} space, exiting now..."
    fi
    if ! ln -s "${host_injections_dir}/latest/compat" "${linker_lib}"; then
        fatal_error "Could not create symlink ${linker_lib}"
    fi
elif [ ! "${linker_lib}" -ef "${host_injections_dir}/latest/compat" ]; then
    # also reached for a dangling symlink, which '-ef' cannot resolve
    error_msg="CUDA compat libs symlink exists but points to the wrong location, please fix this...\n"
    error_msg="${error_msg}${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat"
    fatal_error "$error_msg"
fi

# return to initial dir
cd "$current_dir" || fatal_error "Could not 'cd' to $current_dir"

echo
echo "CUDA driver compatibility libraries installed for CUDA version:"
echo "${cuda_dir/cuda-/}"
#!/usr/bin/env bash
#
# gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh
#
# Check if the CUDA compat libraries are installed and compatible with the target CUDA version,
# if not walk through the available releases (newest first) and install them one by one until
# test_cuda.sh succeeds.
#
# Usage:     install_cuda_compatlibs_loop.sh <cuda_version>
# Arguments: $1 - CUDA version the compat libraries have to work with, e.g. 11.3.1
# Globals:   INSTALL_WO_GPU (read) - "true" skips the GPU test and stops after one release
# Exit code: 0 once a release works (or was installed untested), 1 if none did

# Initialise our bash functions
TOPDIR=$(dirname "$(realpath "$0")")
source "$TOPDIR"/../../scripts/utils.sh

if [[ $# -eq 0 ]]; then
    fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1"
fi
install_cuda_version=$1

# Do a maximum of MAXLOOPS attempts
MAXLOOPS=12
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
version_regex='^[0-9]+(\.[0-9]+)*$'

# Print the driver version encoded in the file name of a compat package URL.
# Arguments: $1 - URL, e.g. .../cuda-compat-11-7-515.65.01-1.x86_64.rpm
function driver_version_from_url() {
    local version="${1%-*}"
    version="${version##*-}"
    # URLs differ for different OSes; check if we already have a number, if not remove string
    # part that is not needed
    if [[ ! ${version} =~ ^[0-9]+$ ]]; then
        version="${version##*_}"
    fi
    echo "${version}"
}

# Print the driver version of the compat libraries 'latest' points to, 0 if there are none.
function installed_driver_version() {
    local libcuda="${host_injections_dir}/latest/compat/libcuda.so"
    local version=0
    # libcuda.so points to actual cuda compat lib with driver version in its name
    # if this file exists, cuda compat libs are installed and we can compare the version
    if [ -e "${libcuda}" ]; then
        version=$(realpath "${libcuda}")
        version="${version##*so.}"
        if [[ ! ${version} =~ ${version_regex} ]]; then
            version=0
        fi
    fi
    echo "${version}"
}

# Succeed if version $1 is strictly newer than version $2. The versions are compared
# component by component; stripping the dots and comparing the resulting integers would rank
# 470.129.06 (47012906) above 515.65.01 (5156501).
function version_greater_than() {
    [[ "$1" != "$2" ]] || return 1
    [[ "$(printf '%s\n' "$1" "$2" | sort --version-sort | tail -n 1)" == "$1" ]]
}

# get URLs of the CUDA compat libs (newest first), exit if that fails
cuda_compat_url_list="$("$TOPDIR"/get_cuda_compatlibs.sh)"
ret=$?
if [ $ret -ne 0 ]; then
    fatal_error "Couldn't find current URLs of the CUDA compat libraries, instead got:\n $cuda_compat_url_list"
fi
mapfile -t cuda_compat_urls <<<"${cuda_compat_url_list}"

# loop over the compat library versions until we get one that works for us
attempts=0
for latest_cuda_compat_url in "${cuda_compat_urls[@]}"; do
    [[ -n "${latest_cuda_compat_url}" ]] || continue
    if (( attempts >= MAXLOOPS )); then
        break
    fi
    attempts=$(( attempts + 1 ))

    latest_driver_version=$(driver_version_from_url "${latest_cuda_compat_url}")
    if [[ ! ${latest_driver_version} =~ ${version_regex} ]]; then
        echo_yellow "Could not determine the driver version of ${latest_cuda_compat_url}, skipping it"
        continue
    fi

    # only keep the driver check for the latest version, every older release we fall back to
    # has to be installed unconditionally
    if (( attempts == 1 )); then
        eessi_driver_version=$(installed_driver_version)
    else
        eessi_driver_version=0
    fi

    if version_greater_than "${latest_driver_version}" "${eessi_driver_version}"; then
        if ! "$TOPDIR"/install_cuda_compatlibs.sh "${latest_cuda_compat_url}" "${install_cuda_version}"; then
            echo_yellow "Installing ${latest_cuda_compat_url} failed, I'll try an older release..."
            continue
        fi
    else
        echo "CUDA compat libs are up-to-date, skip installation."
    fi

    if [[ "${INSTALL_WO_GPU}" != "true" ]]; then
        if "$TOPDIR"/test_cuda.sh "${install_cuda_version}" ; then
            cuda_version_file="${host_injections_dir}/latest/version.txt"
            echo "${install_cuda_version}" > "${cuda_version_file}"
            exit 0
        else
            echo_yellow "Your driver does not seem to be recent enough to work with that release of CUDA compat libs,"
            echo_yellow "consider updating!"
            echo_yellow "I'll try an older release to see if that will work..."
        fi
    else
        echo_yellow "Requested to install CUDA without GPUs present, with no way to verify we skip final tests."
        echo_yellow "Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA"
        echo_yellow "drivers on your GPU node(s)."
        exit 0
    fi
done

echo "Tried to install ${attempts} different generations of compat libraries and none worked," >&2
echo "this usually means your driver is very out of date (or some other issue)!" >&2
exit 1
#!/usr/bin/env bash
#
# gpu_support/cuda_utils/install_cuda_host_injections.sh
#
# Install a full CUDA release with EasyBuild below the host_injections counterpart of
# $EESSI_SOFTWARE_PATH, unless that release is installed there already.
#
# Usage:     install_cuda_host_injections.sh <cuda_version>
# Arguments: $1 - CUDA version to install, e.g. 11.3.1
# Globals:   EESSI_SOFTWARE_PATH (read, required)
#            CUDA_TEMP_DIR (read, optional) - parent directory for temporary storage
#            EASYBUILD_BUILDPATH, EASYBUILD_SOURCEPATH (read; exported if they were unset)
#            EBROOTEASYBUILD (read) - if empty the EasyBuild module is loaded
# Exit code: 0 if CUDA is (now) installed, non-zero otherwise

# Initialise our bash functions (relative to this script, not to the caller's working dir)
TOPDIR=$(dirname "$(realpath "$0")")
source "$TOPDIR"/../../scripts/utils.sh

if [[ $# -eq 0 ]] ; then
    fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1"
fi
install_cuda_version=$1
if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then
    fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH"
fi
# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup)
cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}

# Print the space (in KB) that is available on the file system holding the given path.
# Arguments: $1 - path to check
function available_kb() {
    df --block-size=1K --output=avail "$1" | tail -n 1 | awk '{print $1}'
}

# Only install CUDA if specified version is not found.
# This is only relevant for users, the shipped CUDA installation will
# always be in versions instead of host_injections and have symlinks pointing
# to host_injections for everything we're not allowed to ship
# (existence of easybuild subdir implies a successful install)
if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then
    echo_green "CUDA software found! No need to install CUDA again, proceed with testing."
    exit 0
fi

# We need to be able write to the installation space so let's make sure we can
if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then
    fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA"
fi

# we need a directory we can use for temporary storage
if [[ -z "${CUDA_TEMP_DIR}" ]]; then
    tmpdir=$(mktemp -d) || fatal_error "Could not create a temporary directory"
else
    tmpdir="${CUDA_TEMP_DIR}"/temp
    # plain mkdir on purpose: an already existing directory is not ours to use (and remove)
    if ! mkdir "$tmpdir" ; then
        fatal_error "Could not create directory ${tmpdir}"
    fi
fi
# clean up tmpdir on every exit path, also when the installation fails
trap 'rm -rf -- "${tmpdir}"' EXIT

# All sizes below are in KB
required_space_in_tmpdir=50000
# Let's see if we have sources and build locations defined if not, we use the temporary space
if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
    export EASYBUILD_BUILDPATH=${tmpdir}/build
    required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi
if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
    export EASYBUILD_SOURCEPATH=${tmpdir}/sources
    required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi

# The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
# need to do a space check before we proceed
avail_space=$(available_kb "${cuda_install_parent}"/)
if [[ ! "${avail_space}" =~ ^[0-9]+$ ]]; then
    fatal_error "Could not determine the available disk space under ${cuda_install_parent}"
fi
if (( avail_space < 5000000 )); then
    fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..."
fi
avail_space=$(available_kb "${tmpdir}"/)
if [[ ! "${avail_space}" =~ ^[0-9]+$ ]]; then
    fatal_error "Could not determine the available disk space under ${tmpdir}"
fi
if (( avail_space < required_space_in_tmpdir )); then
    error="Need at least ${required_space_in_tmpdir} KB disk space under ${tmpdir}.\n"
    error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check. "
    error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
    error="${error}to reduce this requirement. Exiting now..."
    fatal_error "${error}"
fi

if [[ -z "${EBROOTEASYBUILD}" ]]; then
    echo_yellow "Loading EasyBuild module to do actual install"
    module load EasyBuild || fatal_error "Could not load the EasyBuild module"
fi

# We don't want hooks used in this install, we need a vanilla CUDA installation
touch "$tmpdir"/none.py || fatal_error "Could not create ${tmpdir}/none.py"

# we need the --rebuild option and a (random) dir for the module since we are
# fixing the broken links of the EESSI-shipped installation
eb_args=(
    --prefix="$tmpdir"
    --rebuild
    --installpath-modules="${tmpdir}"
    --hooks="$tmpdir"/none.py
    --installpath="${cuda_install_parent}"/
)
if ! eb "${eb_args[@]}" CUDA-"${install_cuda_version}".eb; then
    fatal_error "CUDA installation failed, please check EasyBuild logs..."
fi
echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!"
#!/usr/bin/env bash
#
# gpu_support/cuda_utils/test_cuda.sh
#
# Check that the GPU is usable from within EESSI by running deviceQuery from the CUDA-Samples
# installation that belongs to the given CUDA version.
#
# Usage:     test_cuda.sh <cuda_version>
# Arguments: $1 - CUDA version to test with, e.g. 11.3.1
# Globals:   EESSI_SOFTWARE_PATH (read, required), MODULEPATH (reset)
# Exit code: 0 if deviceQuery succeeds, non-zero otherwise

# Initialise our bash functions
TOPDIR=$(dirname "$(realpath "$0")")
source "$TOPDIR"/../../scripts/utils.sh

if [[ $# -eq 0 ]] ; then
    fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1"
fi
cuda_version=$1

check_eessi_initialised

# We can figure out the (EasyBuild MNS) CUDA samples version we need since we know the version suffix
cuda_samples_dirs=("${EESSI_SOFTWARE_PATH}"/software/CUDA-Samples/*-CUDA-"${cuda_version}")
# if several installations match, the last one in glob order is used
cuda_samples_dir=${cuda_samples_dirs[${#cuda_samples_dirs[@]}-1]}
# Without a match the glob stays literal and does not exist; stop here, an empty version would
# make 'module load' pick whatever the default CUDA-Samples module is
if [[ ! -e "${cuda_samples_dir}" ]]; then
    fatal_error "Could not find CUDA-Samples for CUDA ${cuda_version} in ${EESSI_SOFTWARE_PATH}/software/CUDA-Samples"
fi
cuda_samples_version=${cuda_samples_dir##*/}

# Test CUDA (making sure to use EasyBuild MNS)
unset MODULEPATH
module use "${EESSI_SOFTWARE_PATH}"/modules/all
module load CUDA-Samples/"${cuda_samples_version}"
ret=$?
if [ $ret -ne 0 ]; then
    fatal_error "Could not load CUDA samples module CUDA-Samples/${cuda_samples_version}\n (MODULEPATH=$MODULEPATH)..."
fi

if deviceQuery; then
    echo_green "Congratulations, your GPU is working with EESSI!"
else
    echo_yellow "Uff, your GPU doesn't seem to be working with EESSI :(" >&2
    exit "${ANY_ERROR_EXITCODE}"
fi

# Test another CUDA-enabled module from EESSI
# TODO: GROMACS?
# TODO: Include a GDR copy test?
###############################################################################################
# Make sure the EESSI environment has been initialised.
# Globals: EESSI_SOFTWARE_PATH (read)
# Returns: 0 if EESSI_SOFTWARE_PATH is set and non-empty, otherwise fatal_error is called
function check_eessi_initialised() {
    if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then
        fatal_error "EESSI has not been initialised!"
    fi
    return 0
}

# Compare two (float) numbers.
# Arguments: $1, $2 - numbers such as 11, 11.7 or -0.5
# Outputs:   diagnostics on stderr if the input is unusable
# Returns:   0 if $1 > $2; $ANY_ERROR_EXITCODE if it is not, or if the input is unusable
function float_greater_than() {
    local float_regex='^[+-]?[0-9]+\.?[0-9]*$'
    local value
    # Make sure we have two arguments
    if [ $# -ne 2 ]; then
        echo_red "float_greater_than requires two (float) numbers" >&2
        return "${ANY_ERROR_EXITCODE:-1}"
    fi
    # Make sure the arguments are numbers
    for value in "$1" "$2"; do
        if [[ ! ${value} =~ ${float_regex} ]]; then
            echo_yellow "Input to float_greater_than is not a float, ignoring" >&2
            return "${ANY_ERROR_EXITCODE:-1}"
        fi
    done
    # Now do the actual evaluation (bash itself only does integer arithmetic); the "+ 0"
    # forces awk to compare numbers rather than strings
    if awk -v a="$1" -v b="$2" 'BEGIN { exit !((a + 0) > (b + 0)) }'; then
        return 0
    fi
    return "${ANY_ERROR_EXITCODE:-1}"
}

# Make sure we are running in the shell of the Gentoo Prefix environment.
# Globals: EPREFIX, SHELL (read)
# Returns: 0 if $SHELL is $EPREFIX/bin/bash, otherwise fatal_error is called
function check_in_prefix_shell() {
    # Make sure EPREFIX is defined
    if [[ -z "${EPREFIX}" ]]; then
        fatal_error "This script cannot be used without having first defined EPREFIX"
    fi
    # the right-hand side is quoted so that it is compared literally, not as a glob pattern
    if [[ "${SHELL}" != "${EPREFIX}/bin/bash" ]]; then
        fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!"
    fi
}

# Create a directory (including missing parents) and make sure it is writable.
# Arguments: $1 - (relative or absolute) path of the directory
# Outputs:   an explanation on stderr if the directory cannot be created or written to
# Returns:   0 on success, non-zero otherwise
function create_directory_structure() {
    local dir_structure error_message real_dir
    local return_code=0
    # Ensure we are given a single path argument
    if [ $# -ne 1 ]; then
        echo_red "Function requires a single (relative or absolute) path argument" >&2
        return "${ANY_ERROR_EXITCODE:-1}"
    fi
    dir_structure="$1"

    # Attempt to create the directory structure
    error_message=$(mkdir -p -- "$dir_structure" 2>&1) || return_code=$?
    if [ "${return_code}" -ne 0 ]; then
        # If it fails be explicit about the error
        real_dir=$(realpath -m -- "$dir_structure")
        echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2
    elif [ ! -w "${dir_structure}" ]; then
        # If we're creating it, our use case is that we want to be able to write there
        # (this is a check in case the directory already existed)
        real_dir=$(realpath -m -- "$dir_structure")
        echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" >&2
        return_code=${ANY_ERROR_EXITCODE:-1}
    fi

    return "${return_code}"
}
echo "${hipv4}"