From a86c61462b5d0dab3df979b50aed79ad1b055d0a Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 11:01:25 +0100 Subject: [PATCH 01/49] Add CUDA support to software_layer --- EESSI-pilot-install-software.sh | 28 ++++ eb_hooks.py | 265 ++++++++++++++++++++++---------- eessi-2021.12.yml | 14 ++ 3 files changed, 227 insertions(+), 80 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 2bc6876965..1698066bf4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -394,6 +394,34 @@ $EB --from-pr 15885 OpenBLAS-0.3.15-GCC-10.3.0.eb --robot $EB SciPy-bundle-2021.05-foss-2021a.eb -r --buildpath /dev/shm/$USER/easybuild_build check_exit_code $? "${ok_msg}" "${fail_msg}" +# CUDA support + +# install p7zip (to be able to unpack RPMs) +p7zip_ec="p7zip-17.04-GCCcore-10.3.0.eb" +echo ">> Installing $p7zip_ec..." +ok_msg="$p7zip_ec installed, off to a good (?) start!" +fail_msg="Failed to install $p7zip_ec, woopsie..." +$EB $p7zip_ec --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA (uses eb_hooks.py to only install runtime) +cuda_ec="CUDA-11.3.1.eb" +echo ">> Installing $cuda_ec..." +ok_msg="$cuda_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_ec, woopsie..." +$EB $cuda_ec --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA samples (requires EESSI support for CUDA) +# TODO Run EESSI NVIDIA GPU support script here +# (which unbreaks the symlinks from the runtime installation) +cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" +echo ">> Installing $cuda_samples_ec..." +ok_msg="$cuda_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_samples_ec, woopsie..." +$EB $cuda_samples_ec --robot --from-pr=16914 +check_exit_code $? "${ok_msg}" "${fail_msg}" + ### add packages here echo ">> Creating/updating Lmod cache..." 
diff --git a/eb_hooks.py b/eb_hooks.py index df7742f999..c5a0ca9cca 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -8,51 +8,56 @@ from easybuild.tools.systemtools import AARCH64, POWER, X86_64, get_cpu_architecture, get_cpu_features from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC -EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' +EESSI_RPATH_OVERRIDE_ATTR = "orig_rpath_override_dirs" + +CUDA_ENABLED_TOOLCHAINS = [ + "fosscuda", + "gcccuda", + "gimpic", + "giolfc", + "gmklc", + "golfc", + "gomklc", + "gompic", + "goolfc", + "iccifortcuda", + "iimklc", + "iimpic", + "intelcuda", + "iomklc", + "iompic", + "nvompic", + "nvpsmpic", +] +PARSE_HOOKS = { + "CGAL": cgal_toolchainopts_precise, + "fontconfig": fontconfig_add_fonts, + "UCX": ucx_eprefix, +} -def get_eessi_envvar(eessi_envvar): - """Get an EESSI environment variable from the environment""" - - eessi_envvar_value = os.getenv(eessi_envvar) - if eessi_envvar_value is None: - raise EasyBuildError("$%s is not defined!", eessi_envvar) - - return eessi_envvar_value - - -def get_rpath_override_dirs(software_name): - # determine path to installations in software layer via $EESSI_SOFTWARE_PATH - eessi_software_path = get_eessi_envvar('EESSI_SOFTWARE_PATH') - eessi_pilot_version = get_eessi_envvar('EESSI_PILOT_VERSION') - - # construct the rpath override directory stub - rpath_injection_stub = os.path.join( - # Make sure we are looking inside the `host_injections` directory - eessi_software_path.replace(eessi_pilot_version, os.path.join('host_injections', eessi_pilot_version), 1), - # Add the subdirectory for the specific software - 'rpath_overrides', - software_name, - # We can't know the version, but this allows the use of a symlink - # to facilitate version upgrades without removing files - 'system', - ) - - # Allow for libraries in lib or lib64 - rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ('lib', 'lib64')] +PRE_CONFIGURE_HOOKS = { + "libfabric": 
libfabric_disable_psm3_x86_64_generic, + "MetaBAT": metabat_preconfigure, + "WRF": wrf_preconfigure, +} - return rpath_injection_dirs +POST_PACKAGE_HOOKS = { + "CUDA": cuda_postpackage, +} def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" # determine path to Prefix installation in compat layer via $EPREFIX - eprefix = get_eessi_envvar('EPREFIX') + eprefix = get_eessi_envvar("EPREFIX") if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + ec = inject_gpu_property(ec) + def pre_configure_hook(self, *args, **kwargs): """Main pre-configure hook: trigger custom functions based on software name.""" @@ -74,19 +79,21 @@ def pre_prepare_hook(self, *args, **kwargs): # update the relevant option (but keep the original value so we can reset it later) if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): - raise EasyBuildError("'self' already has attribute %s! Can't use pre_prepare hook.", - EESSI_RPATH_OVERRIDE_ATTR) + raise EasyBuildError( + "'self' already has attribute %s! 
Can't use pre_prepare hook.", EESSI_RPATH_OVERRIDE_ATTR + ) - setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option('rpath_override_dirs')) + setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option("rpath_override_dirs")) if getattr(self, EESSI_RPATH_OVERRIDE_ATTR): # self.EESSI_RPATH_OVERRIDE_ATTR is (already) a colon separated string, let's make it a list orig_rpath_override_dirs = [getattr(self, EESSI_RPATH_OVERRIDE_ATTR)] - rpath_override_dirs = ':'.join(orig_rpath_override_dirs + mpi_rpath_override_dirs) + rpath_override_dirs = ":".join(orig_rpath_override_dirs + mpi_rpath_override_dirs) else: - rpath_override_dirs = ':'.join(mpi_rpath_override_dirs) - update_build_option('rpath_override_dirs', rpath_override_dirs) - print_msg("Updated rpath_override_dirs (to allow overriding MPI family %s): %s", - mpi_family, rpath_override_dirs) + rpath_override_dirs = ":".join(mpi_rpath_override_dirs) + update_build_option("rpath_override_dirs", rpath_override_dirs) + print_msg( + "Updated rpath_override_dirs (to allow overriding MPI family %s): %s", mpi_family, rpath_override_dirs + ) def post_prepare_hook(self, *args, **kwargs): @@ -94,30 +101,78 @@ def post_prepare_hook(self, *args, **kwargs): if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): # Reset the value of 'rpath_override_dirs' now that we are finished with it - update_build_option('rpath_override_dirs', getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) + update_build_option("rpath_override_dirs", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) print_msg("Resetting rpath_override_dirs to original value: %s", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) delattr(self, EESSI_RPATH_OVERRIDE_ATTR) +def pre_configure_hook(self, *args, **kwargs): + """Main pre-configure hook: trigger custom functions based on software name.""" + if self.name in PRE_CONFIGURE_HOOKS: + PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) + + +def post_package_hook(self, *args, **kwargs): + """Main post-package hook: trigger custom functions based on software 
name.""" + if self.name in POST_PACKAGE_HOOKS: + POST_PACKAGE_HOOKS[self.name](self, *args, **kwargs) + + +# Functions used by hooks + + +def get_eessi_envvar(eessi_envvar): + """Get an EESSI environment variable from the environment""" + + eessi_envvar_value = os.getenv(eessi_envvar) + if eessi_envvar_value is None: + raise EasyBuildError("$%s is not defined!", eessi_envvar) + + return eessi_envvar_value + + +def get_rpath_override_dirs(software_name): + # determine path to installations in software layer via $EESSI_SOFTWARE_PATH + eessi_software_path = get_eessi_envvar("EESSI_SOFTWARE_PATH") + eessi_pilot_version = get_eessi_envvar("EESSI_PILOT_VERSION") + + # construct the rpath override directory stub + rpath_injection_stub = os.path.join( + # Make sure we are looking inside the `host_injections` directory + eessi_software_path.replace(eessi_pilot_version, os.path.join("host_injections", eessi_pilot_version), 1), + # Add the subdirectory for the specific software + "rpath_overrides", + software_name, + # We can't know the version, but this allows the use of a symlink + # to facilitate version upgrades without removing files + "system", + ) + + # Allow for libraries in lib or lib64 + rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ("lib", "lib64")] + + return rpath_injection_dirs + + def cgal_toolchainopts_precise(ec, eprefix): """Enable 'precise' rather than 'strict' toolchain option for CGAL on POWER.""" - if ec.name == 'CGAL': + if ec.name == "CGAL": if get_cpu_architecture() == POWER: # 'strict' implies '-mieee-fp', which is not supported on POWER # see https://github.com/easybuilders/easybuild-framework/issues/2077 - ec['toolchainopts']['strict'] = False - ec['toolchainopts']['precise'] = True - print_msg("Tweaked toochainopts for %s: %s", ec.name, ec['toolchainopts']) + ec["toolchainopts"]["strict"] = False + ec["toolchainopts"]["precise"] = True + print_msg("Tweaked toochainopts for %s: %s", ec.name, ec["toolchainopts"]) else: raise 
EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!") def fontconfig_add_fonts(ec, eprefix): """Inject --with-add-fonts configure option for fontconfig.""" - if ec.name == 'fontconfig': + if ec.name == "fontconfig": # make fontconfig aware of fonts included with compat layer - with_add_fonts = '--with-add-fonts=%s' % os.path.join(eprefix, 'usr', 'share', 'fonts') - ec.update('configopts', with_add_fonts) + with_add_fonts = "--with-add-fonts=%s" % os.path.join(eprefix, "usr", "share", "fonts") + ec.update("configopts", with_add_fonts) print_msg("Added '%s' configure option for %s", with_add_fonts, ec.name) else: raise EasyBuildError("fontconfig-specific hook triggered for non-fontconfig easyconfig?!") @@ -125,29 +180,23 @@ def fontconfig_add_fonts(ec, eprefix): def ucx_eprefix(ec, eprefix): """Make UCX aware of compatibility layer via additional configuration options.""" - if ec.name == 'UCX': - ec.update('configopts', '--with-sysroot=%s' % eprefix) - ec.update('configopts', '--with-rdmacm=%s' % os.path.join(eprefix, 'usr')) - print_msg("Using custom configure options for %s: %s", ec.name, ec['configopts']) + if ec.name == "UCX": + ec.update("configopts", "--with-sysroot=%s" % eprefix) + ec.update("configopts", "--with-rdmacm=%s" % os.path.join(eprefix, "usr")) + print_msg("Using custom configure options for %s: %s", ec.name, ec["configopts"]) else: raise EasyBuildError("UCX-specific hook triggered for non-UCX easyconfig?!") -def pre_configure_hook(self, *args, **kwargs): - """Main pre-configure hook: trigger custom functions based on software name.""" - if self.name in PRE_CONFIGURE_HOOKS: - PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) - - def libfabric_disable_psm3_x86_64_generic(self, *args, **kwargs): """Add --disable-psm3 to libfabric configure options when building with --optarch=GENERIC on x86_64.""" - if self.name == 'libfabric': + if self.name == "libfabric": if get_cpu_architecture() == X86_64: - generic = 
build_option('optarch') == OPTARCH_GENERIC - no_avx = 'avx' not in get_cpu_features() + generic = build_option("optarch") == OPTARCH_GENERIC + no_avx = "avx" not in get_cpu_features() if generic or no_avx: - self.cfg.update('configopts', '--disable-psm3') - print_msg("Using custom configure options for %s: %s", self.name, self.cfg['configopts']) + self.cfg.update("configopts", "--disable-psm3") + print_msg("Using custom configure options for %s: %s", self.name, self.cfg["configopts"]) else: raise EasyBuildError("libfabric-specific hook triggered for non-libfabric easyconfig?!") @@ -158,10 +207,10 @@ def metabat_preconfigure(self, *args, **kwargs): - take into account that zlib is a filtered dependency, and that there's no libz.a in the EESSI compat layer """ - if self.name == 'MetaBAT': - configopts = self.cfg['configopts'] + if self.name == "MetaBAT": + configopts = self.cfg["configopts"] regex = re.compile(r"\$EBROOTZLIB/lib/libz.a") - self.cfg['configopts'] = regex.sub('$EPREFIX/usr/lib64/libz.so', configopts) + self.cfg["configopts"] = regex.sub("$EPREFIX/usr/lib64/libz.so", configopts) else: raise EasyBuildError("MetaBAT-specific hook triggered for non-MetaBAT easyconfig?!") @@ -171,24 +220,80 @@ def wrf_preconfigure(self, *args, **kwargs): Pre-configure hook for WRF: - patch arch/configure_new.defaults so building WRF with foss toolchain works on aarch64 """ - if self.name == 'WRF': + if self.name == "WRF": if get_cpu_architecture() == AARCH64: pattern = "Linux x86_64 ppc64le, gfortran" repl = "Linux x86_64 aarch64 ppc64le, gfortran" - self.cfg.update('preconfigopts', "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) - print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg['preconfigopts']) + self.cfg.update("preconfigopts", "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) + print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg["preconfigopts"]) else: raise 
EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") -PARSE_HOOKS = { - 'CGAL': cgal_toolchainopts_precise, - 'fontconfig': fontconfig_add_fonts, - 'UCX': ucx_eprefix, -} - -PRE_CONFIGURE_HOOKS = { - 'libfabric': libfabric_disable_psm3_x86_64_generic, - 'MetaBAT': metabat_preconfigure, - 'WRF': wrf_preconfigure, -} +def cuda_post_package(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, "EULA.txt") + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, they're not really needed and they only complicate things + whitelist = [] + file_extensions = [".so", ".a", ".h", ".bc"] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split(".")[0]) + whitelist = list(set(whitelist)) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. 
not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split(".")[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace("versions", "host_injections") + os.remove(source) + # Using os.symlink requires the existence of the target directory, so we use os.system + os.system("ln %s %s" % (target, source)) + + +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ( + "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])] + or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS + ): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") + key = "modluafooter" + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict[ + "builddependencies" + ] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + return ec diff --git a/eessi-2021.12.yml b/eessi-2021.12.yml index 977c0f9804..bbfadb9daa 100644 --- a/eessi-2021.12.yml +++ b/eessi-2021.12.yml @@ -1,4 +1,14 @@ software: + CUDA: + toolchains: + SYSTEM: + versions: '11.3.1' + CUDA-Samples: + toolchains: + GCC-10.3.0: + versions: + '11.3': + versionsuffix: -CUDA-11.3.1 code-server: toolchains: SYSTEM: @@ -29,6 +39,10 @@ software: toolchains: gompi-2020a: versions: ['5.6.3'] + p7zip: + toolchains: + GCCcore-10.3.0: + versions: ['17.04'] 
QuantumESPRESSO: toolchains: foss-2020a: From 6c41b262d020ce818baa9aedc19c4322d2dee33d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:04:45 +0100 Subject: [PATCH 02/49] singularity install does not seem to install mksquashfs --- .github/workflows/tests_scripts.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 18fcd7b255..24ce26ab61 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -35,6 +35,7 @@ jobs: curl -OL https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/s/$singularity_rpm sudo alien -d $singularity_rpm sudo apt install ./singularity*.deb + sudo apt install mksquashfs singularity --version - name: test install_software_layer.sh script From 7d53b030d7acb343baab2866842a913d795fd363 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:07:38 +0100 Subject: [PATCH 03/49] Trigger script test --- EESSI-pilot-install-software.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 1698066bf4..16c7c48aa2 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -443,3 +443,4 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" echo ">> Cleaning up ${TMPDIR}..." rm -r ${TMPDIR} +echo nothing From 58357b9b6a405e3cc3d2f6e4bfc173e5f519021b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:11:24 +0100 Subject: [PATCH 04/49] Revert --- EESSI-pilot-install-software.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 16c7c48aa2..1698066bf4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -443,4 +443,3 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" echo ">> Cleaning up ${TMPDIR}..." 
rm -r ${TMPDIR} -echo nothing From 4b6654dcb5a9e9957784adc9c824fa8081842af7 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:13:29 +0100 Subject: [PATCH 05/49] Use the right package name for squash-fs --- .github/workflows/tests_scripts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 24ce26ab61..acacaa54a1 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -35,7 +35,7 @@ jobs: curl -OL https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/s/$singularity_rpm sudo alien -d $singularity_rpm sudo apt install ./singularity*.deb - sudo apt install mksquashfs + sudo apt install squashfs-tools singularity --version - name: test install_software_layer.sh script From 33ce58497b3cc5beff81b5d8f0265d997a0b4b15 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 14:14:09 +0100 Subject: [PATCH 06/49] Tidy up hooks --- eb_hooks.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index c5a0ca9cca..3ab991771f 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -30,22 +30,6 @@ "nvpsmpic", ] -PARSE_HOOKS = { - "CGAL": cgal_toolchainopts_precise, - "fontconfig": fontconfig_add_fonts, - "UCX": ucx_eprefix, -} - -PRE_CONFIGURE_HOOKS = { - "libfabric": libfabric_disable_psm3_x86_64_generic, - "MetaBAT": metabat_preconfigure, - "WRF": wrf_preconfigure, -} - -POST_PACKAGE_HOOKS = { - "CUDA": cuda_postpackage, -} - def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" @@ -230,7 +214,7 @@ def wrf_preconfigure(self, *args, **kwargs): raise EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") -def cuda_post_package(self, *args, **kwargs): +def cuda_postpackage(self, *args, **kwargs): """Delete CUDA files we are not allowed to ship and replace them with a symlink 
to a possible installation under host_injections.""" print_msg("Replacing CUDA stuff we cannot ship with symlinks...") # read CUDA EULA @@ -297,3 +281,19 @@ def inject_gpu_property(ec): else: ec[key] = value return ec + +PARSE_HOOKS = { + "CGAL": cgal_toolchainopts_precise, + "fontconfig": fontconfig_add_fonts, + "UCX": ucx_eprefix, +} + +PRE_CONFIGURE_HOOKS = { + "libfabric": libfabric_disable_psm3_x86_64_generic, + "MetaBAT": metabat_preconfigure, + "WRF": wrf_preconfigure, +} + +POST_PACKAGE_HOOKS = { + "CUDA": cuda_postpackage, +} From f1cd893823227a934e5537bba50cc6b5e9358502 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 15:25:32 +0100 Subject: [PATCH 07/49] Force creation of links --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 3ab991771f..ad55573660 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -252,7 +252,7 @@ def cuda_postpackage(self, *args, **kwargs): target = source.replace("versions", "host_injections") os.remove(source) # Using os.symlink requires the existence of the target directory, so we use os.system - os.system("ln %s %s" % (target, source)) + os.system("ln -s %s %s" % (target, source)) def inject_gpu_property(ec): From 06a9eaf5eb4118528ac130be31c9b1af26ca4306 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Mon, 19 Dec 2022 23:06:44 +0100 Subject: [PATCH 08/49] Install host_injections CUDA --- EESSI-pilot-install-software.sh | 3 ++ eb_hooks.py | 6 ++-- install_cuda_host_injections.sh | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100755 install_cuda_host_injections.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 1698066bf4..94b71e14c9 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -412,6 +412,9 @@ fail_msg="Failed to install $cuda_ec, woopsie..." $EB $cuda_ec --robot check_exit_code $? 
"${ok_msg}" "${fail_msg}" +# Add the host_injections CUDA so we can actually build CUDA apps +./install_cuda_host_injections.sh 11.3.1 + # install CUDA samples (requires EESSI support for CUDA) # TODO Run EESSI NVIDIA GPU support script here # (which unbreaks the symlinks from the runtime installation) diff --git a/eb_hooks.py b/eb_hooks.py index ad55573660..4d31a5f4b4 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -232,7 +232,7 @@ def cuda_postpackage(self, *args, **kwargs): elif copy: tmp_buffer.append(line) # create whitelist without file extensions, they're not really needed and they only complicate things - whitelist = [] + whitelist = ['eula'] file_extensions = [".so", ".a", ".h", ".bc"] for tmp in tmp_buffer: for word in tmp.split(): @@ -252,7 +252,9 @@ def cuda_postpackage(self, *args, **kwargs): target = source.replace("versions", "host_injections") os.remove(source) # Using os.symlink requires the existence of the target directory, so we use os.system - os.system("ln -s %s %s" % (target, source)) + system_command="ln -s %s %s" % (target, source) + if os.system(system_command) != 0: + raise EasyBuildError("Failed to create symbolic link: %s" % system_command) def inject_gpu_property(ec): diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh new file mode 100755 index 0000000000..038ff19ff3 --- /dev/null +++ b/install_cuda_host_injections.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +if [[ $# -eq 0 ]] ; then + echo 'You must provide the CUDA version as an argument, e.g.:' + echo " $0 11.3.1" + exit 1 +fi +install_cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + echo "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" + exit 1 +else + # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) + cuda_install_dir=${EESSI_SOFTWARE_PATH/versions/host_injections} +fi + +# 
Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +# (existence of easybuild subdir implies a successful install) +if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version}/easybuild ]; then + echo "CUDA software found! No need to install CUDA again, proceed with testing." +else + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed + avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') + if (( ${avail_space} < 16000000 )); then + echo "Need more disk space to install CUDA, exiting now..." + exit 1 + fi + if [[ ! -z "${EBROOTEASYBUILD}" ]]; then + echo "Loading EasyBuild module to do actual install" + module load EasyBuild + fi + # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + tmpdir=$(mktemp -d) + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch $tmpdir/none.py + eb ${extra_args} --hooks=$tmpdir/none.py --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "CUDA installation failed, please check EasyBuild logs..." 
+ exit 1 + fi + # clean up tmpdir if it exists + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + rm -rf ${tmpdir} + fi +fi From b4e80a153390be28dda32fba288f780a5899d430 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Mon, 19 Dec 2022 23:47:06 +0100 Subject: [PATCH 09/49] Move comments to the right place --- EESSI-pilot-install-software.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 94b71e14c9..b7752d3e83 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -413,11 +413,10 @@ $EB $cuda_ec --robot check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps +# (which unbreaks the symlinks from the runtime installation) ./install_cuda_host_injections.sh 11.3.1 # install CUDA samples (requires EESSI support for CUDA) -# TODO Run EESSI NVIDIA GPU support script here -# (which unbreaks the symlinks from the runtime installation) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" echo ">> Installing $cuda_samples_ec..." ok_msg="$cuda_ec installed, off to a good (?) start!" From 2c8697330155ca91a3ea78f2e3de4d46134877cf Mon Sep 17 00:00:00 2001 From: ocaisa Date: Tue, 14 Feb 2023 17:11:18 +0100 Subject: [PATCH 10/49] Reimplement `mkdir -p` reporting where permissions break down This will allow us to log where creating directory structures under `host_injections` is breaking down. 
--- scripts/utils.sh | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/scripts/utils.sh b/scripts/utils.sh index 5d8455bb68..06fc897912 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -31,3 +31,71 @@ function check_exit_code { fatal_error "${fail_msg}" fi } + +# Reimplement 'mkdir -p' with reporting on where permissions break down +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo "Function requires a single (relative or absolute) path argument" >&2 + return 1 + fi + + # set a persistent variable that knows the full structure + # (i.e., retains the value upon recursive calls) + full_structure="${full_structure:="$1"}" + + local directory_structure="$1" + + # Check if directory exists and is writeable + if [ -d "${directory_structure}" ]; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + if [ -w "${directory_structure}" ]; then + # Nothing to be done + return 0 + else + echo "Directory ${directory_structure} exists but is not writeable" >&2 + return 1 + fi + fi + + local directory_structure_parent=$(dirname "${directory_structure}") + + # If the parent doesn't exist we need to create it + if [ ! -d "${directory_structure_parent}" ]; then + # Create the parent via a recursive call to this function + # (if this doesn't succeed we need to return the error code) + if ! create_directory_structure "${directory_structure_parent}"; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + return 1 + fi + fi + + # Check the parent is writeable, and create the new subdir + if [ -w "${directory_structure_parent}" ]; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + if ! 
mkdir "${directory_structure}"; then + echo "'mkdir ${directory_structure}' failed for an unknown reason!" >&2 + return 1 + else + # Success! + return 0 + fi + else + echo "Attempt to create ${full_structure} failed," \ + "${directory_structure_parent} exists but you don't have write permissions." >&2 + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our global variable + unset full_structure + fi + return 1 + fi +} From 9590047d008f3be05fc809b6b4b3bbd53c1810cb Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 24 Feb 2023 15:47:34 +0100 Subject: [PATCH 11/49] Be more agressive on catching errors --- EESSI-pilot-install-software.sh | 14 ++++++ install_cuda_host_injections.sh | 86 +++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 2d1132efdb..d702e96778 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -431,6 +431,16 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support +# Need recent version of EasyBuild +echo ">> Installing EasyBuild 4.5.1..." +ok_msg="EasyBuild v4.7.0 installed" +fail_msg="EasyBuild v4.7.0 failed to install" +$EB --from-pr 17065 --include-easyblocks-from-pr 2893 +check_exit_code $? "${ok_msg}" "${fail_msg}" + +LMOD_IGNORE_CACHE=1 module swap EasyBuild/4.7.0 +check_exit_code $? "Swapped to EasyBuild/4.7.0" "Couldn't swap to EasyBuild/4.7.0" + # install p7zip (to be able to unpack RPMs) p7zip_ec="p7zip-17.04-GCCcore-10.3.0.eb" echo ">> Installing $p7zip_ec..." @@ -450,6 +460,10 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) ./install_cuda_host_injections.sh 11.3.1 +echo ">> Installing $cuda_ec under host_injections..." +ok_msg="$cuda_ec (re)installed under host_injections!" 
+fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." +check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index 038ff19ff3..59c1d72996 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -1,18 +1,18 @@ #!/bin/bash +# Initialise our bash functions +source scripts/utils.sh + if [[ $# -eq 0 ]] ; then - echo 'You must provide the CUDA version as an argument, e.g.:' - echo " $0 11.3.1" - exit 1 + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" fi install_cuda_version=$1 if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - echo "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" - exit 1 + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" else # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) - cuda_install_dir=${EESSI_SOFTWARE_PATH/versions/host_injections} + cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} fi # Only install CUDA if specified version is not found. @@ -20,34 +20,68 @@ fi # always be in versions instead of host_injections and have symlinks pointing # to host_injections for everything we're not allowed to ship # (existence of easybuild subdir implies a successful install) -if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version}/easybuild ]; then - echo "CUDA software found! No need to install CUDA again, proceed with testing." +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again, proceed with testing." 
else - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed - avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') - if (( ${avail_space} < 16000000 )); then - echo "Need more disk space to install CUDA, exiting now..." - exit 1 - fi - if [[ ! -z "${EBROOTEASYBUILD}" ]]; then - echo "Loading EasyBuild module to do actual install" + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." 
+ fi + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." + fatal_error "${error1}${error2}" + fi + + if [[ -z "${EBROOTEASYBUILD}" ]]; then + echo_yellow "Loading EasyBuild module to do actual install" module load EasyBuild fi - # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + + # we need the --rebuild option and a (random) dir for the module if the module + # file is shipped with EESSI if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - tmpdir=$(mktemp -d) extra_args="--rebuild --installpath-modules=${tmpdir}" fi # We don't want hooks used in this install, we need a vanilla CUDA installation - touch $tmpdir/none.py - eb ${extra_args} --hooks=$tmpdir/none.py --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + touch "$tmpdir"/none.py + eb "${extra_args}" --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then - echo "CUDA installation failed, please check EasyBuild logs..." - exit 1 - fi - # clean up tmpdir if it exists - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - rm -rf ${tmpdir} + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" 
fi + # clean up tmpdir + rm -rf "${tmpdir}" fi From 1357f763b5f27a0be82ed6c443ce3213a7e8361e Mon Sep 17 00:00:00 2001 From: ocaisa Date: Mon, 27 Feb 2023 16:43:08 +0100 Subject: [PATCH 12/49] `${extra_args}` is actually multiple args not a single string --- install_cuda_host_injections.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index 59c1d72996..b6f83b4e9d 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -75,7 +75,7 @@ else fi # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py - eb "${extra_args}" --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then fatal_error "CUDA installation failed, please check EasyBuild logs..." From 8096c5460c583f88d83bb8f2b00f204fb8830d3c Mon Sep 17 00:00:00 2001 From: ocaisa Date: Mon, 27 Feb 2023 16:46:33 +0100 Subject: [PATCH 13/49] Update EESSI-pilot-install-software.sh --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index d702e96778..5b604deaa7 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -468,7 +468,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" echo ">> Installing $cuda_samples_ec..." -ok_msg="$cuda_ec installed, off to a good (?) start!" +ok_msg="$cuda_samples_ec installed, off to a good (?) start!" fail_msg="Failed to install $cuda_samples_ec, woopsie..." $EB $cuda_samples_ec --robot --from-pr=16914 check_exit_code $? 
"${ok_msg}" "${fail_msg}" From ec31edfbb1954982f197cd44cd296a62e2f47954 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 14:46:56 +0100 Subject: [PATCH 14/49] Catching echo exit code instead of actual code --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 5b604deaa7..a9f6576866 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -459,8 +459,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) -./install_cuda_host_injections.sh 11.3.1 echo ">> Installing $cuda_ec under host_injections..." +./install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" From 0e99db5127e2735828d22a415aeeb194216f1378 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 14:48:52 +0100 Subject: [PATCH 15/49] Give a full path to the CUDA host injections script --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index a9f6576866..c715dd55ad 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -460,7 +460,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -./install_cuda_host_injections.sh 11.3.1 +$(dirname "$BASH_SOURCE")/install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? 
"${ok_msg}" "${fail_msg}" From cd11792609b4a637f324748089da61f1456827f5 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 15:29:02 +0100 Subject: [PATCH 16/49] Add checks for some whitelist entries for CUDA --- eb_hooks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 4d31a5f4b4..c7358d5f13 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -232,13 +232,18 @@ def cuda_postpackage(self, *args, **kwargs): elif copy: tmp_buffer.append(line) # create whitelist without file extensions, they're not really needed and they only complicate things - whitelist = ['eula'] + whitelist = ['EULA'] file_extensions = [".so", ".a", ".h", ".bc"] for tmp in tmp_buffer: for word in tmp.split(): if any(ext in word for ext in file_extensions): whitelist.append(word.split(".")[0]) whitelist = list(set(whitelist)) + # Do some quick checks for things we should or shouldn't have in the list + if "nvcc" in whitelist: + raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) + if "libcudart" not in whitelist: + raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) # iterate over all files in the CUDA path for root, dirs, files in os.walk(self.installdir): for filename in files: From f514f8155fd6f02e894fc48e4a7a83d046b08222 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 15:56:24 +0100 Subject: [PATCH 17/49] Fix failing eb installation --- EESSI-pilot-install-software.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index c715dd55ad..f97894df74 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -432,10 +432,10 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support # Need recent version of EasyBuild -echo ">> Installing EasyBuild 4.5.1..." +echo ">> Installing EasyBuild 4.7.0..." 
ok_msg="EasyBuild v4.7.0 installed" fail_msg="EasyBuild v4.7.0 failed to install" -$EB --from-pr 17065 --include-easyblocks-from-pr 2893 +$EB --from-pr 17065 --include-easyblocks-from-pr 2893 --try-amend=use_pip=1 check_exit_code $? "${ok_msg}" "${fail_msg}" LMOD_IGNORE_CACHE=1 module swap EasyBuild/4.7.0 From be326a1e5842460035ad6dffc580c73395a4c8b2 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 1 Mar 2023 21:02:01 +0100 Subject: [PATCH 18/49] Make sure we check space in the right places --- install_cuda_host_injections.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index b6f83b4e9d..0684c7ca80 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -38,7 +38,6 @@ else fi fi - required_space_in_tmpdir=50000 # Let's see if we have sources and build locations defined if not, we use the temporary space if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then @@ -52,11 +51,11 @@ else # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), # need to do a space check before we proceed - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < 5000000 )); then fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." fi - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." 
From a9cc56cf1363d0b806571b6eac1c0720e3b408a3 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 2 Mar 2023 11:13:14 +0100 Subject: [PATCH 19/49] Bring GPU support in line with #212 --- gpu_support/README.md | 27 +++++ gpu_support/add_amd_gpu_support.sh | 14 +++ gpu_support/add_nvidia_gpu_support.sh | 77 +++++++++++++ gpu_support/cuda_utils/get_cuda_compatlibs.sh | 19 ++++ .../cuda_utils/install_cuda_compatlibs.sh | 92 ++++++++++++++++ .../install_cuda_compatlibs_loop.sh | 103 ++++++++++++++++++ .../cuda_utils/prepare_cuda_compatlibs.sh | 31 ++++++ gpu_support/cuda_utils/test_cuda.sh | 82 ++++++++++++++ 8 files changed, 445 insertions(+) create mode 100644 gpu_support/README.md create mode 100755 gpu_support/add_amd_gpu_support.sh create mode 100755 gpu_support/add_nvidia_gpu_support.sh create mode 100755 gpu_support/cuda_utils/get_cuda_compatlibs.sh create mode 100644 gpu_support/cuda_utils/install_cuda_compatlibs.sh create mode 100644 gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh create mode 100755 gpu_support/cuda_utils/prepare_cuda_compatlibs.sh create mode 100644 gpu_support/cuda_utils/test_cuda.sh diff --git a/gpu_support/README.md b/gpu_support/README.md new file mode 100644 index 0000000000..226a7fca6c --- /dev/null +++ b/gpu_support/README.md @@ -0,0 +1,27 @@ +# How to add GPU support +The collection of scripts in this directory enables you to add GPU support to your setup. +Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free to contribute that though!). +To enable the usage of the CUDA runtime in your setup, simply run the following script: +``` +./add_nvidia_gpu_support.sh +``` +This script will install the compatibility libraries (and only those by default!) you need to use the shipped runtime environment of CUDA. + +If you plan on using the full CUDA suite, i.e. 
if you want to load the CUDA module, you will have to modify the script execution as follows: +``` +export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh +``` +This will again install the needed compatibility libraries as well as the whole CUDA suite. + +If you need a different CUDA version than what is shipped with EESSI, you can also specify that particular version for the script: +``` +export INSTALL_CUDA_VERSION=xx.y.z && export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh +``` +Please note, however, that versions for which the runtime is not shipped with EESSI are not installed in the default modules path. +Thus, you will have to add the following to your modules path to get access to your custom CUDA version: +``` +module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/ +``` +## Prerequisites and tips +* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. +* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). 
diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh new file mode 100755 index 0000000000..29c8abdc88 --- /dev/null +++ b/gpu_support/add_amd_gpu_support.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +cat << EOF +This is not implemented yet :( + +If you would like to contribute this support there are a few things you will +need to consider: +- We will need to change the Lmod property added to GPU software so we can + distinguish AMD and Nvidia GPUs +- Support should be implemented in user space, if this is not possible (e.g., + requires a driver update) you need to tell the user what to do +- Support needs to be _verified_ and a trigger put in place (like the existence + of a particular path) so we can tell Lmod to display the associated modules +EOF diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh new file mode 100755 index 0000000000..8706b492eb --- /dev/null +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Drop into the prefix shell or pipe this script into a Prefix shell with +# $EPREFIX/startprefix <<< /path/to/this_script.sh + +install_cuda="${INSTALL_CUDA:=false}" +eessi_version="${EESSI_PILOT_VERSION:=latest}" + +# If you want to install CUDA support on login nodes (typically without GPUs), +# set this variable to true. This will skip all GPU-dependent checks +install_wo_gpu=false +[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true + +# verify existence of nvidia-smi or this is a waste of time +# Check if nvidia-smi exists and can be executed without error +if [[ "${install_wo_gpu}" != "true" ]]; then + if command -v nvidia-smi > /dev/null 2>&1; then + nvidia-smi > /dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "nvidia-smi was found but returned error code, exiting now..." 
>&2 + echo "If you do not have a GPU on this device but wish to force the installation," + echo "please set the environment variable INSTALL_WO_GPU=true" + exit 1 + fi + echo "nvidia-smi found, continue setup." + else + echo "nvidia-smi not found, exiting now..." >&2 + echo "If you do not have a GPU on this device but wish to force the installation," + echo "please set the environment variable INSTALL_WO_GPU=true" + exit 1 + fi +else + echo "You requested to install CUDA without GPUs present." + echo "This means that all GPU-dependent tests/checks will be skipped!" +fi + +EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash + +############################################################################################## +# Check that the CUDA driver version is adequate +# ( +# needs to be r450 or r470 which are LTS, other production branches are acceptable but not +# recommended, below r450 is not compatible [with an exception we will not explore,see +# https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers] +# ) +# only check first number in case of multiple GPUs +if [[ "${install_wo_gpu}" != "true" ]]; then + driver_major_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1) + driver_major_version="${driver_major_version%%.*}" + # Now check driver_version for compatibility + # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers + if (( $driver_major_version < 450 )); then + echo "Your NVIDIA driver version is too old, please update first.." 
+ exit 1 + fi +fi + +############################################################################################### +# Install CUDA +############################################################################################### + +# Now we have the EESSI context enabled let's grab the version(s) of CUDA we need to install +# (we assume here that CUDA versions are always simple version strings with semantic versions) +cuda_versions=($(ls "$EESSI_SOFTWARE_PATH"/software/CUDA/)) +latest_cuda_version="${cuda_versions[0]}" # EESSI starts with CUDA 11, no need for <10 logic +if [ "${install_cuda}" != false ]; then + for cuda_version in "${cuda_versions[@]}" + do + bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda.sh ${install_cuda_version} + done +fi +############################################################################################### +# Prepare installation of CUDA compat libraries, i.e. install p7zip if it is missing +############################################################################################### +# Try installing different versions of CUDA compat libraries until the test works. 
+# Otherwise, give up +bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh "${latest_cuda_version}" diff --git a/gpu_support/cuda_utils/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh new file mode 100755 index 0000000000..9639917a27 --- /dev/null +++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Get arch type from EESSI environment +if [[ -z "${EESSI_CPU_FAMILY}" ]]; then + # set up basic environment variables, EasyBuild and Lmod + EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash +fi +eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" + +# build URL for CUDA libraries +# take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/"${eessi_cpu_family}"/" +# get all versions in decending order +files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort ) +if [[ -z "${files// }" ]]; then + echo "Could not find any compat lib files under" ${cuda_url} + exit 1 +fi +for file in $files; do echo "${cuda_url}$file"; done diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh new file mode 100644 index 0000000000..11a7aa7e3d --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +libs_url=$1 +cuda_install_dir=$2 + +current_dir=$(dirname $(realpath $0)) +host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" +host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} + +# Create a general space for our NVIDIA compat drivers +if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then + mkdir -p ${host_injections_dir} +else + echo "Cannot write to eessi host_injections space, exiting now..." 
>&2 + exit 1 +fi +cd ${host_injections_dir} + +# Check if our target CUDA is satisfied by what is installed already +# TODO: Find required CUDA version and see if we need an update +driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) +if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then + if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi +fi + +# If not, grab the latest compat library RPM or deb +# Download and unpack in temporary directory, easier cleanup after installation +tmpdir=$(mktemp -d) +cd $tmpdir +compat_file=${libs_url##*/} +wget ${libs_url} +echo $compat_file + +# Unpack it +# rpm files are the default for all OSes +# Keep support for deb files in case it is needed in the future +file_extension=${compat_file##*.} +if [[ ${file_extension} == "rpm" ]]; then + # p7zip is installed under host_injections for now, make that known to the environment + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + fi + # Load p7zip to extract files from rpm file + module load p7zip + # Extract .cpio + 7z x ${compat_file} + # Extract lib* + 7z x ${compat_file/rpm/cpio} + # Restore symlinks + cd usr/local/cuda-*/compat + ls *.so *.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}" + cd - +elif [[ ${file_extension} == "deb" ]]; then + ar x ${compat_file} + tar xf data.tar.* +else + echo "File extension of cuda compat lib not supported, exiting now..." 
>&2 + exit 1 +fi +cd $host_injections_dir +cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*) +# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir +rm -rf ${cuda_dir} +mv -n ${tmpdir}/usr/local/cuda-* . +rm -r ${tmpdir} + +# Add a symlink that points the latest version to the version we just installed +ln -sfn ${cuda_dir} latest + +if [ ! -e latest ] ; then + echo "Symlink to latest cuda compat lib version is broken, exiting now..." + exit 1 +fi + +# Create the space to host the libraries +mkdir -p ${host_injection_linker_dir} +# Symlink in the path to the latest libraries +if [ ! -d "${host_injection_linker_dir}/lib" ]; then + ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib +elif [ ! "${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then + echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." + echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" + exit 1 +fi + +# return to initial dir +cd $current_dir + +echo +echo CUDA driver compatability drivers installed for CUDA version: +echo ${cuda_dir/cuda-/} diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh new file mode 100644 index 0000000000..9e066e0c65 --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +install_cuda_version=$1 + +# Check if the CUDA compat libraries are installed and compatible with the target CUDA version +# if not find the latest version of the compatibility libraries and install them + +# get URL to latest CUDA compat libs, exit if URL is invalid +cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" +ret=$? 
+if [ $ret -ne 0 ]; then + echo "Couldn't find current URLs of the CUDA compat libraries, instead got:" + echo $cuda_compat_urls + exit 1 +fi + +# loop over the compat library versions until we get one that works for us +keep_driver_check=1 +# Do a maximum of five attempts +for value in {1..5} +do + latest_cuda_compat_url=$(echo "$cuda_compat_urls" | cut -d " " -f1) + # Chomp that value out of the list + cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) + latest_driver_version="${latest_cuda_compat_url%-*}" + latest_driver_version="${latest_driver_version##*-}" + # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed + if [[ ! $latest_driver_version =~ ^[0-9]+$ ]]; then + latest_driver_version="${latest_driver_version##*_}" + fi + + install_compat_libs=false + host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" + # libcuda.so points to actual cuda compat lib with driver version in its name + # if this file exists, cuda compat libs are installed and we can compare the version + if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then + eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so) + eessi_driver_version="${eessi_driver_version##*so.}" + else + eessi_driver_version=0 + fi + + if [ $keep_driver_check -eq 1 ] + then + # only keep the driver check for the latest version + keep_driver_check=0 + else + eessi_driver_version=0 + fi + + if (( ${latest_driver_version//./} > ${eessi_driver_version//./} )); then + install_compat_libs=true + else + echo "CUDA compat libs are up-to-date, skip installation." + fi + + if [ "${install_compat_libs}" == true ]; then + bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${cuda_install_dir} + fi + + if [[ "${install_wo_gpu}" != "true" ]]; then + bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" + if [ $? 
-eq 0 ] + then + cuda_version_file="${host_injections_dir}/latest/version.txt" + echo "${install_cuda_version}" > ${cuda_version_file} + exit 0 + else + echo + echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" + echo "I'll try an older release to see if that will work..." + echo + fi + else + echo "Requested to install CUDA without GPUs present, so we skip final tests." + echo "Instead we test if module load CUDA works as expected..." + if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ + else + echo "Cannot load CUDA, modules path does not exist, exiting now..." + exit 1 + fi + module load CUDA + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." + exit 1 + else + echo "Successfully loaded CUDA, you are good to go! :)" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + echo " - Please keep in mind that we just installed the latest CUDA compat libs." + echo " Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)." + exit 0 + fi + break + fi +done + +echo "Tried to install 5 different generations of compat libraries and none worked," +echo "this usually means your driver is very out of date!" +exit 1 diff --git a/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh new file mode 100755 index 0000000000..9efd2b8e66 --- /dev/null +++ b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +install_p7zip_version=$1 +cuda_install_dir=$2 + +# Install p7zip, this will be used to install the CUDA compat libraries from rpm. 
+# The rpm and deb files contain the same libraries, so we just stick to the rpm version. +# If p7zip is missing from the software layer (for whatever reason), we need to install it. +# This has to happen in host_injections, so we check first if it is already installed there. +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ +fi +module avail 2>&1 | grep -i p7zip &> /dev/null +if [[ $? -eq 0 ]]; then + echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" +else + # install p7zip in host_injections + export EASYBUILD_IGNORE_OSDEPS=1 + export EASYBUILD_SYSROOT=${EPREFIX} + export EASYBUILD_RPATH=1 + export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH + export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib + export EASYBUILD_MODULE_EXTENSIONS=1 + module load EasyBuild + eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "p7zip installation failed, please check EasyBuild logs..." + exit 1 + fi +fi diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh new file mode 100644 index 0000000000..6939c77815 --- /dev/null +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +install_cuda_version=$1 +save_compiled_test="${SAVE_COMPILED_TEST:=false}" + +# Test CUDA +cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" +current_dir=$PWD +if [ -d ${cuda_install_dir}/modules/all ]; then + module use ${cuda_install_dir}/modules/all/ +else + echo "Cannot test CUDA, modules path does not exist, exiting now..." + exit 1 +fi +module load CUDA/${install_cuda_version} +ret=$? +if [ $ret -ne 0 ]; then + echo "Could not load CUDA even though modules path exists..." 
+ exit 1 +fi +# if we don't want to save the compiled sample, it means we have a shipped version available +if [ "${save_compiled_test}" != false ]; then + tmp_dir=$(mktemp -d) + # convert cuda version to an integer so we can test if the samples are shipped with this version + # starting from version 11.6 the samples can be found in a github repo + cuda_version=$(echo ${install_cuda_version} | cut -f1,2 -d'.') + cuda_version=${cuda_version//./} + if (( ${cuda_version} < 116 )); then + cp -r $EBROOTCUDA/samples $tmp_dir + cd $tmp_dir/samples/1_Utilities/deviceQuery + else + git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} -q + cd $tmp_dir/Samples/1_Utilities/deviceQuery + fi + module load GCCcore + ret=$? + if [ $ret -ne 0 ]; then + echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..." + exit 1 + fi + make HOST_COMPILER=$(which g++) -j +else + cd ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} +fi +./deviceQuery + +if [ $? -eq 0 ] +then + # Set the color variable + green='\033[0;32m' + # Clear the color after that + clear='\033[0m' + echo -e ${green} + echo "Congratulations, your GPU is working with EESSI!" + echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" + echo " - To use these modules:" + echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" + echo -e ${clear} + + if [ "${save_compiled_test}" != false ]; then + mv deviceQuery ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} + fi + + # Clean up + cd $current_dir + if [ "${save_compiled_test}" != false ]; then + rm -rf $tmp_dir + fi +else + echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 + # Clean up + cd $current_dir + if [ "${save_compiled_test}" != false ]; then + rm -rf $tmp_dir + fi + exit 1 +fi + +# Test a CUDA-enabled module from EESSI +# TODO: GROMACS? +# TODO: Include a GDR copy test? 
+############################################################################################### From 103f5fa34ea9c4cc740088d25a6d4ce6b6ca78bb Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:38:12 +0100 Subject: [PATCH 20/49] Simply wrap `mkdir -p` for better error reporting --- EESSI-pilot-install-software.sh | 4 +- install_cuda_host_injections.sh | 87 --------------------------------- scripts/utils.sh | 82 ++++++------------------------- 3 files changed, 19 insertions(+), 154 deletions(-) delete mode 100755 install_cuda_host_injections.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index f97894df74..d648162c31 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -431,6 +431,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support +cuda_version="11.3.1" + # Need recent version of EasyBuild echo ">> Installing EasyBuild 4.7.0..." ok_msg="EasyBuild v4.7.0 installed" @@ -460,7 +462,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/install_cuda_host_injections.sh 11.3.1 +$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? 
"${ok_msg}" "${fail_msg}" diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh deleted file mode 100755 index b6f83b4e9d..0000000000 --- a/install_cuda_host_injections.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -# Initialise our bash functions -source scripts/utils.sh - -if [[ $# -eq 0 ]] ; then - fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" -fi -install_cuda_version=$1 -if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" -else - # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` - # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) - cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} -fi - -# Only install CUDA if specified version is not found. -# This is only relevant for users, the shipped CUDA installation will -# always be in versions instead of host_injections and have symlinks pointing -# to host_injections for everything we're not allowed to ship -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again, proceed with testing." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! 
mkdir "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." - fatal_error "${error1}${error2}" - fi - - if [[ -z "${EBROOTEASYBUILD}" ]]; then - echo_yellow "Loading EasyBuild module to do actual install" - module load EasyBuild - fi - - # we need the --rebuild option and a (random) dir for the module if the module - # file is shipped with EESSI - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - extra_args="--rebuild --installpath-modules=${tmpdir}" - fi - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb - ret=$? 
- if [ $ret -ne 0 ]; then - fatal_error "CUDA installation failed, please check EasyBuild logs..." - else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi diff --git a/scripts/utils.sh b/scripts/utils.sh index 035a914445..c1342c21fc 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1 function fatal_error() { echo_red "ERROR: $1" >&2 if [[ $# -gt 1 ]]; then - exit $2 + exit "$2" else exit "${ANY_ERROR_EXITCODE}" fi @@ -32,79 +32,29 @@ function check_exit_code { fi } -# Reimplement 'mkdir -p' with reporting on where permissions break down function create_directory_structure() { # Ensure we are given a single path argument if [ $# -ne 1 ]; then - echo "Function requires a single (relative or absolute) path argument" >&2 - return 1 + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE fi - - # set a persistent variable that knows the full structure - # (i.e., retains the value upon recursive calls) - full_structure="${full_structure:="$1"}" - - local directory_structure="$1" - - # Check if directory exists and is writeable - if [ -d "${directory_structure}" ]; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - if [ -w "${directory_structure}" ]; then - # Nothing to be done - return 0 - else - echo "Directory ${directory_structure} exists but is not writeable" >&2 - return 1 - fi - fi - - local directory_structure_parent=$(dirname "${directory_structure}") - - # If the parent doesn't exist we need to create it - if [ ! -d "${directory_structure_parent}" ]; then - # Create the parent via a recursive call to this function - # (if this doesn't succeed we need to return the error code) - if ! 
create_directory_structure "${directory_structure_parent}"; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - return 1 - fi - fi - - # Check the parent is writeable, and create the new subdir - if [ -w "${directory_structure_parent}" ]; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - if ! mkdir "${directory_structure}"; then - echo "'mkdir ${directory_structure}' failed for an unknown reason!" >&2 - return 1 - else - # Success! - return 0 - fi - else - echo "Attempt to create ${full_structure} failed," \ - "${directory_structure_parent} exists but you don't have write permissions." >&2 - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our global variable - unset full_structure - fi - return 1 + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p 2>&1) + return_code=$? + # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + echo_red "Creating ${dir_structure} failed with\n${error_message}" >&2 fi + return $return_code } function get_path_for_tool { tool_name=$1 tool_envvar_name=$2 - which_out=$(which ${tool_name} 2>&1) + which_out=$(which "${tool_name}" 2>&1) exit_code=$? 
if [[ ${exit_code} -eq 0 ]]; then echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 @@ -136,7 +86,7 @@ function get_host_from_url { url=$1 re="(http|https)://([^/:]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -148,7 +98,7 @@ function get_port_from_url { url=$1 re="(http|https)://[^:]+:([0-9]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -158,7 +108,7 @@ function get_port_from_url { function get_ipv4_address { hname=$1 - hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) # TODO try other methods if the one above does not work --> tool that verifies # what method can be used? echo "${hipv4}" From 793ba29d5bb1359a6ef5d631371cdf445b1147a5 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:42:57 +0100 Subject: [PATCH 21/49] Simply wrap `mkdir -p` for better error reporting --- .../install_cuda_host_injections.sh | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100755 gpu_support/cuda_utils/install_cuda_host_injections.sh diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh new file mode 100755 index 0000000000..0684c7ca80 --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Initialise our bash functions +source scripts/utils.sh + +if [[ $# -eq 0 ]] ; then + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" +fi +install_cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" +else + # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need 
to worry too much about the EasyBuild setup) + cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} +fi + +# Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again, proceed with testing." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." 
+ fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." + fatal_error "${error1}${error2}" + fi + + if [[ -z "${EBROOTEASYBUILD}" ]]; then + echo_yellow "Loading EasyBuild module to do actual install" + module load EasyBuild + fi + + # we need the --rebuild option and a (random) dir for the module if the module + # file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + ret=$? + if [ $ret -ne 0 ]; then + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi From c0a12470de4ba2c4523f084edd8b516d6fb62235 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:46:57 +0100 Subject: [PATCH 22/49] Make CUDA version a variable --- EESSI-pilot-install-software.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index d648162c31..f535634631 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -452,7 +452,7 @@ $EB $p7zip_ec --robot check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA (uses eb_hooks.py to only install runtime) -cuda_ec="CUDA-11.3.1.eb" +cuda_ec="CUDA-${cuda_version}.eb" echo ">> Installing $cuda_ec..." 
ok_msg="$cuda_ec installed, off to a good (?) start!" fail_msg="Failed to install $cuda_ec, woopsie..." @@ -462,9 +462,9 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh 11.3.1 -ok_msg="$cuda_ec (re)installed under host_injections!" -fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." +$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} +ok_msg="CUDA $cuda_version (re)installed under host_injections!" +fail_msg="Failed to install CUDA $cuda_version under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) From 5e82923f01ff3089074e0760a1f64cfa5d3a5b2d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:51:37 +0100 Subject: [PATCH 23/49] Use TOPDIR, be more descriptive --- EESSI-pilot-install-software.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index f535634631..035f851d61 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -461,8 +461,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) -echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} +echo ">> Re-installing CUDA $cuda_version under host_injections (to un-break symlinks in EESSI installation)..." +"${TOPDIR}"/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} ok_msg="CUDA $cuda_version (re)installed under host_injections!" 
fail_msg="Failed to install CUDA $cuda_version under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" From 8384b25a9658944fa57de03da7dcac84ae5b0a8e Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 10:06:45 +0100 Subject: [PATCH 24/49] Add missing argument --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index c1342c21fc..ef9e4095c2 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -41,7 +41,7 @@ function create_directory_structure() { dir_structure="$1" # Attempt to create the directory structure - error_message=$(mkdir -p 2>&1) + error_message=$(mkdir -p "$dir_structure" 2>&1) return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then From 44de61c2c68b6f96cdb9c1f137dc73a46fc00a32 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:25:38 +0100 Subject: [PATCH 25/49] Reuse utils.sh --- gpu_support/add_nvidia_gpu_support.sh | 34 ++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 8706b492eb..5cc4f0eb30 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,6 +3,10 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh +TOPDIR=$(dirname $(realpath $0)) + +source $TOPDIR/scripts/utils.sh + install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" @@ -17,21 +21,21 @@ if [[ "${install_wo_gpu}" != "true" ]]; then if command -v nvidia-smi > /dev/null 2>&1; then nvidia-smi > /dev/null 2>&1 if [ $? -ne 0 ]; then - echo "nvidia-smi was found but returned error code, exiting now..." 
>&2 - echo "If you do not have a GPU on this device but wish to force the installation," - echo "please set the environment variable INSTALL_WO_GPU=true" - exit 1 + error="nvidia-smi was found but returned error code, exiting now...\n" + error="${error}If you do not have a GPU on this device but wish to force the installation,\n" + error="${error}please set the environment variable INSTALL_WO_GPU=true" + fatal_error "$error" fi echo "nvidia-smi found, continue setup." else - echo "nvidia-smi not found, exiting now..." >&2 - echo "If you do not have a GPU on this device but wish to force the installation," - echo "please set the environment variable INSTALL_WO_GPU=true" - exit 1 + error="nvidia-smi not found, exiting now...\n" + error="${error}If you do not have a GPU on this device but wish to force the installation,\n" + error="${error}please set the environment variable INSTALL_WO_GPU=true\n" + fatal_error "$error" fi else - echo "You requested to install CUDA without GPUs present." - echo "This means that all GPU-dependent tests/checks will be skipped!" + echo_green "You requested to install CUDA without GPUs present." + echo_green "This means that all GPU-dependent tests/checks will be skipped!" fi EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash @@ -49,9 +53,8 @@ if [[ "${install_wo_gpu}" != "true" ]]; then driver_major_version="${driver_major_version%%.*}" # Now check driver_version for compatibility # Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers - if (( $driver_major_version < 450 )); then - echo "Your NVIDIA driver version is too old, please update first.." - exit 1 + if (( driver_major_version < 450 )); then + fatal_error "Your NVIDIA driver version ($driver_major_version) is too old, please update first.." 
fi fi @@ -66,12 +69,11 @@ latest_cuda_version="${cuda_versions[0]}" # EESSI starts with CUDA 11, no need if [ "${install_cuda}" != false ]; then for cuda_version in "${cuda_versions[@]}" do - bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda.sh ${install_cuda_version} + "$TOPDIR"/cuda_utils/install_cuda_host_injections.sh "${latest_cuda_version}" done fi ############################################################################################### # Prepare installation of CUDA compat libraries, i.e. install p7zip if it is missing ############################################################################################### # Try installing different versions of CUDA compat libraries until the test works. -# Otherwise, give up -bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh "${latest_cuda_version}" +"$TOPDIR"/cuda_utils/install_cuda_compatlibs_loop.sh "${latest_cuda_version}" From 98fe2a747fef928e123288fba157433322473dac Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:49:48 +0100 Subject: [PATCH 26/49] Improve error messages in new bash function --- scripts/utils.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index ef9e4095c2..b9aad997e1 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,8 +45,18 @@ function create_directory_structure() { return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - echo_red "Creating ${dir_structure} failed with\n${error_message}" >&2 + real_dir=$(readlink -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! 
-w "${dir_structure}" ]; then + real_dir=$(readlink -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" + return $ANY_ERROR_EXITCODE + fi fi + return $return_code } From bbe7df210da6594ba94a27cf6ce072c7042cb24b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:53:09 +0100 Subject: [PATCH 27/49] Stick with return_code --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b9aad997e1..b052e0a1ec 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -53,7 +53,7 @@ function create_directory_structure() { if [ ! -w "${dir_structure}" ]; then real_dir=$(readlink -m "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" - return $ANY_ERROR_EXITCODE + return_code=$ANY_ERROR_EXITCODE fi fi From 95dc24509434d38e353f21a54413d771ebfdc401 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:56:02 +0100 Subject: [PATCH 28/49] Use realpath to be consistent with other scripts --- scripts/utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b052e0a1ec..099c1712ef 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,13 +45,13 @@ function create_directory_structure() { return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - real_dir=$(readlink -m "$dir_structure") + real_dir=$(realpath -s "$dir_structure") echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 else # If we're creating it, our use case is that we want to be able to write there # (this is a check in case the directory already existed) if [ ! 
-w "${dir_structure}" ]; then - real_dir=$(readlink -m "$dir_structure") + real_dir=$(realpath -s "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" return_code=$ANY_ERROR_EXITCODE fi From a1270f2359d64f9ee19f7ac10645e90d71ee2437 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:58:22 +0100 Subject: [PATCH 29/49] Wrong realpath flag --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index 099c1712ef..b5aa430985 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -51,7 +51,7 @@ function create_directory_structure() { # If we're creating it, our use case is that we want to be able to write there # (this is a check in case the directory already existed) if [ ! -w "${dir_structure}" ]; then - real_dir=$(realpath -s "$dir_structure") + real_dir=$(realpath -m "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" return_code=$ANY_ERROR_EXITCODE fi From aba486de42f250b7ada1d7459188ba579337540b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:59:06 +0100 Subject: [PATCH 30/49] Wrong realpath flag --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b5aa430985..f043ba0ca6 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,7 +45,7 @@ function create_directory_structure() { return_code=$? 
# If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - real_dir=$(realpath -s "$dir_structure") + real_dir=$(realpath -m "$dir_structure") echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 else # If we're creating it, our use case is that we want to be able to write there From d2d1fc3b4a6b4e5372f1218b3da86f621f09f674 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:06:58 +0100 Subject: [PATCH 31/49] Fix typo --- .../cuda_utils/install_cuda_host_injections.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 0684c7ca80..4264f06b83 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -42,11 +42,11 @@ else # Let's see if we have sources and build locations defined if not, we use the temporary space if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) fi if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) fi # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), @@ -57,9 +57,10 @@ else fi avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then - error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." 
- fatal_error "${error1}${error2}" + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" fi if [[ -z "${EBROOTEASYBUILD}" ]]; then @@ -69,11 +70,12 @@ else # we need the --rebuild option and a (random) dir for the module if the module # file is shipped with EESSI - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + if [ -f "${EESSI_SOFTWARE_PATH}"/modules/all/CUDA/"${install_cuda_version}".lua ]; then extra_args="--rebuild --installpath-modules=${tmpdir}" fi # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then From 562e94b7beba8e9e4fc71c84a6ad5b154ad9283f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:11:49 +0100 Subject: [PATCH 32/49] Always add the rebuild option if we get to the point where we actually install CUDA --- gpu_support/cuda_utils/install_cuda_host_injections.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 4264f06b83..0d68759c1e 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -68,11 +68,10 @@ else module load EasyBuild fi - # we need the --rebuild option and a (random) dir for the module if the module - # file is shipped with EESSI - if [ -f "${EESSI_SOFTWARE_PATH}"/modules/all/CUDA/"${install_cuda_version}".lua ]; then - extra_args="--rebuild --installpath-modules=${tmpdir}" - fi + # we need the --rebuild option and a 
(random) dir for the module since we are + # fixing the broken links of the EESSI-shipped installation + extra_args="--rebuild --installpath-modules=${tmpdir}" + # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py # shellcheck disable=SC2086 # Intended splitting of extra_args From b4ae5f027f77fd38cf215b0fdf04b8136760d414 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:42:31 +0100 Subject: [PATCH 33/49] Expose CUDA_TEMP_DIR --- gpu_support/cuda_utils/install_cuda_host_injections.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 0d68759c1e..1ddccf4e82 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -58,7 +58,8 @@ else avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error="${error}You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " error="${error}to reduce this requirement. Exiting now..." fatal_error "${error}" fi @@ -75,7 +76,7 @@ else # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py # shellcheck disable=SC2086 # Intended splitting of extra_args - eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? 
if [ $ret -ne 0 ]; then fatal_error "CUDA installation failed, please check EasyBuild logs..." From e7728d72408ca9766db5de0f243c3f07f385c87b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 13:41:09 +0100 Subject: [PATCH 34/49] Update test script --- gpu_support/add_nvidia_gpu_support.sh | 4 +- gpu_support/cuda_utils/test_cuda.sh | 87 +++++++-------------------- 2 files changed, 23 insertions(+), 68 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 5cc4f0eb30..98e0e4c365 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -34,8 +34,8 @@ if [[ "${install_wo_gpu}" != "true" ]]; then fatal_error "$error" fi else - echo_green "You requested to install CUDA without GPUs present." - echo_green "This means that all GPU-dependent tests/checks will be skipped!" + echo_yellow "You requested to install CUDA without GPUs present." + echo_yellow "This means that all GPU-dependent tests/checks will be skipped!" 
fi EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh index 6939c77815..7d78045342 100644 --- a/gpu_support/cuda_utils/test_cuda.sh +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -1,82 +1,37 @@ #!/bin/bash -install_cuda_version=$1 -save_compiled_test="${SAVE_COMPILED_TEST:=false}" +# Initialise our bash functions +source scripts/utils.sh -# Test CUDA -cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}" -current_dir=$PWD -if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ +if [[ $# -eq 0 ]] ; then + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" +fi +cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" else - echo "Cannot test CUDA, modules path does not exist, exiting now..." - exit 1 + # We can figure out the (EasyBuild MNS) CUDA samples version we need since we know the version suffix + cuda_samples_version=$(basename "$(ls -d "${EESSI_SOFTWARE_PATH}"/software/CUDA-Samples/*-CUDA-"${cuda_version}")") fi -module load CUDA/${install_cuda_version} + +# Test CUDA (making sure to use EasyBuild MNS) +unset MODULEPATH +module use "${EESSI_SOFTWARE_PATH}"/modules/all +module load CUDA-Samples/"${cuda_samples_version}" ret=$? if [ $ret -ne 0 ]; then - echo "Could not load CUDA even though modules path exists..." 
- exit 1 -fi -# if we don't want to save the compiled sample, it means we have a shipped version available -if [ "${save_compiled_test}" != false ]; then - tmp_dir=$(mktemp -d) - # convert cuda version to an integer so we can test if the samples are shipped with this version - # starting from version 11.6 the samples can be found in a github repo - cuda_version=$(echo ${install_cuda_version} | cut -f1,2 -d'.') - cuda_version=${cuda_version//./} - if (( ${cuda_version} < 116 )); then - cp -r $EBROOTCUDA/samples $tmp_dir - cd $tmp_dir/samples/1_Utilities/deviceQuery - else - git clone https://github.com/NVIDIA/cuda-samples.git ${tmp_dir} -q - cd $tmp_dir/Samples/1_Utilities/deviceQuery - fi - module load GCCcore - ret=$? - if [ $ret -ne 0 ]; then - echo "Could not load GCC, but it should have been shipped with EESSI?! Exiting..." - exit 1 - fi - make HOST_COMPILER=$(which g++) -j -else - cd ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} + fatal_error "Could not load CUDA samples module CUDA-Samples/${cuda_samples_version}\n (MODULEPATH=$MODULEPATH)..." fi -./deviceQuery -if [ $? -eq 0 ] +if deviceQuery; then - # Set the color variable - green='\033[0;32m' - # Clear the color after that - clear='\033[0m' - echo -e ${green} - echo "Congratulations, your GPU is working with EESSI!" - echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" - echo " - To use these modules:" - echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" - echo -e ${clear} - - if [ "${save_compiled_test}" != false ]; then - mv deviceQuery ${EESSI_SOFTWARE_PATH}/software/CUDA/${install_cuda_version} - fi - - # Clean up - cd $current_dir - if [ "${save_compiled_test}" != false ]; then - rm -rf $tmp_dir - fi + echo_green "Congratulations, your GPU is working with EESSI!" 
else - echo "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 - # Clean up - cd $current_dir - if [ "${save_compiled_test}" != false ]; then - rm -rf $tmp_dir - fi - exit 1 + echo_yellow "Uff, your GPU doesn't seem to be working with EESSI :(" >&2 + exit "${ANY_ERROR_EXITCODE}" fi -# Test a CUDA-enabled module from EESSI +# Test another CUDA-enabled module from EESSI # TODO: GROMACS? # TODO: Include a GDR copy test? ############################################################################################### From c12224126a5b43187786d697a0ad85e97dd1639f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 16:23:58 +0100 Subject: [PATCH 35/49] Polish a lot of the compat scripts --- EESSI-pilot-install-software.sh | 6 +- .../cuda_utils/install_cuda_compatlibs.sh | 74 +++++++++++-------- .../install_cuda_compatlibs_loop.sh | 58 ++++++--------- .../cuda_utils/prepare_cuda_compatlibs.sh | 31 -------- gpu_support/cuda_utils/test_cuda.sh | 14 ++-- scripts/utils.sh | 44 +++++++++++ 6 files changed, 118 insertions(+), 109 deletions(-) delete mode 100755 gpu_support/cuda_utils/prepare_cuda_compatlibs.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 035f851d61..530622c7f4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -72,11 +72,7 @@ else fi # make sure we're in Prefix environment by checking $SHELL -if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then - echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" -else - fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" 
-fi +check_in_prefix_shell # avoid that pyc files for EasyBuild are stored in EasyBuild installation directory export PYTHONPYCACHEPREFIX=$TMPDIR/pycache diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index 11a7aa7e3d..721ee6f82e 100644 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -1,30 +1,42 @@ #!/bin/bash +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $0)) +source "$TOPDIR"/../../scripts/utils.sh + +# Expect to be in a prefix shell so we know all our required commands exist +check_in_prefix_shell + +# Make sure the EESSI environment has been initialised +check_eessi_initialised + libs_url=$1 -cuda_install_dir=$2 +required_cuda_version=$2 -current_dir=$(dirname $(realpath $0)) +current_dir=$PWD host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia" host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} -# Create a general space for our NVIDIA compat drivers -if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then - mkdir -p ${host_injections_dir} -else - echo "Cannot write to eessi host_injections space, exiting now..." 
>&2 - exit 1 -fi -cd ${host_injections_dir} - # Check if our target CUDA is satisfied by what is installed already -# TODO: Find required CUDA version and see if we need an update -driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//) -if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then - if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi +# (driver CUDA is reported as major.minor, i.e., like a float) +driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') +eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') +cuda_major_minor=${required_cuda_version%.*} + +if [[ ${driver_cuda_version%.*} =~ ^[0-9]+$ ]]; then + if float_greater_than $driver_cuda_version $eessi_cuda_version ; then + echo_yellow "You need to update your CUDA compatibility libraries!" + elif [[ ${eessi_cuda_version%.*} =~ ^[0-9]+$ ]]; then + if float_greater_than $eessi_cuda_version $cuda_major_minor ; then + echo_green "Existing CUDA compatibility libraries in EESSI should be ok!" 
+ exit 0 + fi + else + echo_yellow "Installing CUDA compatibility libraries" + fi fi -# If not, grab the latest compat library RPM or deb +# Grab the latest compat library RPM or deb # Download and unpack in temporary directory, easier cleanup after installation tmpdir=$(mktemp -d) cd $tmpdir @@ -37,10 +49,6 @@ echo $compat_file # Keep support for deb files in case it is needed in the future file_extension=${compat_file##*.} if [[ ${file_extension} == "rpm" ]]; then - # p7zip is installed under host_injections for now, make that known to the environment - if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ - fi # Load p7zip to extract files from rpm file module load p7zip # Extract .cpio @@ -58,9 +66,16 @@ else echo "File extension of cuda compat lib not supported, exiting now..." >&2 exit 1 fi + +# Create a general space for our NVIDIA compat drivers +if ! create_directory_structure $host_injections_dir ; then + fatal_error "Cannot create/write to $host_injections_dir space, exiting now..." +fi cd $host_injections_dir +# install the compat libs cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*) -# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir +# TODO: This would prevent error messages if folder already exists, but +# could be problematic if only some files are missing in destination dir rm -rf ${cuda_dir} mv -n ${tmpdir}/usr/local/cuda-* . rm -r ${tmpdir} @@ -69,19 +84,20 @@ rm -r ${tmpdir} ln -sfn ${cuda_dir} latest if [ ! -e latest ] ; then - echo "Symlink to latest cuda compat lib version is broken, exiting now..." - exit 1 + fatal_error "Symlink to latest cuda compat lib version is broken, exiting now..." fi -# Create the space to host the libraries -mkdir -p ${host_injection_linker_dir} # Symlink in the path to the latest libraries if [ ! 
-d "${host_injection_linker_dir}/lib" ]; then + # Create the space to host the libraries for the linker + if ! create_directory_structure ${host_injection_linker_dir} ; then + fatal_error "Cannot create/write to ${host_injection_linker_dir} space, exiting now..." + fi ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib elif [ ! "${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then - echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..." - echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" - exit 1 + error_msg="CUDA compat libs symlink exists but points to the wrong location, please fix this...\n" + error_msg="${error_msg}${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" + fatal_error $error_msg fi # return to initial dir diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index 9e066e0c65..693277a033 100644 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -1,23 +1,27 @@ #!/bin/bash +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $0)) +source "$TOPDIR"/../../scripts/utils.sh + install_cuda_version=$1 +MAXLOOPS=10 + # Check if the CUDA compat libraries are installed and compatible with the target CUDA version # if not find the latest version of the compatibility libraries and install them # get URL to latest CUDA compat libs, exit if URL is invalid -cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)" +cuda_compat_urls="$($TOPDIR/get_cuda_compatlibs.sh)" ret=$? 
if [ $ret -ne 0 ]; then - echo "Couldn't find current URLs of the CUDA compat libraries, instead got:" - echo $cuda_compat_urls - exit 1 + fatal_error "Couldn't find current URLs of the CUDA compat libraries, instead got:\n $cuda_compat_urls" fi # loop over the compat library versions until we get one that works for us keep_driver_check=1 -# Do a maximum of five attempts -for value in {1..5} +# Do a maximum of MAXLOOPS attempts +for value in {1..$MAXLOOPS} do latest_cuda_compat_url=$(echo "$cuda_compat_urls" | cut -d " " -f1) # Chomp that value out of the list @@ -55,49 +59,29 @@ do fi if [ "${install_compat_libs}" == true ]; then - bash $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${cuda_install_dir} + bash $TOPDIR/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${install_cuda_version} fi if [[ "${install_wo_gpu}" != "true" ]]; then - bash $(dirname "$BASH_SOURCE")/test_cuda.sh "${install_cuda_version}" + bash $TOPDIR/test_cuda.sh "${install_cuda_version}" if [ $? -eq 0 ] then cuda_version_file="${host_injections_dir}/latest/version.txt" echo "${install_cuda_version}" > ${cuda_version_file} exit 0 else - echo - echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!" - echo "I'll try an older release to see if that will work..." - echo + echo_yellow "Your driver does not seem to be not recent enough to work with that release of CUDA compat libs," + echo_yellow "consider updating!" + echo_yellow "I'll try an older release to see if that will work..." fi else - echo "Requested to install CUDA without GPUs present, so we skip final tests." - echo "Instead we test if module load CUDA works as expected..." - if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ - else - echo "Cannot load CUDA, modules path does not exist, exiting now..." - exit 1 - fi - module load CUDA - ret=$? 
- if [ $ret -ne 0 ]; then - echo "Could not load CUDA even though modules path exists..." - exit 1 - else - echo "Successfully loaded CUDA, you are good to go! :)" - echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix" - echo " - To use these modules:" - echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/" - echo " - Please keep in mind that we just installed the latest CUDA compat libs." - echo " Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA drivers on your GPU node(s)." - exit 0 - fi - break + echo_yellow "Requested to install CUDA without GPUs present, with no way to verify we skip final tests." + echo_yellow "Since we have no GPU to test with, we cannot guarantee that it will work with the installed CUDA" + echo_yellow "drivers on your GPU node(s)." + exit 0 fi done -echo "Tried to install 5 different generations of compat libraries and none worked," -echo "this usually means your driver is very out of date!" +echo "Tried to install $MAXLOOPS different generations of compat libraries and none worked," +echo "this usually means your driver is very out of date (or some other issue)!" exit 1 diff --git a/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh b/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh deleted file mode 100755 index 9efd2b8e66..0000000000 --- a/gpu_support/cuda_utils/prepare_cuda_compatlibs.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -install_p7zip_version=$1 -cuda_install_dir=$2 - -# Install p7zip, this will be used to install the CUDA compat libraries from rpm. -# The rpm and deb files contain the same libraries, so we just stick to the rpm version. -# If p7zip is missing from the software layer (for whatever reason), we need to install it. -# This has to happen in host_injections, so we check first if it is already installed there. 
-if [ -d ${cuda_install_dir}/modules/all ]; then - module use ${cuda_install_dir}/modules/all/ -fi -module avail 2>&1 | grep -i p7zip &> /dev/null -if [[ $? -eq 0 ]]; then - echo "p7zip module found! No need to install p7zip again, proceeding with installation of compat libraries" -else - # install p7zip in host_injections - export EASYBUILD_IGNORE_OSDEPS=1 - export EASYBUILD_SYSROOT=${EPREFIX} - export EASYBUILD_RPATH=1 - export EASYBUILD_FILTER_ENV_VARS=LD_LIBRARY_PATH - export EASYBUILD_FILTER_DEPS=Autoconf,Automake,Autotools,binutils,bzip2,cURL,DBus,flex,gettext,gperf,help2man,intltool,libreadline,libtool,Lua,M4,makeinfo,ncurses,util-linux,XZ,zlib - export EASYBUILD_MODULE_EXTENSIONS=1 - module load EasyBuild - eb --robot --installpath=${cuda_install_dir}/ p7zip-${install_p7zip_version}.eb - ret=$? - if [ $ret -ne 0 ]; then - echo "p7zip installation failed, please check EasyBuild logs..." - exit 1 - fi -fi diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh index 7d78045342..506c4ae849 100644 --- a/gpu_support/cuda_utils/test_cuda.sh +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -1,18 +1,18 @@ #!/bin/bash # Initialise our bash functions -source scripts/utils.sh +TOPDIR=$(dirname $(realpath $0)) +source "$TOPDIR"/../../scripts/utils.sh if [[ $# -eq 0 ]] ; then fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" fi cuda_version=$1 -if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" -else - # We can figure out the (EasyBuild MNS) CUDA samples version we need since we know the version suffix - cuda_samples_version=$(basename "$(ls -d "${EESSI_SOFTWARE_PATH}"/software/CUDA-Samples/*-CUDA-"${cuda_version}")") -fi + +check_eessi_initialised + +# We can figure out the (EasyBuild MNS) CUDA samples version we need since we know the version suffix +cuda_samples_version=$(basename "$(ls -d 
"${EESSI_SOFTWARE_PATH}"/software/CUDA-Samples/*-CUDA-"${cuda_version}")") # Test CUDA (making sure to use EasyBuild MNS) unset MODULEPATH diff --git a/scripts/utils.sh b/scripts/utils.sh index f043ba0ca6..60342ba3f4 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -32,6 +32,50 @@ function check_exit_code { fi } +function check_eessi_initialised() { + if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "EESSI has not been initialised!" + else + return 0 + fi +} + +function float_greater_than() { + # Make sure we have two arguments + if [ $# -ne 2 ]; then + echo_red "greater_than_float requires two (float) numbers" >&2 + return $ANY_ERROR_EXITCODE + fi + # Make sure the arguments are numbers + if [[ ! $1 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then + echo_red "Input is not a float." + return $ANY_ERROR_EXITCODE + fi + if [[ ! $2 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then + echo_red "Input is not a float." + return $ANY_ERROR_EXITCODE + fi + # Now do the actual evaluation + return_code=$ANY_ERROR_EXITCODE + result=$(echo $1 $2 | awk '{if ($1 > $2) print "true"}') + if [ "$result" = true ] ; then + return_code=0 + fi + return $return_code +} + +function check_in_prefix_shell() { + # Make sure EPREFIX is defined + if [[ -z "${EPREFIX}" ]]; then + fatal_error "This script cannot be used without having first defined EPREFIX" + fi + if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then + echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" + else + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" 
+ fi +} + function create_directory_structure() { # Ensure we are given a single path argument if [ $# -ne 1 ]; then From e91423f6a9c19081ece5df8980e54e41e20b9f71 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 16:26:47 +0100 Subject: [PATCH 36/49] Make scripts executable --- gpu_support/cuda_utils/install_cuda_compatlibs.sh | 0 gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh | 0 gpu_support/cuda_utils/test_cuda.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 gpu_support/cuda_utils/install_cuda_compatlibs.sh mode change 100644 => 100755 gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh mode change 100644 => 100755 gpu_support/cuda_utils/test_cuda.sh diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh old mode 100644 new mode 100755 diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh old mode 100644 new mode 100755 diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh old mode 100644 new mode 100755 From 186802154cae60c5d160545808cead65d1337f4a Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 16:30:34 +0100 Subject: [PATCH 37/49] Fix shebangs --- gpu_support/cuda_utils/get_cuda_compatlibs.sh | 2 +- gpu_support/cuda_utils/install_cuda_compatlibs.sh | 2 +- gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh | 6 +++--- gpu_support/cuda_utils/install_cuda_host_injections.sh | 2 +- gpu_support/cuda_utils/test_cuda.sh | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/gpu_support/cuda_utils/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh index 9639917a27..c9f29a379a 100755 --- a/gpu_support/cuda_utils/get_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Get arch type from EESSI environment if [[ -z 
"${EESSI_CPU_FAMILY}" ]]; then diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index 721ee6f82e..596f3ac61f 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Initialise our bash functions TOPDIR=$(dirname $(realpath $0)) diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index 693277a033..80d527328c 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Initialise our bash functions TOPDIR=$(dirname $(realpath $0)) @@ -59,11 +59,11 @@ do fi if [ "${install_compat_libs}" == true ]; then - bash $TOPDIR/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${install_cuda_version} + $TOPDIR/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${install_cuda_version} fi if [[ "${install_wo_gpu}" != "true" ]]; then - bash $TOPDIR/test_cuda.sh "${install_cuda_version}" + $TOPDIR/test_cuda.sh "${install_cuda_version}" if [ $? 
-eq 0 ] then cuda_version_file="${host_injections_dir}/latest/version.txt" diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 1ddccf4e82..bafff8cb31 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Initialise our bash functions source scripts/utils.sh diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh index 506c4ae849..d0623bf9a0 100755 --- a/gpu_support/cuda_utils/test_cuda.sh +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Initialise our bash functions TOPDIR=$(dirname $(realpath $0)) From 92492d38fa70221bc40b2cc225d836a93fd22cae Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 16:32:37 +0100 Subject: [PATCH 38/49] Fix path to utils --- gpu_support/add_amd_gpu_support.sh | 2 +- gpu_support/add_nvidia_gpu_support.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpu_support/add_amd_gpu_support.sh b/gpu_support/add_amd_gpu_support.sh index 29c8abdc88..3f5fa13805 100755 --- a/gpu_support/add_amd_gpu_support.sh +++ b/gpu_support/add_amd_gpu_support.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash cat << EOF This is not implemented yet :( diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 98e0e4c365..f906dd5f5a 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -1,11 +1,11 @@ -#!/bin/bash +#!/usr/bin/env bash # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh TOPDIR=$(dirname $(realpath $0)) -source $TOPDIR/scripts/utils.sh +source $TOPDIR/../scripts/utils.sh install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" From 
1e145a4cdc2099fc3aadb4c76ec1f5a5a4947cbd Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 16:37:20 +0100 Subject: [PATCH 39/49] Fix path to init scripts. --- gpu_support/add_nvidia_gpu_support.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index f906dd5f5a..b5e3958afc 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -9,6 +9,9 @@ source $TOPDIR/../scripts/utils.sh install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" +if [ ! "$eessi_version" = "latest" ]; then + eessi_version="versions/$eessi_version" +fi # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks From 0c2b6c03f93ddde5baf2d9905276028f878fec99 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 19:57:30 +0100 Subject: [PATCH 40/49] Don't worry too much about args to float_greater_than --- scripts/utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index 60342ba3f4..2faeb0788f 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -48,11 +48,11 @@ function float_greater_than() { fi # Make sure the arguments are numbers if [[ ! $1 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then - echo_red "Input is not a float." + echo_yellow "Input to float_greater_than is not a float, ignoring" return $ANY_ERROR_EXITCODE fi if [[ ! $2 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then - echo_red "Input is not a float." 
+ echo_yellow "Input to float_greater_than is not a float, ignoring" return $ANY_ERROR_EXITCODE fi # Now do the actual evaluation From 080f6be76f4a42545f45a79cce561cd9c4b17a65 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:13:36 +0100 Subject: [PATCH 41/49] Switch loop to use for --- gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index 80d527328c..35f141eb18 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -21,7 +21,7 @@ fi # loop over the compat library versions until we get one that works for us keep_driver_check=1 # Do a maximum of MAXLOOPS attempts -for value in {1..$MAXLOOPS} +for i in $(seq 1 $MAXLOOPS) do latest_cuda_compat_url=$(echo "$cuda_compat_urls" | cut -d " " -f1) # Chomp that value out of the list From 3f4183800ce6b1cc5314b9066d63ad6d317d1b39 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:19:16 +0100 Subject: [PATCH 42/49] Test before believing that compat libs work --- gpu_support/cuda_utils/install_cuda_compatlibs.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index 596f3ac61f..e9da39c398 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -28,8 +28,13 @@ if [[ ${driver_cuda_version%.*} =~ ^[0-9]+$ ]]; then echo_yellow "You need to update your CUDA compatibility libraries!" elif [[ ${eessi_cuda_version%.*} =~ ^[0-9]+$ ]]; then if float_greater_than $eessi_cuda_version $cuda_major_minor ; then - echo_green "Existing CUDA compatibility libraries in EESSI should be ok!" 
- exit 0 + echo_yellow "Existing CUDA compatibility libraries in EESSI should be ok, testing..." + $TOPDIR/test_cuda.sh "${required_cuda_version}" + if [ $? -eq 0 ]; then + exit 0 + else + echo_yellow "Seems not, continuing to install requested version..." + fi fi else echo_yellow "Installing CUDA compatibility libraries" From b10f1e1f33078354e9e003d9b33f4f413a540c7e Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:23:02 +0100 Subject: [PATCH 43/49] Tweaks --- gpu_support/add_nvidia_gpu_support.sh | 2 ++ gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh | 2 +- scripts/utils.sh | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index b5e3958afc..189fac71a9 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -6,6 +6,8 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/../scripts/utils.sh +# Expectation is we are in a Prefix shell (as we need certain commands), let's check + install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index 35f141eb18..9de02b6ae1 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -6,7 +6,7 @@ source "$TOPDIR"/../../scripts/utils.sh install_cuda_version=$1 -MAXLOOPS=10 +MAXLOOPS=12 # Check if the CUDA compat libraries are installed and compatible with the target CUDA version # if not find the latest version of the compatibility libraries and install them diff --git a/scripts/utils.sh b/scripts/utils.sh index 2faeb0788f..8fe8486a04 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -69,9 +69,7 @@ function check_in_prefix_shell() { if [[ -z "${EPREFIX}" ]]; then fatal_error "This script cannot be used without having first defined EPREFIX" fi 
- if [[ ${SHELL} = ${EPREFIX}/bin/bash ]]; then - echo_green ">> It looks like we're in a Gentoo Prefix environment, good!" - else + if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" fi } From 31c5549ec4b55b75750ea4037d55667f8c7f5165 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:27:48 +0100 Subject: [PATCH 44/49] Add check for prefix shell --- gpu_support/add_nvidia_gpu_support.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 189fac71a9..669691f228 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -7,7 +7,7 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/../scripts/utils.sh # Expectation is we are in a Prefix shell (as we need certain commands), let's check - +check_in_prefix_shell install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" From a736122a7e891e266ecf2a72ced286e8fb1b9b9c Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:31:29 +0100 Subject: [PATCH 45/49] Add check for prefix shell --- gpu_support/add_nvidia_gpu_support.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 669691f228..69fefc2b12 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -6,8 +6,6 @@ TOPDIR=$(dirname $(realpath $0)) source $TOPDIR/../scripts/utils.sh -# Expectation is we are in a Prefix shell (as we need certain commands), let's check -check_in_prefix_shell install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" @@ -15,6 +13,11 @@ if [ ! 
"$eessi_version" = "latest" ]; then eessi_version="versions/$eessi_version" fi +# Initialise EESSI environment +EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash +# Expectation is we are in a Prefix shell (as we need certain commands), let's check +check_in_prefix_shell + # If you want to install CUDA support on login nodes (typically without GPUs), # set this variable to true. This will skip all GPU-dependent checks install_wo_gpu=false @@ -43,8 +46,6 @@ else echo_yellow "This means that all GPU-dependent tests/checks will be skipped!" fi -EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash - ############################################################################################## # Check that the CUDA driver version is adequate # ( From 1ae6d44ad6092e50bf2600313247d17636e43071 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 20:40:45 +0100 Subject: [PATCH 46/49] Tweak docs --- gpu_support/README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/gpu_support/README.md b/gpu_support/README.md index 226a7fca6c..f89e905c96 100644 --- a/gpu_support/README.md +++ b/gpu_support/README.md @@ -13,15 +13,6 @@ export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh ``` This will again install the needed compatibility libraries as well as the whole CUDA suite. -If you need a different CUDA version than what is shipped with EESSI, you can also specify that particular version for the script: -``` -export INSTALL_CUDA_VERSION=xx.y.z && export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh -``` -Please note, however, that versions for which the runtime is not shipped with EESSI are not installed in the default modules path. 
-Thus, you will have to add the following to your modules path to get access to your custom CUDA version: -``` -module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/ -``` ## Prerequisites and tips * You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. * If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). 
From 3445196a3d2168f59d8c9c7de92cd27b1288f07d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Sat, 4 Mar 2023 14:54:55 +0100 Subject: [PATCH 47/49] Make scripts pass shellcheck --- EESSI-pilot-install-software.sh | 61 +++++++++++-------- gpu_support/README.md | 18 ++++-- gpu_support/add_nvidia_gpu_support.sh | 10 +-- gpu_support/cuda_utils/get_cuda_compatlibs.sh | 8 +-- .../cuda_utils/install_cuda_compatlibs.sh | 50 +++++++-------- .../install_cuda_compatlibs_loop.sh | 16 ++--- gpu_support/cuda_utils/test_cuda.sh | 2 +- scripts/utils.sh | 2 +- 8 files changed, 92 insertions(+), 75 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 530622c7f4..aa22d582e8 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -35,7 +35,7 @@ while [[ $# -gt 0 ]]; do export https_proxy="$2" shift 2 ;; - -*|--*) + -*) echo "Error: Unknown option: $1" >&2 exit 1 ;; @@ -48,12 +48,12 @@ done set -- "${POSITIONAL_ARGS[@]}" -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") -source $TOPDIR/scripts/utils.sh +source "$TOPDIR"/scripts/utils.sh # honor $TMPDIR if it is already defined, use /tmp otherwise -if [ -z $TMPDIR ]; then +if [ -z "$TMPDIR" ]; then export WORKDIR=/tmp/$USER else export WORKDIR=$TMPDIR/$USER @@ -63,9 +63,9 @@ TMPDIR=$(mktemp -d) echo ">> Setting up environment..." -source $TOPDIR/init/minimal_eessi_env +source "$TOPDIR"/init/minimal_eessi_env -if [ -d $EESSI_CVMFS_REPO ]; then +if [ -d "$EESSI_CVMFS_REPO" ]; then echo_green "$EESSI_CVMFS_REPO available, OK!" else fatal_error "$EESSI_CVMFS_REPO is not available!" @@ -88,8 +88,9 @@ if [[ "$EASYBUILD_OPTARCH" == "GENERIC" ]]; then fi echo ">> Determining software subdirectory to use for current build host..." 
-if [ -z $EESSI_SOFTWARE_SUBDIR_OVERRIDE ]; then - export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) +if [ -z "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]; then + EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "$TOPDIR"/eessi_software_subdir.py "$DETECTION_PARAMETERS") + export EESSI_SOFTWARE_SUBDIR_OVERRIDE echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script" else echo ">> Picking up pre-defined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE: ${EESSI_SOFTWARE_SUBDIR_OVERRIDE}" @@ -98,7 +99,7 @@ fi # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables -EESSI_SILENT=1 EESSI_BASIC_ENV=1 source $TOPDIR/init/eessi_environment_variables +EESSI_SILENT=1 EESSI_BASIC_ENV=1 source "$TOPDIR"/init/eessi_environment_variables if [[ -z ${EESSI_SOFTWARE_SUBDIR} ]]; then fatal_error "Failed to determine software subdirectory?!" @@ -109,9 +110,10 @@ else fi echo ">> Initializing Lmod..." -source $EPREFIX/usr/share/Lmod/init/bash +source "$EPREFIX"/usr/share/Lmod/init/bash ml_version_out=$TMPDIR/ml.out -ml --version &> $ml_version_out +ml --version &> "$ml_version_out" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> Found Lmod ${LMOD_VERSION}" else @@ -119,14 +121,14 @@ else fi echo ">> Configuring EasyBuild..." -source $TOPDIR/configure_easybuild +source "$TOPDIR"/configure_easybuild echo ">> Setting up \$MODULEPATH..." # make sure no modules are loaded module --force purge # ignore current $MODULEPATH entirely -module unuse $MODULEPATH -module use $EASYBUILD_INSTALLPATH/modules/all +module unuse "$MODULEPATH" +module use "$EASYBUILD_INSTALLPATH"/modules/all if [[ -z ${MODULEPATH} ]]; then fatal_error "Failed to set up \$MODULEPATH?!" else @@ -137,7 +139,8 @@ REQ_EB_VERSION='4.5.0' echo ">> Checking for EasyBuild module..." 
ml_av_easybuild_out=$TMPDIR/ml_av_easybuild.out -module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} +module avail 2>&1 | grep -i easybuild/${REQ_EB_VERSION} &> "${ml_av_easybuild_out}" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> EasyBuild module found!" else @@ -146,7 +149,7 @@ else EB_TMPDIR=${TMPDIR}/ebtmp echo ">> Temporary installation (in ${EB_TMPDIR})..." pip_install_out=${TMPDIR}/pip_install.out - pip3 install --prefix $EB_TMPDIR easybuild &> ${pip_install_out} + pip3 install --prefix "$EB_TMPDIR" easybuild &> "${pip_install_out}" # keep track of original $PATH and $PYTHONPATH values, so we can restore them ORIG_PATH=$PATH @@ -154,11 +157,12 @@ else echo ">> Final installation in ${EASYBUILD_INSTALLPATH}..." export PATH=${EB_TMPDIR}/bin:$PATH - export PYTHONPATH=$(ls -d ${EB_TMPDIR}/lib/python*/site-packages):$PYTHONPATH + PYTHONPATH=$(ls -d "${EB_TMPDIR}"/lib/python*/site-packages):$PYTHONPATH + export PYTHONPATH eb_install_out=${TMPDIR}/eb_install.out ok_msg="Latest EasyBuild release installed, let's go!" fail_msg="Installing latest EasyBuild release failed, that's not good... (output: ${eb_install_out})" - eb --install-latest-eb-release &> ${eb_install_out} + eb --install-latest-eb-release &> "${eb_install_out}" check_exit_code $? "${ok_msg}" "${fail_msg}" # restore origin $PATH and $PYTHONPATH values @@ -169,11 +173,11 @@ else if [[ $? -eq 0 ]]; then ok_msg="EasyBuild v${REQ_EB_VERSION} installed, alright!" fail_msg="Installing EasyBuild v${REQ_EB_VERSION}, yikes! (output: ${eb_install_out})" - eb EasyBuild-${REQ_EB_VERSION}.eb >> ${eb_install_out} 2>&1 + eb EasyBuild-${REQ_EB_VERSION}.eb >> "${eb_install_out}" 2>&1 check_exit_code $? "${ok_msg}" "${fail_msg}" fi - module avail easybuild/${REQ_EB_VERSION} &> ${ml_av_easybuild_out} + module avail easybuild/${REQ_EB_VERSION} &> "${ml_av_easybuild_out}" if [[ $? -eq 0 ]]; then echo_green ">> EasyBuild module installed!" 
else @@ -184,7 +188,8 @@ fi echo ">> Loading EasyBuild module..." module load EasyBuild/$REQ_EB_VERSION eb_show_system_info_out=${TMPDIR}/eb_show_system_info.out -$EB --show-system-info > ${eb_show_system_info_out} +$EB --show-system-info > "${eb_show_system_info_out}" +# shellcheck disable=SC2181 if [[ $? -eq 0 ]]; then echo_green ">> EasyBuild seems to be working!" $EB --version | grep "${REQ_EB_VERSION}" @@ -196,7 +201,7 @@ if [[ $? -eq 0 ]]; then fi $EB --show-config else - cat ${eb_show_system_info_out} + cat "${eb_show_system_info_out}" fatal_error "EasyBuild not working?!" fi @@ -237,6 +242,7 @@ if [[ $GENERIC -eq 1 ]]; then else openblas_include_easyblocks_from_pr='' fi +# shellcheck disable=SC2086 $EB $openblas_include_easyblocks_from_pr OpenBLAS-0.3.9-GCC-9.3.0.eb --robot check_exit_code $? "${ok_msg}" "${fail_msg}" @@ -410,6 +416,7 @@ $EB CMake-3.20.1-GCCcore-10.3.0.eb --robot --include-easyblocks-from-pr 2248 $EB --from-pr 14584 Rust-1.52.1-GCCcore-10.3.0.eb --robot # use OpenBLAS easyconfig from https://github.com/easybuilders/easybuild-easyconfigs/pull/15885 # which includes a patch to fix installation on POWER +# shellcheck disable=SC2086 $EB $openblas_include_easyblocks_from_pr --from-pr 15885 OpenBLAS-0.3.15-GCC-10.3.0.eb --robot # ignore failing FlexiBLAS tests when building on POWER; # some tests are failing due to a segmentation fault due to "invalid memory reference", @@ -475,14 +482,14 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" echo ">> Creating/updating Lmod cache..." export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua" -if [ ! -f $LMOD_RC ]; then - python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH} +if [ ! -f "$LMOD_RC" ]; then + python3 "$TOPDIR"/create_lmodrc.py "${EASYBUILD_INSTALLPATH}" check_exit_code $? 
"$LMOD_RC created" "Failed to create $LMOD_RC" fi -$TOPDIR/update_lmod_cache.sh ${EPREFIX} ${EASYBUILD_INSTALLPATH} +"$TOPDIR"/update_lmod_cache.sh "${EPREFIX}" "${EASYBUILD_INSTALLPATH}" -$TOPDIR/check_missing_installations.sh +"$TOPDIR"/check_missing_installations.sh echo ">> Cleaning up ${TMPDIR}..." -rm -r ${TMPDIR} +rm -r "${TMPDIR}" diff --git a/gpu_support/README.md b/gpu_support/README.md index f89e905c96..2b075a29dd 100644 --- a/gpu_support/README.md +++ b/gpu_support/README.md @@ -1,18 +1,26 @@ # How to add GPU support The collection of scripts in this directory enables you to add GPU support to your setup. -Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free to contribute that though!). +Note that currently this means that CUDA support can be added for Nvidia GPUs. AMD GPUs are not yet supported (feel free +to contribute that though!). To enable the usage of the CUDA runtime in your setup, simply run the following script: ``` ./add_nvidia_gpu_support.sh ``` -This script will install the compatibility libraries (and only those by default!) you need to use the shipped runtime environment of CUDA. +This script will install the compatibility libraries (and only those by default!) you need to use the shipped runtime +environment of CUDA. -If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script execution as follows: +If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script +execution as follows: ``` export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh ``` This will again install the needed compatibility libraries as well as the whole CUDA suite. ## Prerequisites and tips -* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi` but can be configured in your CVMFS config file to point somewhere else). 
If you would like to make a system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. -* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected). +* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to + `/opt/eessi` but can be configured in your CernVM-FS config file to point somewhere else). If you would like to make a + system-wide installation you should change this in your configuration to point somewhere on a shared filesystem. +* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your + CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed + if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library + installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected).
diff --git a/gpu_support/add_nvidia_gpu_support.sh b/gpu_support/add_nvidia_gpu_support.sh index 69fefc2b12..32fe500a5c 100755 --- a/gpu_support/add_nvidia_gpu_support.sh +++ b/gpu_support/add_nvidia_gpu_support.sh @@ -3,9 +3,9 @@ # Drop into the prefix shell or pipe this script into a Prefix shell with # $EPREFIX/startprefix <<< /path/to/this_script.sh -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") -source $TOPDIR/../scripts/utils.sh +source "$TOPDIR"/../scripts/utils.sh install_cuda="${INSTALL_CUDA:=false}" eessi_version="${EESSI_PILOT_VERSION:=latest}" @@ -14,12 +14,13 @@ if [ ! "$eessi_version" = "latest" ]; then fi # Initialise EESSI environment +# shellcheck disable=SC1090 EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/"${eessi_version}"/init/bash # Expectation is we are in a Prefix shell (as we need certain commands), let's check check_in_prefix_shell # If you want to install CUDA support on login nodes (typically without GPUs), -# set this variable to true. This will skip all GPU-dependent checks +# set environment variable to true. This will skip all GPU-dependent checks install_wo_gpu=false [ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true @@ -28,6 +29,7 @@ install_wo_gpu=false if [[ "${install_wo_gpu}" != "true" ]]; then if command -v nvidia-smi > /dev/null 2>&1; then nvidia-smi > /dev/null 2>&1 + # shellcheck disable=SC2181 if [ $? 
-ne 0 ]; then error="nvidia-smi was found but returned error code, exiting now...\n" error="${error}If you do not have a GPU on this device but wish to force the installation,\n" @@ -75,7 +77,7 @@ latest_cuda_version="${cuda_versions[0]}" # EESSI starts with CUDA 11, no need if [ "${install_cuda}" != false ]; then for cuda_version in "${cuda_versions[@]}" do - "$TOPDIR"/cuda_utils/install_cuda_host_injections.sh "${latest_cuda_version}" + "$TOPDIR"/cuda_utils/install_cuda_host_injections.sh "${cuda_version}" done fi ############################################################################################### diff --git a/gpu_support/cuda_utils/get_cuda_compatlibs.sh b/gpu_support/cuda_utils/get_cuda_compatlibs.sh index c9f29a379a..07194a6c74 100755 --- a/gpu_support/cuda_utils/get_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/get_cuda_compatlibs.sh @@ -9,11 +9,11 @@ eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}" # build URL for CUDA libraries # take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries -cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/"${eessi_cpu_family}"/" -# get all versions in decending order -files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort ) +cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${eessi_cpu_family}/" +# get all versions in descending order +files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/"${eessi_cpu_family}"/bin/sort -r --version-sort ) if [[ -z "${files// }" ]]; then - echo "Could not find any compat lib files under" ${cuda_url} + echo "Could not find any compat lib files under ${cuda_url}" exit 1 fi for file in $files; do echo "${cuda_url}$file"; done diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh 
b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index e9da39c398..cb1ddbd7ed 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Initialise our bash functions -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") source "$TOPDIR"/../../scripts/utils.sh # Expect to be in a prefix shell so we know all our required commands exist @@ -24,13 +24,12 @@ eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_L cuda_major_minor=${required_cuda_version%.*} if [[ ${driver_cuda_version%.*} =~ ^[0-9]+$ ]]; then - if float_greater_than $driver_cuda_version $eessi_cuda_version ; then + if float_greater_than "$driver_cuda_version" "$eessi_cuda_version" ; then echo_yellow "You need to update your CUDA compatibility libraries!" elif [[ ${eessi_cuda_version%.*} =~ ^[0-9]+$ ]]; then - if float_greater_than $eessi_cuda_version $cuda_major_minor ; then + if float_greater_than "$eessi_cuda_version" "$cuda_major_minor" ; then echo_yellow "Existing CUDA compatibility libraries in EESSI should be ok, testing..." - $TOPDIR/test_cuda.sh "${required_cuda_version}" - if [ $? -eq 0 ]; then + if "$TOPDIR"/test_cuda.sh "${required_cuda_version}" ; then exit 0 else echo_yellow "Seems not, continuing to install requested version..." 
@@ -44,10 +43,10 @@ fi # Grab the latest compat library RPM or deb # Download and unpack in temporary directory, easier cleanup after installation tmpdir=$(mktemp -d) -cd $tmpdir +cd "$tmpdir" ||fatal_error "Couldd not 'cd' to ${tmpdir}" compat_file=${libs_url##*/} -wget ${libs_url} -echo $compat_file +wget "${libs_url}" +echo "$compat_file" # Unpack it # rpm files are the default for all OSes @@ -57,15 +56,16 @@ if [[ ${file_extension} == "rpm" ]]; then # Load p7zip to extract files from rpm file module load p7zip # Extract .cpio - 7z x ${compat_file} + 7z x "${compat_file}" # Extract lib* - 7z x ${compat_file/rpm/cpio} + 7z x "${compat_file/rpm/cpio}" # Restore symlinks - cd usr/local/cuda-*/compat - ls *.so *.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}" - cd - + cd usr/local/cuda-*/compat || fatal_error "Could not 'cd' to $(echo "$PWD"/usr/local/cuda-*/compat)" + # shellcheck disable=SC2011 + ls ./*.so ./*.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}" + cd - || fatal_error "Could not 'cd -' from extracted compat dir" elif [[ ${file_extension} == "deb" ]]; then - ar x ${compat_file} + ar x "${compat_file}" tar xf data.tar.* else echo "File extension of cuda compat lib not supported, exiting now..." >&2 @@ -76,17 +76,17 @@ fi if ! create_directory_structure $host_injections_dir ; then fatal_error "Cannot create/write to $host_injections_dir space, exiting now..." fi -cd $host_injections_dir +cd $host_injections_dir || fatal_error "Could not 'cd' to $host_injections_dir" # install the compat libs -cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*) +cuda_dir=$(basename "${tmpdir}"/usr/local/cuda-*) # TODO: This would prevent error messages if folder already exists, but # could be problematic if only some files are missing in destination dir -rm -rf ${cuda_dir} -mv -n ${tmpdir}/usr/local/cuda-* . -rm -r ${tmpdir} +rm -rf "${cuda_dir}" +mv -n "${tmpdir}"/usr/local/cuda-* . 
+rm -r "${tmpdir}" # Add a symlink that points the latest version to the version we just installed -ln -sfn ${cuda_dir} latest +ln -sfn "${cuda_dir}" latest if [ ! -e latest ] ; then fatal_error "Symlink to latest cuda compat lib version is broken, exiting now..." @@ -95,19 +95,19 @@ fi # Symlink in the path to the latest libraries if [ ! -d "${host_injection_linker_dir}/lib" ]; then # Create the space to host the libraries for the linker - if ! create_directory_structure ${host_injection_linker_dir} ; then + if ! create_directory_structure "${host_injection_linker_dir}" ; then fatal_error "Cannot create/write to ${host_injection_linker_dir} space, exiting now..." fi - ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib + ln -s ${host_injections_dir}/latest/compat "${host_injection_linker_dir}"/lib elif [ ! "${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then error_msg="CUDA compat libs symlink exists but points to the wrong location, please fix this...\n" error_msg="${error_msg}${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat" - fatal_error $error_msg + fatal_error "$error_msg" fi # return to initial dir -cd $current_dir +cd "$current_dir" || fatal_error "Could not 'cd' to $current_dir" echo echo CUDA driver compatability drivers installed for CUDA version: -echo ${cuda_dir/cuda-/} +echo "${cuda_dir/cuda-/}" diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index 9de02b6ae1..ec210f5781 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Initialise our bash functions -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") source "$TOPDIR"/../../scripts/utils.sh install_cuda_version=$1 @@ -12,7 +12,7 @@ MAXLOOPS=12 # if not find the latest version of the compatibility 
libraries and install them # get URL to latest CUDA compat libs, exit if URL is invalid -cuda_compat_urls="$($TOPDIR/get_cuda_compatlibs.sh)" +cuda_compat_urls="$("$TOPDIR"/get_cuda_compatlibs.sh)" ret=$? if [ $ret -ne 0 ]; then fatal_error "Couldn't find current URLs of the CUDA compat libraries, instead got:\n $cuda_compat_urls" @@ -21,11 +21,12 @@ fi # loop over the compat library versions until we get one that works for us keep_driver_check=1 # Do a maximum of MAXLOOPS attempts +# shellcheck disable=SC2034 for i in $(seq 1 $MAXLOOPS) do latest_cuda_compat_url=$(echo "$cuda_compat_urls" | cut -d " " -f1) # Chomp that value out of the list - cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) + cuda_compat_urls=$(echo "$cuda_compat_urls" | cut -d " " -f2-) latest_driver_version="${latest_cuda_compat_url%-*}" latest_driver_version="${latest_driver_version##*-}" # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed @@ -59,13 +60,12 @@ do fi if [ "${install_compat_libs}" == true ]; then - $TOPDIR/install_cuda_compatlibs.sh ${latest_cuda_compat_url} ${install_cuda_version} + "$TOPDIR"/install_cuda_compatlibs.sh "${latest_cuda_compat_url}" "${install_cuda_version}" fi - if [[ "${install_wo_gpu}" != "true" ]]; then - $TOPDIR/test_cuda.sh "${install_cuda_version}" - if [ $? 
-eq 0 ] - then + if [[ "${INSTALL_WO_GPU}" != "true" ]]; then + + if "$TOPDIR"/test_cuda.sh "${install_cuda_version}" ; then cuda_version_file="${host_injections_dir}/latest/version.txt" echo "${install_cuda_version}" > ${cuda_version_file} exit 0 diff --git a/gpu_support/cuda_utils/test_cuda.sh b/gpu_support/cuda_utils/test_cuda.sh index d0623bf9a0..a39cdd1cc8 100755 --- a/gpu_support/cuda_utils/test_cuda.sh +++ b/gpu_support/cuda_utils/test_cuda.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Initialise our bash functions -TOPDIR=$(dirname $(realpath $0)) +TOPDIR=$(dirname "$(realpath "$0")") source "$TOPDIR"/../../scripts/utils.sh if [[ $# -eq 0 ]] ; then diff --git a/scripts/utils.sh b/scripts/utils.sh index 8fe8486a04..07760f0dd0 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -57,7 +57,7 @@ function float_greater_than() { fi # Now do the actual evaluation return_code=$ANY_ERROR_EXITCODE - result=$(echo $1 $2 | awk '{if ($1 > $2) print "true"}') + result=$(echo "$1" "$2" | awk '{if ($1 > $2) print "true"}') if [ "$result" = true ] ; then return_code=0 fi From a8ce9678087c26f71862852b2b0e4a25e2bfd440 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Sun, 5 Mar 2023 19:25:30 +0100 Subject: [PATCH 48/49] Only use curl, also be careful about deliberate splitting --- EESSI-pilot-install-software.sh | 3 ++- gpu_support/cuda_utils/install_cuda_compatlibs.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index aa22d582e8..d3cbfe27ed 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -89,7 +89,8 @@ fi echo ">> Determining software subdirectory to use for current build host..." 
if [ -z "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]; then - EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "$TOPDIR"/eessi_software_subdir.py "$DETECTION_PARAMETERS") + # shellcheck disable=SC2086 + EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 "$TOPDIR"/eessi_software_subdir.py $DETECTION_PARAMETERS) export EESSI_SOFTWARE_SUBDIR_OVERRIDE echo ">> Determined \$EESSI_SOFTWARE_SUBDIR_OVERRIDE via 'eessi_software_subdir.py $DETECTION_PARAMETERS' script" else diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index cb1ddbd7ed..e9372994e0 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -45,7 +45,7 @@ fi tmpdir=$(mktemp -d) cd "$tmpdir" ||fatal_error "Couldd not 'cd' to ${tmpdir}" compat_file=${libs_url##*/} -wget "${libs_url}" +curl -O "${libs_url}" echo "$compat_file" # Unpack it From 9c2d26739f7978a51741ec6af86c8b78232fd7c8 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Sun, 5 Mar 2023 20:48:46 +0100 Subject: [PATCH 49/49] Allow splitting when required --- gpu_support/cuda_utils/install_cuda_compatlibs.sh | 2 +- gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs.sh b/gpu_support/cuda_utils/install_cuda_compatlibs.sh index e9372994e0..f3bad4b695 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs.sh @@ -43,7 +43,7 @@ fi # Grab the latest compat library RPM or deb # Download and unpack in temporary directory, easier cleanup after installation tmpdir=$(mktemp -d) -cd "$tmpdir" ||fatal_error "Couldd not 'cd' to ${tmpdir}" +cd "$tmpdir" ||fatal_error "Could not 'cd' to ${tmpdir}" compat_file=${libs_url##*/} curl -O "${libs_url}" echo "$compat_file" diff --git a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh 
b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh index ec210f5781..2d7efc189c 100755 --- a/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh +++ b/gpu_support/cuda_utils/install_cuda_compatlibs_loop.sh @@ -24,9 +24,11 @@ keep_driver_check=1 # shellcheck disable=SC2034 for i in $(seq 1 $MAXLOOPS) do - latest_cuda_compat_url=$(echo "$cuda_compat_urls" | cut -d " " -f1) + # shellcheck disable=SC2086 + latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1) # Chomp that value out of the list - cuda_compat_urls=$(echo "$cuda_compat_urls" | cut -d " " -f2-) + # shellcheck disable=SC2086 + cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-) latest_driver_version="${latest_cuda_compat_url%-*}" latest_driver_version="${latest_driver_version##*-}" # URLs differ for different OSes; check if we already have a number, if not remove string part that is not needed