From a86c61462b5d0dab3df979b50aed79ad1b055d0a Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 11:01:25 +0100 Subject: [PATCH 01/31] Add CUDA support to software_layer --- EESSI-pilot-install-software.sh | 28 ++++ eb_hooks.py | 265 ++++++++++++++++++++++---------- eessi-2021.12.yml | 14 ++ 3 files changed, 227 insertions(+), 80 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 2bc6876965..1698066bf4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -394,6 +394,34 @@ $EB --from-pr 15885 OpenBLAS-0.3.15-GCC-10.3.0.eb --robot $EB SciPy-bundle-2021.05-foss-2021a.eb -r --buildpath /dev/shm/$USER/easybuild_build check_exit_code $? "${ok_msg}" "${fail_msg}" +# CUDA support + +# install p7zip (to be able to unpack RPMs) +p7zip_ec="p7zip-17.04-GCCcore-10.3.0.eb" +echo ">> Installing $p7zip_ec..." +ok_msg="$p7zip_ec installed, off to a good (?) start!" +fail_msg="Failed to install $p7zip_ec, woopsie..." +$EB $p7zip_ec --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA (uses eb_hooks.py to only install runtime) +cuda_ec="CUDA-11.3.1.eb" +echo ">> Installing $cuda_ec..." +ok_msg="$cuda_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_ec, woopsie..." +$EB $cuda_ec --robot +check_exit_code $? "${ok_msg}" "${fail_msg}" + +# install CUDA samples (requires EESSI support for CUDA) +# TODO Run EESSI NVIDIA GPU support script here +# (which unbreaks the symlinks from the runtime installation) +cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" +echo ">> Installing $cuda_samples_ec..." +ok_msg="$cuda_ec installed, off to a good (?) start!" +fail_msg="Failed to install $cuda_samples_ec, woopsie..." +$EB $cuda_samples_ec --robot --from-pr=16914 +check_exit_code $? "${ok_msg}" "${fail_msg}" + ### add packages here echo ">> Creating/updating Lmod cache..." 
diff --git a/eb_hooks.py b/eb_hooks.py index df7742f999..c5a0ca9cca 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -8,51 +8,56 @@ from easybuild.tools.systemtools import AARCH64, POWER, X86_64, get_cpu_architecture, get_cpu_features from easybuild.tools.toolchain.compiler import OPTARCH_GENERIC -EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs' +EESSI_RPATH_OVERRIDE_ATTR = "orig_rpath_override_dirs" + +CUDA_ENABLED_TOOLCHAINS = [ + "fosscuda", + "gcccuda", + "gimpic", + "giolfc", + "gmklc", + "golfc", + "gomklc", + "gompic", + "goolfc", + "iccifortcuda", + "iimklc", + "iimpic", + "intelcuda", + "iomklc", + "iompic", + "nvompic", + "nvpsmpic", +] +PARSE_HOOKS = { + "CGAL": cgal_toolchainopts_precise, + "fontconfig": fontconfig_add_fonts, + "UCX": ucx_eprefix, +} -def get_eessi_envvar(eessi_envvar): - """Get an EESSI environment variable from the environment""" - - eessi_envvar_value = os.getenv(eessi_envvar) - if eessi_envvar_value is None: - raise EasyBuildError("$%s is not defined!", eessi_envvar) - - return eessi_envvar_value - - -def get_rpath_override_dirs(software_name): - # determine path to installations in software layer via $EESSI_SOFTWARE_PATH - eessi_software_path = get_eessi_envvar('EESSI_SOFTWARE_PATH') - eessi_pilot_version = get_eessi_envvar('EESSI_PILOT_VERSION') - - # construct the rpath override directory stub - rpath_injection_stub = os.path.join( - # Make sure we are looking inside the `host_injections` directory - eessi_software_path.replace(eessi_pilot_version, os.path.join('host_injections', eessi_pilot_version), 1), - # Add the subdirectory for the specific software - 'rpath_overrides', - software_name, - # We can't know the version, but this allows the use of a symlink - # to facilitate version upgrades without removing files - 'system', - ) - - # Allow for libraries in lib or lib64 - rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ('lib', 'lib64')] +PRE_CONFIGURE_HOOKS = { + "libfabric": 
libfabric_disable_psm3_x86_64_generic, + "MetaBAT": metabat_preconfigure, + "WRF": wrf_preconfigure, +} - return rpath_injection_dirs +POST_PACKAGE_HOOKS = { + "CUDA": cuda_postpackage, +} def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" # determine path to Prefix installation in compat layer via $EPREFIX - eprefix = get_eessi_envvar('EPREFIX') + eprefix = get_eessi_envvar("EPREFIX") if ec.name in PARSE_HOOKS: PARSE_HOOKS[ec.name](ec, eprefix) + ec = inject_gpu_property(ec) + def pre_configure_hook(self, *args, **kwargs): """Main pre-configure hook: trigger custom functions based on software name.""" @@ -74,19 +79,21 @@ def pre_prepare_hook(self, *args, **kwargs): # update the relevant option (but keep the original value so we can reset it later) if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): - raise EasyBuildError("'self' already has attribute %s! Can't use pre_prepare hook.", - EESSI_RPATH_OVERRIDE_ATTR) + raise EasyBuildError( + "'self' already has attribute %s! 
Can't use pre_prepare hook.", EESSI_RPATH_OVERRIDE_ATTR + ) - setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option('rpath_override_dirs')) + setattr(self, EESSI_RPATH_OVERRIDE_ATTR, build_option("rpath_override_dirs")) if getattr(self, EESSI_RPATH_OVERRIDE_ATTR): # self.EESSI_RPATH_OVERRIDE_ATTR is (already) a colon separated string, let's make it a list orig_rpath_override_dirs = [getattr(self, EESSI_RPATH_OVERRIDE_ATTR)] - rpath_override_dirs = ':'.join(orig_rpath_override_dirs + mpi_rpath_override_dirs) + rpath_override_dirs = ":".join(orig_rpath_override_dirs + mpi_rpath_override_dirs) else: - rpath_override_dirs = ':'.join(mpi_rpath_override_dirs) - update_build_option('rpath_override_dirs', rpath_override_dirs) - print_msg("Updated rpath_override_dirs (to allow overriding MPI family %s): %s", - mpi_family, rpath_override_dirs) + rpath_override_dirs = ":".join(mpi_rpath_override_dirs) + update_build_option("rpath_override_dirs", rpath_override_dirs) + print_msg( + "Updated rpath_override_dirs (to allow overriding MPI family %s): %s", mpi_family, rpath_override_dirs + ) def post_prepare_hook(self, *args, **kwargs): @@ -94,30 +101,78 @@ def post_prepare_hook(self, *args, **kwargs): if hasattr(self, EESSI_RPATH_OVERRIDE_ATTR): # Reset the value of 'rpath_override_dirs' now that we are finished with it - update_build_option('rpath_override_dirs', getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) + update_build_option("rpath_override_dirs", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) print_msg("Resetting rpath_override_dirs to original value: %s", getattr(self, EESSI_RPATH_OVERRIDE_ATTR)) delattr(self, EESSI_RPATH_OVERRIDE_ATTR) +def pre_configure_hook(self, *args, **kwargs): + """Main pre-configure hook: trigger custom functions based on software name.""" + if self.name in PRE_CONFIGURE_HOOKS: + PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) + + +def post_package_hook(self, *args, **kwargs): + """Main post-package hook: trigger custom functions based on software 
name.""" + if self.name in POST_PACKAGE_HOOKS: + POST_PACKAGE_HOOKS[self.name](self, *args, **kwargs) + + +# Functions used by hooks + + +def get_eessi_envvar(eessi_envvar): + """Get an EESSI environment variable from the environment""" + + eessi_envvar_value = os.getenv(eessi_envvar) + if eessi_envvar_value is None: + raise EasyBuildError("$%s is not defined!", eessi_envvar) + + return eessi_envvar_value + + +def get_rpath_override_dirs(software_name): + # determine path to installations in software layer via $EESSI_SOFTWARE_PATH + eessi_software_path = get_eessi_envvar("EESSI_SOFTWARE_PATH") + eessi_pilot_version = get_eessi_envvar("EESSI_PILOT_VERSION") + + # construct the rpath override directory stub + rpath_injection_stub = os.path.join( + # Make sure we are looking inside the `host_injections` directory + eessi_software_path.replace(eessi_pilot_version, os.path.join("host_injections", eessi_pilot_version), 1), + # Add the subdirectory for the specific software + "rpath_overrides", + software_name, + # We can't know the version, but this allows the use of a symlink + # to facilitate version upgrades without removing files + "system", + ) + + # Allow for libraries in lib or lib64 + rpath_injection_dirs = [os.path.join(rpath_injection_stub, x) for x in ("lib", "lib64")] + + return rpath_injection_dirs + + def cgal_toolchainopts_precise(ec, eprefix): """Enable 'precise' rather than 'strict' toolchain option for CGAL on POWER.""" - if ec.name == 'CGAL': + if ec.name == "CGAL": if get_cpu_architecture() == POWER: # 'strict' implies '-mieee-fp', which is not supported on POWER # see https://github.com/easybuilders/easybuild-framework/issues/2077 - ec['toolchainopts']['strict'] = False - ec['toolchainopts']['precise'] = True - print_msg("Tweaked toochainopts for %s: %s", ec.name, ec['toolchainopts']) + ec["toolchainopts"]["strict"] = False + ec["toolchainopts"]["precise"] = True + print_msg("Tweaked toochainopts for %s: %s", ec.name, ec["toolchainopts"]) else: raise 
EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!") def fontconfig_add_fonts(ec, eprefix): """Inject --with-add-fonts configure option for fontconfig.""" - if ec.name == 'fontconfig': + if ec.name == "fontconfig": # make fontconfig aware of fonts included with compat layer - with_add_fonts = '--with-add-fonts=%s' % os.path.join(eprefix, 'usr', 'share', 'fonts') - ec.update('configopts', with_add_fonts) + with_add_fonts = "--with-add-fonts=%s" % os.path.join(eprefix, "usr", "share", "fonts") + ec.update("configopts", with_add_fonts) print_msg("Added '%s' configure option for %s", with_add_fonts, ec.name) else: raise EasyBuildError("fontconfig-specific hook triggered for non-fontconfig easyconfig?!") @@ -125,29 +180,23 @@ def fontconfig_add_fonts(ec, eprefix): def ucx_eprefix(ec, eprefix): """Make UCX aware of compatibility layer via additional configuration options.""" - if ec.name == 'UCX': - ec.update('configopts', '--with-sysroot=%s' % eprefix) - ec.update('configopts', '--with-rdmacm=%s' % os.path.join(eprefix, 'usr')) - print_msg("Using custom configure options for %s: %s", ec.name, ec['configopts']) + if ec.name == "UCX": + ec.update("configopts", "--with-sysroot=%s" % eprefix) + ec.update("configopts", "--with-rdmacm=%s" % os.path.join(eprefix, "usr")) + print_msg("Using custom configure options for %s: %s", ec.name, ec["configopts"]) else: raise EasyBuildError("UCX-specific hook triggered for non-UCX easyconfig?!") -def pre_configure_hook(self, *args, **kwargs): - """Main pre-configure hook: trigger custom functions based on software name.""" - if self.name in PRE_CONFIGURE_HOOKS: - PRE_CONFIGURE_HOOKS[self.name](self, *args, **kwargs) - - def libfabric_disable_psm3_x86_64_generic(self, *args, **kwargs): """Add --disable-psm3 to libfabric configure options when building with --optarch=GENERIC on x86_64.""" - if self.name == 'libfabric': + if self.name == "libfabric": if get_cpu_architecture() == X86_64: - generic = 
build_option('optarch') == OPTARCH_GENERIC - no_avx = 'avx' not in get_cpu_features() + generic = build_option("optarch") == OPTARCH_GENERIC + no_avx = "avx" not in get_cpu_features() if generic or no_avx: - self.cfg.update('configopts', '--disable-psm3') - print_msg("Using custom configure options for %s: %s", self.name, self.cfg['configopts']) + self.cfg.update("configopts", "--disable-psm3") + print_msg("Using custom configure options for %s: %s", self.name, self.cfg["configopts"]) else: raise EasyBuildError("libfabric-specific hook triggered for non-libfabric easyconfig?!") @@ -158,10 +207,10 @@ def metabat_preconfigure(self, *args, **kwargs): - take into account that zlib is a filtered dependency, and that there's no libz.a in the EESSI compat layer """ - if self.name == 'MetaBAT': - configopts = self.cfg['configopts'] + if self.name == "MetaBAT": + configopts = self.cfg["configopts"] regex = re.compile(r"\$EBROOTZLIB/lib/libz.a") - self.cfg['configopts'] = regex.sub('$EPREFIX/usr/lib64/libz.so', configopts) + self.cfg["configopts"] = regex.sub("$EPREFIX/usr/lib64/libz.so", configopts) else: raise EasyBuildError("MetaBAT-specific hook triggered for non-MetaBAT easyconfig?!") @@ -171,24 +220,80 @@ def wrf_preconfigure(self, *args, **kwargs): Pre-configure hook for WRF: - patch arch/configure_new.defaults so building WRF with foss toolchain works on aarch64 """ - if self.name == 'WRF': + if self.name == "WRF": if get_cpu_architecture() == AARCH64: pattern = "Linux x86_64 ppc64le, gfortran" repl = "Linux x86_64 aarch64 ppc64le, gfortran" - self.cfg.update('preconfigopts', "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) - print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg['preconfigopts']) + self.cfg.update("preconfigopts", "sed -i 's/%s/%s/g' arch/configure_new.defaults && " % (pattern, repl)) + print_msg("Using custom preconfigopts for %s: %s", self.name, self.cfg["preconfigopts"]) else: raise 
EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") -PARSE_HOOKS = { - 'CGAL': cgal_toolchainopts_precise, - 'fontconfig': fontconfig_add_fonts, - 'UCX': ucx_eprefix, -} - -PRE_CONFIGURE_HOOKS = { - 'libfabric': libfabric_disable_psm3_x86_64_generic, - 'MetaBAT': metabat_preconfigure, - 'WRF': wrf_preconfigure, -} +def cuda_post_package(self, *args, **kwargs): + """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections.""" + print_msg("Replacing CUDA stuff we cannot ship with symlinks...") + # read CUDA EULA + eula_path = os.path.join(self.installdir, "EULA.txt") + tmp_buffer = [] + with open(eula_path) as infile: + copy = False + for line in infile: + if line.strip() == "2.6. Attachment A": + copy = True + continue + elif line.strip() == "2.7. Attachment B": + copy = False + continue + elif copy: + tmp_buffer.append(line) + # create whitelist without file extensions, they're not really needed and they only complicate things + whitelist = [] + file_extensions = [".so", ".a", ".h", ".bc"] + for tmp in tmp_buffer: + for word in tmp.split(): + if any(ext in word for ext in file_extensions): + whitelist.append(word.split(".")[0]) + whitelist = list(set(whitelist)) + # iterate over all files in the CUDA path + for root, dirs, files in os.walk(self.installdir): + for filename in files: + # we only really care about real files, i.e. 
not symlinks + if not os.path.islink(os.path.join(root, filename)): + # check if the current file is part of the whitelist + basename = filename.split(".")[0] + if basename not in whitelist: + # if it is not in the whitelist, delete the file and create a symlink to host_injections + source = os.path.join(root, filename) + target = source.replace("versions", "host_injections") + os.remove(source) + # Using os.symlink requires the existence of the target directory, so we use os.system + os.system("ln %s %s" % (target, source)) + + +def inject_gpu_property(ec): + ec_dict = ec.asdict() + # Check if CUDA is in the dependencies, if so add the GPU Lmod tag + if ( + "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])] + or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS + ): + ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version") + key = "modluafooter" + value = 'add_property("arch","gpu")' + cuda_version = 0 + for dep in iter(ec_dict["dependencies"]): + # Make CUDA a build dependency only (rpathing saves us from link errors) + if "CUDA" in dep[0]: + cuda_version = dep[1] + ec_dict["dependencies"].remove(dep) + ec_dict["builddependencies"].append(dep) if dep not in ec_dict["builddependencies"] else ec_dict[ + "builddependencies" + ] + value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version]) + if key in ec_dict: + if not value in ec_dict[key]: + ec[key] = "\n".join([ec_dict[key], value]) + else: + ec[key] = value + return ec diff --git a/eessi-2021.12.yml b/eessi-2021.12.yml index 977c0f9804..bbfadb9daa 100644 --- a/eessi-2021.12.yml +++ b/eessi-2021.12.yml @@ -1,4 +1,14 @@ software: + CUDA: + toolchains: + SYSTEM: + versions: '11.3.1' + CUDA-Samples: + toolchains: + GCC-10.3.0: + versions: + '11.3': + versionsuffix: -CUDA-11.3.1 code-server: toolchains: SYSTEM: @@ -29,6 +39,10 @@ software: toolchains: gompi-2020a: versions: ['5.6.3'] + p7zip: + toolchains: + GCCcore-10.3.0: + versions: ['17.04'] 
QuantumESPRESSO: toolchains: foss-2020a: From 6c41b262d020ce818baa9aedc19c4322d2dee33d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:04:45 +0100 Subject: [PATCH 02/31] singularity install does not seem to install mksquashfs --- .github/workflows/tests_scripts.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 18fcd7b255..24ce26ab61 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -35,6 +35,7 @@ jobs: curl -OL https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/s/$singularity_rpm sudo alien -d $singularity_rpm sudo apt install ./singularity*.deb + sudo apt install mksquashfs singularity --version - name: test install_software_layer.sh script From 7d53b030d7acb343baab2866842a913d795fd363 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:07:38 +0100 Subject: [PATCH 03/31] Trigger script test --- EESSI-pilot-install-software.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 1698066bf4..16c7c48aa2 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -443,3 +443,4 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" echo ">> Cleaning up ${TMPDIR}..." rm -r ${TMPDIR} +echo nothing From 58357b9b6a405e3cc3d2f6e4bfc173e5f519021b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:11:24 +0100 Subject: [PATCH 04/31] Revert --- EESSI-pilot-install-software.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 16c7c48aa2..1698066bf4 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -443,4 +443,3 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" echo ">> Cleaning up ${TMPDIR}..." 
rm -r ${TMPDIR} -echo nothing From 4b6654dcb5a9e9957784adc9c824fa8081842af7 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 13:13:29 +0100 Subject: [PATCH 05/31] Use the right package name for squash-fs --- .github/workflows/tests_scripts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 24ce26ab61..acacaa54a1 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -35,7 +35,7 @@ jobs: curl -OL https://dl.fedoraproject.org/pub/epel/8/Everything/x86_64/Packages/s/$singularity_rpm sudo alien -d $singularity_rpm sudo apt install ./singularity*.deb - sudo apt install mksquashfs + sudo apt install squashfs-tools singularity --version - name: test install_software_layer.sh script From 33ce58497b3cc5beff81b5d8f0265d997a0b4b15 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 14:14:09 +0100 Subject: [PATCH 06/31] Tidy up hooks --- eb_hooks.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index c5a0ca9cca..3ab991771f 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -30,22 +30,6 @@ "nvpsmpic", ] -PARSE_HOOKS = { - "CGAL": cgal_toolchainopts_precise, - "fontconfig": fontconfig_add_fonts, - "UCX": ucx_eprefix, -} - -PRE_CONFIGURE_HOOKS = { - "libfabric": libfabric_disable_psm3_x86_64_generic, - "MetaBAT": metabat_preconfigure, - "WRF": wrf_preconfigure, -} - -POST_PACKAGE_HOOKS = { - "CUDA": cuda_postpackage, -} - def parse_hook(ec, *args, **kwargs): """Main parse hook: trigger custom functions based on software name.""" @@ -230,7 +214,7 @@ def wrf_preconfigure(self, *args, **kwargs): raise EasyBuildError("WRF-specific hook triggered for non-WRF easyconfig?!") -def cuda_post_package(self, *args, **kwargs): +def cuda_postpackage(self, *args, **kwargs): """Delete CUDA files we are not allowed to ship and replace them with a symlink 
to a possible installation under host_injections.""" print_msg("Replacing CUDA stuff we cannot ship with symlinks...") # read CUDA EULA @@ -297,3 +281,19 @@ def inject_gpu_property(ec): else: ec[key] = value return ec + +PARSE_HOOKS = { + "CGAL": cgal_toolchainopts_precise, + "fontconfig": fontconfig_add_fonts, + "UCX": ucx_eprefix, +} + +PRE_CONFIGURE_HOOKS = { + "libfabric": libfabric_disable_psm3_x86_64_generic, + "MetaBAT": metabat_preconfigure, + "WRF": wrf_preconfigure, +} + +POST_PACKAGE_HOOKS = { + "CUDA": cuda_postpackage, +} From f1cd893823227a934e5537bba50cc6b5e9358502 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 16 Dec 2022 15:25:32 +0100 Subject: [PATCH 07/31] Force creation of links --- eb_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 3ab991771f..ad55573660 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -252,7 +252,7 @@ def cuda_postpackage(self, *args, **kwargs): target = source.replace("versions", "host_injections") os.remove(source) # Using os.symlink requires the existence of the target directory, so we use os.system - os.system("ln %s %s" % (target, source)) + os.system("ln -s %s %s" % (target, source)) def inject_gpu_property(ec): From 06a9eaf5eb4118528ac130be31c9b1af26ca4306 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Mon, 19 Dec 2022 23:06:44 +0100 Subject: [PATCH 08/31] Install host_injections CUDA --- EESSI-pilot-install-software.sh | 3 ++ eb_hooks.py | 6 ++-- install_cuda_host_injections.sh | 53 +++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100755 install_cuda_host_injections.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 1698066bf4..94b71e14c9 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -412,6 +412,9 @@ fail_msg="Failed to install $cuda_ec, woopsie..." $EB $cuda_ec --robot check_exit_code $? 
"${ok_msg}" "${fail_msg}" +# Add the host_injections CUDA so we can actually build CUDA apps +./install_cuda_host_injections.sh 11.3.1 + # install CUDA samples (requires EESSI support for CUDA) # TODO Run EESSI NVIDIA GPU support script here # (which unbreaks the symlinks from the runtime installation) diff --git a/eb_hooks.py b/eb_hooks.py index ad55573660..4d31a5f4b4 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -232,7 +232,7 @@ def cuda_postpackage(self, *args, **kwargs): elif copy: tmp_buffer.append(line) # create whitelist without file extensions, they're not really needed and they only complicate things - whitelist = [] + whitelist = ['eula'] file_extensions = [".so", ".a", ".h", ".bc"] for tmp in tmp_buffer: for word in tmp.split(): @@ -252,7 +252,9 @@ def cuda_postpackage(self, *args, **kwargs): target = source.replace("versions", "host_injections") os.remove(source) # Using os.symlink requires the existence of the target directory, so we use os.system - os.system("ln -s %s %s" % (target, source)) + system_command="ln -s %s %s" % (target, source) + if os.system(system_command) != 0: + raise EasyBuildError("Failed to create symbolic link: %s" % system_command) def inject_gpu_property(ec): diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh new file mode 100755 index 0000000000..038ff19ff3 --- /dev/null +++ b/install_cuda_host_injections.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +if [[ $# -eq 0 ]] ; then + echo 'You must provide the CUDA version as an argument, e.g.:' + echo " $0 11.3.1" + exit 1 +fi +install_cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + echo "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" + exit 1 +else + # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) + cuda_install_dir=${EESSI_SOFTWARE_PATH/versions/host_injections} +fi + +# 
Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +# (existence of easybuild subdir implies a successful install) +if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version}/easybuild ]; then + echo "CUDA software found! No need to install CUDA again, proceed with testing." +else + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed + avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') + if (( ${avail_space} < 16000000 )); then + echo "Need more disk space to install CUDA, exiting now..." + exit 1 + fi + if [[ ! -z "${EBROOTEASYBUILD}" ]]; then + echo "Loading EasyBuild module to do actual install" + module load EasyBuild + fi + # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + tmpdir=$(mktemp -d) + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch $tmpdir/none.py + eb ${extra_args} --hooks=$tmpdir/none.py --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + ret=$? + if [ $ret -ne 0 ]; then + echo "CUDA installation failed, please check EasyBuild logs..." 
+ exit 1 + fi + # clean up tmpdir if it exists + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + rm -rf ${tmpdir} + fi +fi From b4e80a153390be28dda32fba288f780a5899d430 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Mon, 19 Dec 2022 23:47:06 +0100 Subject: [PATCH 09/31] Move comments to the right place --- EESSI-pilot-install-software.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 94b71e14c9..b7752d3e83 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -413,11 +413,10 @@ $EB $cuda_ec --robot check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps +# (which unbreaks the symlinks from the runtime installation) ./install_cuda_host_injections.sh 11.3.1 # install CUDA samples (requires EESSI support for CUDA) -# TODO Run EESSI NVIDIA GPU support script here -# (which unbreaks the symlinks from the runtime installation) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" echo ">> Installing $cuda_samples_ec..." ok_msg="$cuda_ec installed, off to a good (?) start!" From 2c8697330155ca91a3ea78f2e3de4d46134877cf Mon Sep 17 00:00:00 2001 From: ocaisa Date: Tue, 14 Feb 2023 17:11:18 +0100 Subject: [PATCH 10/31] Reimplement `mkdir -p` reporting where permissions break down This will allow us to log where creating directory structures under `host_injections` is breaking down. 
--- scripts/utils.sh | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/scripts/utils.sh b/scripts/utils.sh index 5d8455bb68..06fc897912 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -31,3 +31,71 @@ function check_exit_code { fatal_error "${fail_msg}" fi } + +# Reimplement 'mkdir -p' with reporting on where permissions break down +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo "Function requires a single (relative or absolute) path argument" >&2 + return 1 + fi + + # set a persistent variable that knows the full structure + # (i.e., retains the value upon recursive calls) + full_structure="${full_structure:="$1"}" + + local directory_structure="$1" + + # Check if directory exists and is writeable + if [ -d "${directory_structure}" ]; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + if [ -w "${directory_structure}" ]; then + # Nothing to be done + return 0 + else + echo "Directory ${directory_structure} exists but is not writeable" >&2 + return 1 + fi + fi + + local directory_structure_parent=$(dirname "${directory_structure}") + + # If the parent doesn't exist we need to create it + if [ ! -d "${directory_structure_parent}" ]; then + # Create the parent via a recursive call to this function + # (if this doesn't succeed we need to return the error code) + if ! create_directory_structure "${directory_structure_parent}"; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + return 1 + fi + fi + + # Check the parent is writeable, and create the new subdir + if [ -w "${directory_structure_parent}" ]; then + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our (unneeded) global variable + unset full_structure + fi + if ! 
mkdir "${directory_structure}"; then + echo "'mkdir ${directory_structure}' failed for an unknown reason!" >&2 + return 1 + else + # Success! + return 0 + fi + else + echo "Attempt to create ${full_structure} failed," \ + "${directory_structure_parent} exists but you don't have write permissions." >&2 + if [ "${directory_structure}" = "${full_structure}" ]; then + # release our global variable + unset full_structure + fi + return 1 + fi +} From 9590047d008f3be05fc809b6b4b3bbd53c1810cb Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 24 Feb 2023 15:47:34 +0100 Subject: [PATCH 11/31] Be more aggressive on catching errors --- EESSI-pilot-install-software.sh | 14 ++++++ install_cuda_host_injections.sh | 86 +++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 2d1132efdb..d702e96778 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -431,6 +431,16 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support +# Need recent version of EasyBuild +echo ">> Installing EasyBuild 4.5.1..." +ok_msg="EasyBuild v4.7.0 installed" +fail_msg="EasyBuild v4.7.0 failed to install" +$EB --from-pr 17065 --include-easyblocks-from-pr 2893 +check_exit_code $? "${ok_msg}" "${fail_msg}" + +LMOD_IGNORE_CACHE=1 module swap EasyBuild/4.7.0 +check_exit_code $? "Swapped to EasyBuild/4.7.0" "Couldn't swap to EasyBuild/4.7.0" + # install p7zip (to be able to unpack RPMs) p7zip_ec="p7zip-17.04-GCCcore-10.3.0.eb" echo ">> Installing $p7zip_ec..." @@ -450,6 +460,10 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) ./install_cuda_host_injections.sh 11.3.1 +echo ">> Installing $cuda_ec under host_injections..." +ok_msg="$cuda_ec (re)installed under host_injections!" 
+fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." +check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index 038ff19ff3..59c1d72996 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -1,18 +1,18 @@ #!/bin/bash +# Initialise our bash functions +source scripts/utils.sh + if [[ $# -eq 0 ]] ; then - echo 'You must provide the CUDA version as an argument, e.g.:' - echo " $0 11.3.1" - exit 1 + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" fi install_cuda_version=$1 if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - echo "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" - exit 1 + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" else # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) - cuda_install_dir=${EESSI_SOFTWARE_PATH/versions/host_injections} + cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} fi # Only install CUDA if specified version is not found. @@ -20,34 +20,68 @@ fi # always be in versions instead of host_injections and have symlinks pointing # to host_injections for everything we're not allowed to ship # (existence of easybuild subdir implies a successful install) -if [ -d ${cuda_install_dir}/software/CUDA/${install_cuda_version}/easybuild ]; then - echo "CUDA software found! No need to install CUDA again, proceed with testing." +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again, proceed with testing." 
else - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed - avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}') - if (( ${avail_space} < 16000000 )); then - echo "Need more disk space to install CUDA, exiting now..." - exit 1 - fi - if [[ ! -z "${EBROOTEASYBUILD}" ]]; then - echo "Loading EasyBuild module to do actual install" + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." 
+ fi + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." + fatal_error "${error1}${error2}" + fi + + if [[ -z "${EBROOTEASYBUILD}" ]]; then + echo_yellow "Loading EasyBuild module to do actual install" module load EasyBuild fi - # we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI + + # we need the --rebuild option and a (random) dir for the module if the module + # file is shipped with EESSI if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - tmpdir=$(mktemp -d) extra_args="--rebuild --installpath-modules=${tmpdir}" fi # We don't want hooks used in this install, we need a vanilla CUDA installation - touch $tmpdir/none.py - eb ${extra_args} --hooks=$tmpdir/none.py --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb + touch "$tmpdir"/none.py + eb "${extra_args}" --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then - echo "CUDA installation failed, please check EasyBuild logs..." - exit 1 - fi - # clean up tmpdir if it exists - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - rm -rf ${tmpdir} + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" 
fi + # clean up tmpdir + rm -rf "${tmpdir}" fi From 1357f763b5f27a0be82ed6c443ce3213a7e8361e Mon Sep 17 00:00:00 2001 From: ocaisa Date: Mon, 27 Feb 2023 16:43:08 +0100 Subject: [PATCH 12/31] `${extra_args}` is actually multiple args not a single string --- install_cuda_host_injections.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index 59c1d72996..b6f83b4e9d 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -75,7 +75,7 @@ else fi # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py - eb "${extra_args}" --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then fatal_error "CUDA installation failed, please check EasyBuild logs..." From 8096c5460c583f88d83bb8f2b00f204fb8830d3c Mon Sep 17 00:00:00 2001 From: ocaisa Date: Mon, 27 Feb 2023 16:46:33 +0100 Subject: [PATCH 13/31] Update EESSI-pilot-install-software.sh --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index d702e96778..5b604deaa7 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -468,7 +468,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) cuda_samples_ec="CUDA-Samples-11.3-GCC-10.3.0-CUDA-11.3.1.eb" echo ">> Installing $cuda_samples_ec..." -ok_msg="$cuda_ec installed, off to a good (?) start!" +ok_msg="$cuda_samples_ec installed, off to a good (?) start!" fail_msg="Failed to install $cuda_samples_ec, woopsie..." $EB $cuda_samples_ec --robot --from-pr=16914 check_exit_code $? 
"${ok_msg}" "${fail_msg}" From ec31edfbb1954982f197cd44cd296a62e2f47954 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 14:46:56 +0100 Subject: [PATCH 14/31] Catching echo exit code instead of actual code --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index 5b604deaa7..a9f6576866 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -459,8 +459,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) -./install_cuda_host_injections.sh 11.3.1 echo ">> Installing $cuda_ec under host_injections..." +./install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" From 0e99db5127e2735828d22a415aeeb194216f1378 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 14:48:52 +0100 Subject: [PATCH 15/31] Give a full path to the CUDA host injections script --- EESSI-pilot-install-software.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index a9f6576866..c715dd55ad 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -460,7 +460,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -./install_cuda_host_injections.sh 11.3.1 +$(dirname "$BASH_SOURCE")/install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? 
"${ok_msg}" "${fail_msg}" From cd11792609b4a637f324748089da61f1456827f5 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 15:29:02 +0100 Subject: [PATCH 16/31] Add checks for some whitelist entries for CUDA --- eb_hooks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/eb_hooks.py b/eb_hooks.py index 4d31a5f4b4..c7358d5f13 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -232,13 +232,18 @@ def cuda_postpackage(self, *args, **kwargs): elif copy: tmp_buffer.append(line) # create whitelist without file extensions, they're not really needed and they only complicate things - whitelist = ['eula'] + whitelist = ['EULA'] file_extensions = [".so", ".a", ".h", ".bc"] for tmp in tmp_buffer: for word in tmp.split(): if any(ext in word for ext in file_extensions): whitelist.append(word.split(".")[0]) whitelist = list(set(whitelist)) + # Do some quick checks for things we should or shouldn't have in the list + if "nvcc" in whitelist: + raise EasyBuildError("Found 'nvcc' in whitelist: %s" % whitelist) + if "libcudart" not in whitelist: + raise EasyBuildError("Did not find 'libcudart' in whitelist: %s" % whitelist) # iterate over all files in the CUDA path for root, dirs, files in os.walk(self.installdir): for filename in files: From f514f8155fd6f02e894fc48e4a7a83d046b08222 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 1 Mar 2023 15:56:24 +0100 Subject: [PATCH 17/31] Fix failing eb installation --- EESSI-pilot-install-software.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index c715dd55ad..f97894df74 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -432,10 +432,10 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support # Need recent version of EasyBuild -echo ">> Installing EasyBuild 4.5.1..." +echo ">> Installing EasyBuild 4.7.0..." 
ok_msg="EasyBuild v4.7.0 installed" fail_msg="EasyBuild v4.7.0 failed to install" -$EB --from-pr 17065 --include-easyblocks-from-pr 2893 +$EB --from-pr 17065 --include-easyblocks-from-pr 2893 --try-amend=use_pip=1 check_exit_code $? "${ok_msg}" "${fail_msg}" LMOD_IGNORE_CACHE=1 module swap EasyBuild/4.7.0 From be326a1e5842460035ad6dffc580c73395a4c8b2 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 1 Mar 2023 21:02:01 +0100 Subject: [PATCH 18/31] Make sure we check space in the right places --- install_cuda_host_injections.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh index b6f83b4e9d..0684c7ca80 100755 --- a/install_cuda_host_injections.sh +++ b/install_cuda_host_injections.sh @@ -38,7 +38,6 @@ else fi fi - required_space_in_tmpdir=50000 # Let's see if we have sources and build locations defined if not, we use the temporary space if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then @@ -52,11 +51,11 @@ else # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), # need to do a space check before we proceed - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < 5000000 )); then fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." fi - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." 
From 103f5fa34ea9c4cc740088d25a6d4ce6b6ca78bb Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:38:12 +0100 Subject: [PATCH 19/31] Simply wrap `mkdir -p` for better error reporting --- EESSI-pilot-install-software.sh | 4 +- install_cuda_host_injections.sh | 87 --------------------------------- scripts/utils.sh | 82 ++++++------------------------- 3 files changed, 19 insertions(+), 154 deletions(-) delete mode 100755 install_cuda_host_injections.sh diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index f97894df74..d648162c31 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -431,6 +431,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # CUDA support +cuda_version="11.3.1" + # Need recent version of EasyBuild echo ">> Installing EasyBuild 4.7.0..." ok_msg="EasyBuild v4.7.0 installed" @@ -460,7 +462,7 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/install_cuda_host_injections.sh 11.3.1 +$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh 11.3.1 ok_msg="$cuda_ec (re)installed under host_injections!" fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." check_exit_code $? 
"${ok_msg}" "${fail_msg}" diff --git a/install_cuda_host_injections.sh b/install_cuda_host_injections.sh deleted file mode 100755 index b6f83b4e9d..0000000000 --- a/install_cuda_host_injections.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -# Initialise our bash functions -source scripts/utils.sh - -if [[ $# -eq 0 ]] ; then - fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" -fi -install_cuda_version=$1 -if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" -else - # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` - # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) - cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} -fi - -# Only install CUDA if specified version is not found. -# This is only relevant for users, the shipped CUDA installation will -# always be in versions instead of host_injections and have symlinks pointing -# to host_injections for everything we're not allowed to ship -# (existence of easybuild subdir implies a successful install) -if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again, proceed with testing." -else - # We need to be able write to the installation space so let's make sure we can - if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then - fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" - fi - - # we need a directory we can use for temporary storage - if [[ -z "${CUDA_TEMP_DIR}" ]]; then - tmpdir=$(mktemp -d) - else - tmpdir="${CUDA_TEMP_DIR}"/temp - if ! 
mkdir "$tmpdir" ; then - fatal_error "Could not create directory ${tmpdir}" - fi - fi - - - required_space_in_tmpdir=50000 - # Let's see if we have sources and build locations defined if not, we use the temporary space - if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then - export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) - fi - if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then - export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) - fi - - # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), - # need to do a space check before we proceed - avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < 5000000 )); then - fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." - fi - avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') - if (( avail_space < required_space_in_tmpdir )); then - error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." - fatal_error "${error1}${error2}" - fi - - if [[ -z "${EBROOTEASYBUILD}" ]]; then - echo_yellow "Loading EasyBuild module to do actual install" - module load EasyBuild - fi - - # we need the --rebuild option and a (random) dir for the module if the module - # file is shipped with EESSI - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then - extra_args="--rebuild --installpath-modules=${tmpdir}" - fi - # We don't want hooks used in this install, we need a vanilla CUDA installation - touch "$tmpdir"/none.py - eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb - ret=$? 
- if [ $ret -ne 0 ]; then - fatal_error "CUDA installation failed, please check EasyBuild logs..." - else - echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" - fi - # clean up tmpdir - rm -rf "${tmpdir}" -fi diff --git a/scripts/utils.sh b/scripts/utils.sh index 035a914445..c1342c21fc 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1 function fatal_error() { echo_red "ERROR: $1" >&2 if [[ $# -gt 1 ]]; then - exit $2 + exit "$2" else exit "${ANY_ERROR_EXITCODE}" fi @@ -32,79 +32,29 @@ function check_exit_code { fi } -# Reimplement 'mkdir -p' with reporting on where permissions break down function create_directory_structure() { # Ensure we are given a single path argument if [ $# -ne 1 ]; then - echo "Function requires a single (relative or absolute) path argument" >&2 - return 1 + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE fi - - # set a persistent variable that knows the full structure - # (i.e., retains the value upon recursive calls) - full_structure="${full_structure:="$1"}" - - local directory_structure="$1" - - # Check if directory exists and is writeable - if [ -d "${directory_structure}" ]; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - if [ -w "${directory_structure}" ]; then - # Nothing to be done - return 0 - else - echo "Directory ${directory_structure} exists but is not writeable" >&2 - return 1 - fi - fi - - local directory_structure_parent=$(dirname "${directory_structure}") - - # If the parent doesn't exist we need to create it - if [ ! -d "${directory_structure_parent}" ]; then - # Create the parent via a recursive call to this function - # (if this doesn't succeed we need to return the error code) - if ! 
create_directory_structure "${directory_structure_parent}"; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - return 1 - fi - fi - - # Check the parent is writeable, and create the new subdir - if [ -w "${directory_structure_parent}" ]; then - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our (unneeded) global variable - unset full_structure - fi - if ! mkdir "${directory_structure}"; then - echo "'mkdir ${directory_structure}' failed for an unknown reason!" >&2 - return 1 - else - # Success! - return 0 - fi - else - echo "Attempt to create ${full_structure} failed," \ - "${directory_structure_parent} exists but you don't have write permissions." >&2 - if [ "${directory_structure}" = "${full_structure}" ]; then - # release our global variable - unset full_structure - fi - return 1 + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p 2>&1) + return_code=$? + # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + echo_red "Creating ${dir_structure} failed with\n${error_message}" >&2 fi + return $return_code } function get_path_for_tool { tool_name=$1 tool_envvar_name=$2 - which_out=$(which ${tool_name} 2>&1) + which_out=$(which "${tool_name}" 2>&1) exit_code=$? 
if [[ ${exit_code} -eq 0 ]]; then echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 @@ -136,7 +86,7 @@ function get_host_from_url { url=$1 re="(http|https)://([^/:]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -148,7 +98,7 @@ function get_port_from_url { url=$1 re="(http|https)://[^:]+:([0-9]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -158,7 +108,7 @@ function get_port_from_url { function get_ipv4_address { hname=$1 - hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) # TODO try other methods if the one above does not work --> tool that verifies # what method can be used? echo "${hipv4}" From 793ba29d5bb1359a6ef5d631371cdf445b1147a5 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:42:57 +0100 Subject: [PATCH 20/31] Simply wrap `mkdir -p` for better error reporting --- .../install_cuda_host_injections.sh | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100755 gpu_support/cuda_utils/install_cuda_host_injections.sh diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh new file mode 100755 index 0000000000..0684c7ca80 --- /dev/null +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Initialise our bash functions +source scripts/utils.sh + +if [[ $# -eq 0 ]] ; then + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" +fi +install_cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" +else + # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need 
to worry too much about the EasyBuild setup) + cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} +fi + +# Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again, proceed with testing." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." 
+ fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." + fatal_error "${error1}${error2}" + fi + + if [[ -z "${EBROOTEASYBUILD}" ]]; then + echo_yellow "Loading EasyBuild module to do actual install" + module load EasyBuild + fi + + # we need the --rebuild option and a (random) dir for the module if the module + # file is shipped with EESSI + if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + extra_args="--rebuild --installpath-modules=${tmpdir}" + fi + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + ret=$? + if [ $ret -ne 0 ]; then + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi From c0a12470de4ba2c4523f084edd8b516d6fb62235 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:46:57 +0100 Subject: [PATCH 21/31] Make CUDA version a variable --- EESSI-pilot-install-software.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index d648162c31..f535634631 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -452,7 +452,7 @@ $EB $p7zip_ec --robot check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA (uses eb_hooks.py to only install runtime) -cuda_ec="CUDA-11.3.1.eb" +cuda_ec="CUDA-${cuda_version}.eb" echo ">> Installing $cuda_ec..." 
ok_msg="$cuda_ec installed, off to a good (?) start!" fail_msg="Failed to install $cuda_ec, woopsie..." @@ -462,9 +462,9 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh 11.3.1 -ok_msg="$cuda_ec (re)installed under host_injections!" -fail_msg="Failed to install $cuda_ec under host_injections, woopsie..." +$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} +ok_msg="CUDA $cuda_version (re)installed under host_injections!" +fail_msg="Failed to install CUDA $cuda_version under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" # install CUDA samples (requires EESSI support for CUDA) From 5e82923f01ff3089074e0760a1f64cfa5d3a5b2d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 09:51:37 +0100 Subject: [PATCH 22/31] Use TOPDIR, be more descriptive --- EESSI-pilot-install-software.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EESSI-pilot-install-software.sh b/EESSI-pilot-install-software.sh index f535634631..035f851d61 100755 --- a/EESSI-pilot-install-software.sh +++ b/EESSI-pilot-install-software.sh @@ -461,8 +461,8 @@ check_exit_code $? "${ok_msg}" "${fail_msg}" # Add the host_injections CUDA so we can actually build CUDA apps # (which unbreaks the symlinks from the runtime installation) -echo ">> Installing $cuda_ec under host_injections..." -$(dirname "$BASH_SOURCE")/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} +echo ">> Re-installing CUDA $cuda_version under host_injections (to un-break symlinks in EESSI installation)..." +"${TOPDIR}"/gpu_support/cuda_utils/install_cuda_host_injections.sh ${cuda_version} ok_msg="CUDA $cuda_version (re)installed under host_injections!" 
fail_msg="Failed to install CUDA $cuda_version under host_injections, woopsie..." check_exit_code $? "${ok_msg}" "${fail_msg}" From 8384b25a9658944fa57de03da7dcac84ae5b0a8e Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 10:06:45 +0100 Subject: [PATCH 23/31] Add missing argument --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index c1342c21fc..ef9e4095c2 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -41,7 +41,7 @@ function create_directory_structure() { dir_structure="$1" # Attempt to create the directory structure - error_message=$(mkdir -p 2>&1) + error_message=$(mkdir -p "$dir_structure" 2>&1) return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then From 98fe2a747fef928e123288fba157433322473dac Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:49:48 +0100 Subject: [PATCH 24/31] Improve error messages in new bash function --- scripts/utils.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index ef9e4095c2..b9aad997e1 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,8 +45,18 @@ function create_directory_structure() { return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - echo_red "Creating ${dir_structure} failed with\n${error_message}" >&2 + real_dir=$(readlink -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! -w "${dir_structure}" ]; then + real_dir=$(readlink -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" 
+ return $ANY_ERROR_EXITCODE + fi fi + return $return_code } From bbe7df210da6594ba94a27cf6ce072c7042cb24b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:53:09 +0100 Subject: [PATCH 25/31] Stick with return_code --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b9aad997e1..b052e0a1ec 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -53,7 +53,7 @@ function create_directory_structure() { if [ ! -w "${dir_structure}" ]; then real_dir=$(readlink -m "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" - return $ANY_ERROR_EXITCODE + return_code=$ANY_ERROR_EXITCODE fi fi From 95dc24509434d38e353f21a54413d771ebfdc401 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:56:02 +0100 Subject: [PATCH 26/31] Use realpath to be consistent with other scripts --- scripts/utils.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b052e0a1ec..099c1712ef 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,13 +45,13 @@ function create_directory_structure() { return_code=$? # If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - real_dir=$(readlink -m "$dir_structure") + real_dir=$(realpath -s "$dir_structure") echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 else # If we're creating it, our use case is that we want to be able to write there # (this is a check in case the directory already existed) if [ ! -w "${dir_structure}" ]; then - real_dir=$(readlink -m "$dir_structure") + real_dir=$(realpath -s "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" 
return_code=$ANY_ERROR_EXITCODE fi From a1270f2359d64f9ee19f7ac10645e90d71ee2437 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:58:22 +0100 Subject: [PATCH 27/31] Wrong realpath flag --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index 099c1712ef..b5aa430985 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -51,7 +51,7 @@ function create_directory_structure() { # If we're creating it, our use case is that we want to be able to write there # (this is a check in case the directory already existed) if [ ! -w "${dir_structure}" ]; then - real_dir=$(realpath -s "$dir_structure") + real_dir=$(realpath -m "$dir_structure") echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" return_code=$ANY_ERROR_EXITCODE fi From aba486de42f250b7ada1d7459188ba579337540b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 11:59:06 +0100 Subject: [PATCH 28/31] Wrong realpath flag --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index b5aa430985..f043ba0ca6 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -45,7 +45,7 @@ function create_directory_structure() { return_code=$? 
# If it fails be explicit about the error if [ ${return_code} -ne 0 ]; then - real_dir=$(realpath -s "$dir_structure") + real_dir=$(realpath -m "$dir_structure") echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 else # If we're creating it, our use case is that we want to be able to write there From d2d1fc3b4a6b4e5372f1218b3da86f621f09f674 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:06:58 +0100 Subject: [PATCH 29/31] Fix typo --- .../cuda_utils/install_cuda_host_injections.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 0684c7ca80..4264f06b83 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -42,11 +42,11 @@ else # Let's see if we have sources and build locations defined if not, we use the temporary space if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then export EASYBUILD_BUILDPATH=${tmpdir}/build - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) fi if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then export EASYBUILD_SOURCEPATH=${tmpdir}/sources - required_space_in_tmpdir=$((required_space_in_tempdir + 5000000)) + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) fi # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), @@ -57,9 +57,10 @@ else fi avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then - error1="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error2="You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH to reduce this requirement. Exiting now..." 
- fatal_error "${error1}${error2}" + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" fi if [[ -z "${EBROOTEASYBUILD}" ]]; then @@ -69,11 +70,12 @@ else # we need the --rebuild option and a (random) dir for the module if the module # file is shipped with EESSI - if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then + if [ -f "${EESSI_SOFTWARE_PATH}"/modules/all/CUDA/"${install_cuda_version}".lua ]; then extra_args="--rebuild --installpath-modules=${tmpdir}" fi # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? if [ $ret -ne 0 ]; then From 562e94b7beba8e9e4fc71c84a6ad5b154ad9283f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:11:49 +0100 Subject: [PATCH 30/31] Always add the rebuild option if we get to the point where we actually install CUDA --- gpu_support/cuda_utils/install_cuda_host_injections.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 4264f06b83..0d68759c1e 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -68,11 +68,10 @@ else module load EasyBuild fi - # we need the --rebuild option and a (random) dir for the module if the module - # file is shipped with EESSI - if [ -f "${EESSI_SOFTWARE_PATH}"/modules/all/CUDA/"${install_cuda_version}".lua ]; then - extra_args="--rebuild --installpath-modules=${tmpdir}" - fi + # we need the --rebuild option and a 
(random) dir for the module since we are + # fixing the broken links of the EESSI-shipped installation + extra_args="--rebuild --installpath-modules=${tmpdir}" + # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py # shellcheck disable=SC2086 # Intended splitting of extra_args From b4ae5f027f77fd38cf215b0fdf04b8136760d414 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 3 Mar 2023 12:42:31 +0100 Subject: [PATCH 31/31] Expose CUDA_TEMP_DIR --- gpu_support/cuda_utils/install_cuda_host_injections.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gpu_support/cuda_utils/install_cuda_host_injections.sh b/gpu_support/cuda_utils/install_cuda_host_injections.sh index 0d68759c1e..1ddccf4e82 100755 --- a/gpu_support/cuda_utils/install_cuda_host_injections.sh +++ b/gpu_support/cuda_utils/install_cuda_host_injections.sh @@ -58,7 +58,8 @@ else avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') if (( avail_space < required_space_in_tmpdir )); then error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" - error="${error}You can set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check. " + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " error="${error}to reduce this requirement. Exiting now..." fatal_error "${error}" fi @@ -75,7 +76,7 @@ else # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py # shellcheck disable=SC2086 # Intended splitting of extra_args - eb ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb ret=$? 
if [ $ret -ne 0 ]; then fatal_error "CUDA installation failed, please check EasyBuild logs..."