From 5e479ae5074c4a70ecb7fcdf5687e1345e5e1654 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 23 Apr 2025 10:53:54 +0200 Subject: [PATCH 1/3] {2023.06}[2023a,a64fx] add remaining apps originally built with EB 4.9.0 --- .../a64fx/eessi-2023.06-eb-4.9.4-2023a.yml | 110 ++++++++++++++++-- 1 file changed, 103 insertions(+), 7 deletions(-) diff --git a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml index 2859d4dc55..6c9d2a0c52 100644 --- a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml @@ -14,14 +14,15 @@ easyconfigs: # originally built with EB 4.8.2; PR 19270 included since EB 4.9.0 # - pybind11-2.11.1-GCCcore-12.3.0.eb: # # avoid indirect dependency on old CMake version built with GCCcore/10.2.0 via Catch2 build dependency; -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19270 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19270 # from-pr: 19270 - pybind11-2.11.1-GCCcore-12.3.0.eb # the package SciPy-bundle itself has to be rebuilt; here we use the commit to add the dependency # Cython; PR 21693 is included since EB 5.0.0 # - SciPy-bundle-2023.07-gfbf-2023a.eb: # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/21693 # from-pr: 21693 - SciPy-bundle-2023.07-gfbf-2023a.eb: options: @@ -48,14 +49,15 @@ easyconfigs: # originally built with EB 4.8.2; PR 19268 included since EB 4.9.0 # - TensorFlow-2.13.0-foss-2023a.eb: # # patch setup.py for grpcio extension in TensorFlow 2.13.0 easyconfigs to take into account alternate sysroot; -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19268 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19268 # from-pr: 19268 - TensorFlow-2.13.0-foss-2023a.eb - X11-20230603-GCCcore-12.3.0.eb # 
originally built with EB 4.8.2; PR 19339 included since EB 4.9.0 # - HarfBuzz-5.3.1-GCCcore-12.3.0.eb: # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19339 # from-pr: 19339 - HarfBuzz-5.3.1-GCCcore-12.3.0.eb - Qt5-5.15.10-GCCcore-12.3.0.eb @@ -63,16 +65,19 @@ easyconfigs: # originally built with EB 4.8.2; PR 19363 included since EB 4.9.0 # - LHAPDF-6.5.4-GCC-12.3.0.eb: # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19363 # from-pr: 19363 - LHAPDF-6.5.4-GCC-12.3.0.eb # originally built with EB 4.8.2; PR 19397 included since EB 4.9.0 # - LoopTools-2.15-GCC-12.3.0.eb: # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19397 # from-pr: 19397 - LoopTools-2.15-GCC-12.3.0.eb # originally built with EB 4.8.2; PR 19185 included since EB 4.9.0 # - R-4.3.2-gfbf-2023a.eb: # options: +# # https://github.com/easybuilders/easybuild-easyconfigs/pull/19185 # from-pr: 19185 - R-4.3.2-gfbf-2023a.eb # originally built with EB 4.8.2; source URL has changed recently @@ -87,16 +92,19 @@ easyconfigs: # originally built with EB 4.8.2; PR 19455 included since EB 4.9.0 # - ALL-0.9.2-foss-2023a.eb: # options: +# # https://github.com/easybuilders/easybuild-easyconfigs/pull/19455 # from-pr: 19455 - ALL-0.9.2-foss-2023a.eb # originally built with EB 4.8.2; PR 19735 included since EB 4.9.1 # - CDO-2.2.2-gompi-2023a.eb: # options: +# # https://github.com/easybuilders/easybuild-easyconfigs/pull/19735 # from-pr: 19735 - CDO-2.2.2-gompi-2023a.eb # originally built with EB 4.8.2; PR 19820 included since EB 4.9.1 # - BWA-0.7.17-20220923-GCCcore-12.3.0.eb: # options: +# # https://github.com/easybuilders/easybuild-easyconfigs/pull/19820 # from-pr: 19820 - BWA-0.7.17-20220923-GCCcore-12.3.0.eb # from here on apps were originally built with EB 4.9.0 @@ -113,14 +121,14 @@ easyconfigs: from-commit: e610fe1ac5393d1de668a466fdaaea74c580ee03 # PR 19592 was included since EB 4.9.1 # - ESPResSo-4.2.1-foss-2023a.eb: -# # 
see https://github.com/easybuilders/easybuild-easyconfigs/pull/19592 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19592 # from-pr: 19592 - ESPResSo-4.2.1-foss-2023a.eb # PR 19679 was included since EB 4.9.1 # - Rivet-3.1.9-gompi-2023a-HepMC3-3.2.6.eb: -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19679 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19679 # from-pr: 19679 - Rivet-3.1.9-gompi-2023a-HepMC3-3.2.6.eb - Pillow-10.0.0-GCCcore-12.3.0.eb @@ -150,15 +158,103 @@ easyconfigs: from-commit: 0437ff1ad34283398f55d4a6e01e6540b1ae9688 # PR 19646 was included since EB 4.9.1 # - snakemake-8.4.2-foss-2023a.eb: -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19646 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19646 # from-pr: 19646 - snakemake-8.4.2-foss-2023a.eb # PRs 19471 and 3036 were included since EB 4.9.1 # - LAMMPS-2Aug2023_update2-foss-2023a-kokkos.eb: -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19471 -# # see https://github.com/easybuilders/easybuild-easyblocks/pull/3036 # options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19471 # from-pr: 19471 +# # see https://github.com/easybuilders/easybuild-easyblocks/pull/3036 # include-easyblocks-from-pr: 3036 - LAMMPS-2Aug2023_update2-foss-2023a-kokkos.eb +# PR 19573 was included since EB 4.9.1 +# - PyTorch-2.1.2-foss-2023a.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19573 +# from-pr: 19573 + - PyTorch-2.1.2-foss-2023a.eb + - matplotlib-3.7.2-gfbf-2023a.eb +# PR 19554 was included since EB 4.9.1 +# - PyQt5-5.15.10-GCCcore-12.3.0.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19554 +# from-pr: 19554 + - PyQt5-5.15.10-GCCcore-12.3.0.eb +# PR 19996 was included since EB 4.9.1 +# - Pillow-SIMD-9.5.0-GCCcore-12.3.0.eb: +# options: +# # see 
https://github.com/easybuilders/easybuild-easyconfigs/pull/19996 +# from-pr: 19996 + - Pillow-SIMD-9.5.0-GCCcore-12.3.0.eb + - dask-2023.9.2-foss-2023a.eb + - JupyterNotebook-7.0.2-GCCcore-12.3.0.eb +# PR 20086 was included since EB 4.9.1 +# - ImageMagick-7.1.1-15-GCCcore-12.3.0.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20086 +# from-pr: 20086 + - ImageMagick-7.1.1-15-GCCcore-12.3.0.eb +# PR 20050 was included since EB 4.9.1 +# - Z3-4.12.2-GCCcore-12.3.0.eb: +# options: +# # The Z3 dependency of PyTorch had its versionsuffix removed +# # and we need to work around the problem this creates, +# # see https://github.com/EESSI/software-layer/pull/501 for details +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20050 +# from-pr: 20050 + - Z3-4.12.2-GCCcore-12.3.0.eb +# PR 20007 was included since EB 4.9.1 +# - PyOpenGL-3.1.7-GCCcore-12.3.0.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20007 +# from-pr: 20007 + - PyOpenGL-3.1.7-GCCcore-12.3.0.eb + - OpenJPEG-2.5.0-GCCcore-12.3.0.eb + - Highway-1.0.4-GCCcore-12.3.0.eb + - ELPA-2023.05.001-foss-2023a.eb +# libxc was handled as follows when building NVIDIA Grace stack +# - libxc-6.2.2-GCC-12.3.0.eb + - libxc-6.2.2-GCC-12.3.0.eb: + options: + # possible change in GitLab's tarball packaging, affected by .gitattributes + # https://github.com/easybuilders/easybuild-easyconfigs/pull/22580 + from-commit: 456d64bbeacf465e8f7e7ff378864e26352d045d +# ParMETIS was handled as follows when building NVIDIA Grace stack +# (it wasn't listed separately when building stack for Sapphire Rapids) + - ParMETIS-4.0.3-gompi-2023a.eb: + options: + # source URLs for ParMETIS-4.0.3 have changed, corresponding PR is + # https://github.com/easybuilders/easybuild-easyconfigs/pull/22579 + # ParMETIS-4.0.3 is a dependency of SuperLU_DIST-8.1.2 + from-commit: 977e5208a720f23ace41b83b84da8b717d0aeada +# PR 20162 was included since EB 4.9.1 +# - 
SuperLU_DIST-8.1.2-foss-2023a.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20162 +# from-pr: 20162 + - SuperLU_DIST-8.1.2-foss-2023a.eb +# PRs 19686 and 3086 were included since EB 4.9.1 +# - PETSc-3.20.3-foss-2023a.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyblocks/pull/3086 +# include-easyblocks-from-pr: 3086 +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19686 +# from-pr: 19686 + - PETSc-3.20.3-foss-2023a.eb +# PR 20142 was included since EB 4.9.1 +# - MODFLOW-6.4.4-foss-2023a.eb: +# options: +# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/20142 +# from-pr: 20142 + - MODFLOW-6.4.4-foss-2023a.eb + # add all dependencies of R-bundle-CRAN-2023.12-foss-2023a.eb; due to a rebuild we will install the bundle itself with EB 4.9.4 + - NLopt-2.7.1-GCCcore-12.3.0.eb + - nettle-3.9.1-GCCcore-12.3.0.eb + - Xvfb-21.1.8-GCCcore-12.3.0.eb + - libsndfile-1.2.2-GCCcore-12.3.0.eb + - PostgreSQL-16.1-GCCcore-12.3.0.eb + - ImageMagick-7.1.1-15-GCCcore-12.3.0.eb + - GDAL-3.7.1-foss-2023a.eb From 077eb43600cd7f3842ac459a392aae2a87ac639a Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 23 Apr 2025 19:27:58 +0200 Subject: [PATCH 2/3] skip building PyTorch in this PR --- .../2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml index 6c9d2a0c52..82548a5856 100644 --- a/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml +++ b/easystacks/software.eessi.io/2023.06/a64fx/eessi-2023.06-eb-4.9.4-2023a.yml @@ -170,12 +170,12 @@ easyconfigs: # # see https://github.com/easybuilders/easybuild-easyblocks/pull/3036 # include-easyblocks-from-pr: 3036 - LAMMPS-2Aug2023_update2-foss-2023a-kokkos.eb -# PR 19573 was included since EB 4.9.1 -# - 
PyTorch-2.1.2-foss-2023a.eb: -# options: -# # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19573 -# from-pr: 19573 - - PyTorch-2.1.2-foss-2023a.eb +## PR 19573 was included since EB 4.9.1 +## - PyTorch-2.1.2-foss-2023a.eb: +## options: +## # see https://github.com/easybuilders/easybuild-easyconfigs/pull/19573 +## from-pr: 19573 +# - PyTorch-2.1.2-foss-2023a.eb - matplotlib-3.7.2-gfbf-2023a.eb # PR 19554 was included since EB 4.9.1 # - PyQt5-5.15.10-GCCcore-12.3.0.eb: From 16408d5966d7229dd31fffb29a6ced9c493d6509 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 24 Apr 2025 13:35:46 +0200 Subject: [PATCH 3/3] refactor code to limit parallelism --- eb_hooks.py | 82 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 18 deletions(-) diff --git a/eb_hooks.py b/eb_hooks.py index 5f5405c173..b45cce0a30 100644 --- a/eb_hooks.py +++ b/eb_hooks.py @@ -121,27 +121,38 @@ def parse_hook(ec, *args, **kwargs): def post_ready_hook(self, *args, **kwargs): """ - Post-ready hook: limit parallellism for selected builds, because they require a lot of memory per used core. + Post-ready hook: limit parallelism for selected builds based on software name and CPU target. + Parallelism needs to be limited because some builds require a lot of memory per used core. """ # 'parallel' easyconfig parameter is set via EasyBlock.set_parallel in ready step based on available cores. 
- # here we reduce parallellism to only use half of that for selected software, - # to avoid failing builds/tests due to out-of-memory problems; - memory_hungry_build = self.name in ['libxc', 'MBX', 'TensorFlow'] - # on A64FX systems, (HBM) memory is typically scarce, so we need to use fewer cores for some builds + # get current parallelism setting + parallel = self.cfg['parallel'] + if parallel == 1: + return # no need to limit if already using 1 core + + # get CPU target cpu_target = get_eessi_envvar('EESSI_SOFTWARE_SUBDIR') - memory_hungry_build_a64fx = cpu_target == CPU_TARGET_A64FX and self.name in ['Qt5', 'ROOT'] - if memory_hungry_build or memory_hungry_build_a64fx: - parallel = self.cfg['parallel'] - if cpu_target == CPU_TARGET_A64FX and self.name in ['TensorFlow']: - # limit parallelism to 8, builds with 12 and 16 failed on Deucalion - if parallel > 8: - self.cfg['parallel'] = 8 - msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" - print_msg(msg % (self.cfg['parallel'], parallel, self.name, cpu_target), log=self.log) - elif parallel > 1: - self.cfg['parallel'] = parallel // 2 - msg = "limiting parallelism to %s (was %s) for %s to avoid out-of-memory failures during building/testing" - print_msg(msg % (self.cfg['parallel'], parallel, self.name), log=self.log) + + # check if we have limits defined for this software + if self.name in PARALLELISM_LIMITS: + limits = PARALLELISM_LIMITS[self.name] + + # first check for CPU-specific limit + if cpu_target in limits: + operation_func, operation_args = limits[cpu_target] + new_parallel = operation_func(parallel, operation_args) + # then check for generic limit (applies to all CPU targets) + elif '*' in limits: + operation_func, operation_args = limits['*'] + new_parallel = operation_func(parallel, operation_args) + else: + return # no applicable limits found + + # apply the limit if it's different from current + if new_parallel != parallel: + 
self.cfg['parallel'] = new_parallel + msg = "limiting parallelism to %s (was %s) for %s on %s to avoid out-of-memory failures during building/testing" + print_msg(msg % (new_parallel, parallel, self.name, cpu_target), log=self.log) def pre_prepare_hook(self, *args, **kwargs): @@ -1249,3 +1260,38 @@ def post_module_hook(self, *args, **kwargs): } POST_MODULE_HOOKS = {} + +# Define parallelism limit operations +def divide_by_factor(parallel, factor): + """Divide parallelism by the given factor (integer division, never going below 1)""" + return max(1, parallel // factor) + +def set_maximum(parallel, max_value): + """Cap parallelism at the given maximum value""" + return min(parallel, max_value) + +# Data structure defining parallelism limits for different software and CPU targets +# Format: {software_name: {cpu_target: (operation_function, operation_args)}} +# '*' for a CPU target is the fallback that applies to all CPU targets without a specific entry +# Information is processed in the post_ready_hook function. First it checks if the +# specific CPU target is defined in the data structure below. If not, it checks for +# the generic '*' entry. +PARALLELISM_LIMITS = { + 'libxc': { + '*': (divide_by_factor, 2), + CPU_TARGET_A64FX: (set_maximum, 12), + }, + 'MBX': { + '*': (divide_by_factor, 2), + }, + 'TensorFlow': { + '*': (divide_by_factor, 2), + CPU_TARGET_A64FX: (set_maximum, 8), + }, + 'Qt5': { + CPU_TARGET_A64FX: (divide_by_factor, 2), + }, + 'ROOT': { + CPU_TARGET_A64FX: (divide_by_factor, 2), + }, +}