From 4d85cb513327065b9e8a6ab3f8c490c08915bb9b Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 27 May 2021 15:25:47 +0200 Subject: [PATCH 01/20] Port strided bandwidth benchmark to the lib --- .../cpu/strided_bandwidth/strides.py | 210 ++++++++---------- .../cpu/strided_bandwidth/__init__.py | 74 ++++++ .../cpu/strided_bandwidth/src/strides.cpp | 0 3 files changed, 164 insertions(+), 120 deletions(-) create mode 100644 hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp (100%) diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index 22991adccd..5be27582b0 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -6,28 +6,34 @@ import reframe as rfm import reframe.utility.sanity as sn +from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth -class StridedBase(rfm.RegressionTest): - def __init__(self): - self.sourcepath = 'strides.cpp' - self.build_system = 'SingleSource' - self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', - 'eiger:mc', 'pilatus:mc'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.sanity_patterns = sn.assert_eq( - sn.count(sn.findall(r'bandwidth', self.stdout)), - self.num_tasks_assigned) - - self.perf_patterns = { - 'bandwidth': sn.extractsingle( - r'bandwidth: (?P<bw>\S+) GB/s', - self.stdout, 'bw', float) - } - - self.system_num_cpus = { +@rfm.simple_test +class strided_bandwidth_check(StridedBandwidth): + '''Strided bandwidth check. + + This test is parameterized with the ``stride`` parameter, covering the + following scenarios: 8-byte stride using the full cache line, 64-byte + stride using 1/8 of the cacheline, and 128-byte using 1/8 of every 2nd + cacheline.
+ + This test requires the ``num_cpus`` variable, which is set in a post-setup + hook. The data for each supported system is stored in ``system_num_cpus`` + + Since the performance references change with the ``stride`` parameter, the + references for each test instace are stored in the ``reference_per_stride`` + variable. The actual references are then set in a pre-performance hook. + ''' + + # Define the stride parameter + stride = parameter([1, 8, 16]) + + valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', + 'eiger:mc', 'pilatus:mc'] + valid_prog_environs = ['PrgEnv-gnu'] + system_num_cpus = variable( + dict, value={ 'daint:mc': 72, 'daint:gpu': 24, 'dom:mc': 72, @@ -35,110 +41,74 @@ def __init__(self): 'eiger:mc': 128, 'pilatus:mc': 128 } - - self.maintainers = ['SK'] - self.tags = {'benchmark', 'diagnostic'} - - @property - @sn.sanity_function - def num_tasks_assigned(self): - return self.job.num_tasks - - -@rfm.simple_test -class StridedBandwidthTest(StridedBase): - def __init__(self): - super().__init__() - - self.reference = { - 'dom:gpu': { - 'bandwidth': (50, -0.1, 0.1, 'GB/s') - }, - 'dom:mc': { - 'bandwidth': (100, -0.1, 0.1, 'GB/s') - }, - 'daint:gpu': { - 'bandwidth': (50, -0.1, 0.1, 'GB/s') - }, - 'daint:mc': { - 'bandwidth': (100, -0.1, 0.1, 'GB/s') - }, - 'eiger:mc': { - 'bandwidth': (270, -0.1, 0.1, 'GB/s') - }, - 'pilatus:mc': { - 'bandwidth': (270, -0.1, 0.1, 'GB/s') - } - } - - @rfm.run_before('run') - def set_exec_opts(self): - self.num_cpus = self.system_num_cpus[self.current_partition.fullname] - - # 8-byte stride, using the full cacheline - self.executable_opts = ['100000000', '1', f'{self.num_cpus}'] - - -@rfm.simple_test -class StridedBandwidthTest64(StridedBase): - def __init__(self): - super().__init__() - - self.reference = { - 'dom:gpu': { - 'bandwidth': (6, -0.1, 0.2, 'GB/s') - }, - 'dom:mc': { - 'bandwidth': (12.5, -0.1, 0.2, 'GB/s') + ) + reference_per_stride = variable( + dict, value={ + 1: { + 'dom:gpu': { + 
'bandwidth': (50, -0.1, 0.1, 'GB/s') + }, + 'dom:mc': { + 'bandwidth': (100, -0.1, 0.1, 'GB/s') + }, + 'daint:gpu': { + 'bandwidth': (50, -0.1, 0.1, 'GB/s') + }, + 'daint:mc': { + 'bandwidth': (100, -0.1, 0.1, 'GB/s') + }, + 'eiger:mc': { + 'bandwidth': (270, -0.1, 0.1, 'GB/s') + }, + 'pilatus:mc': { + 'bandwidth': (270, -0.1, 0.1, 'GB/s') + } }, - 'daint:gpu': { - 'bandwidth': (6, -0.05, 0.2, 'GB/s') + 8: { + 'dom:gpu': { + 'bandwidth': (6, -0.1, 0.2, 'GB/s') + }, + 'dom:mc': { + 'bandwidth': (12.5, -0.1, 0.2, 'GB/s') + }, + 'daint:gpu': { + 'bandwidth': (6, -0.05, 0.2, 'GB/s') + }, + 'daint:mc': { + 'bandwidth': (12.5, -0.1, 0.2, 'GB/s') + }, + 'eiger:mc': { + 'bandwidth': (33, -0.1, 0.2, 'GB/s') + }, + 'pilatus:mc': { + 'bandwidth': (33, -0.1, 0.2, 'GB/s') + } }, - 'daint:mc': { - 'bandwidth': (12.5, -0.1, 0.2, 'GB/s') - }, - 'eiger:mc': { - 'bandwidth': (33, -0.1, 0.2, 'GB/s') - }, - 'pilatus:mc': { - 'bandwidth': (33, -0.1, 0.2, 'GB/s') + 16: { + 'dom:gpu': { + 'bandwidth': (4.5, -0.1, 0.2, 'GB/s') + }, + 'dom:mc': { + 'bandwidth': (9.1, -0.1, 0.2, 'GB/s') + }, + 'daint:gpu': { + 'bandwidth': (4.5, -0.1, 0.2, 'GB/s') + }, + 'daint:mc': { + 'bandwidth': (9.1, -0.1, 0.2, 'GB/s') + }, + 'eiger:mc': { + 'bandwidth': (33, -0.1, 0.2, 'GB/s') + }, } } + ) + tags = {'benchmark', 'diagnostic'} - @rfm.run_before('run') - def set_exec_opts(self): - self.num_cpus = self.system_num_cpus[self.current_partition.fullname] - - # 64-byte stride, using 1/8 of the cacheline - self.executable_opts = ['100000000', '8', '%s' % self.num_cpus] - - -@rfm.simple_test -class StridedBandwidthTest128(StridedBase): - def __init__(self): - super().__init__() - - self.reference = { - 'dom:gpu': { - 'bandwidth': (4.5, -0.1, 0.2, 'GB/s') - }, - 'dom:mc': { - 'bandwidth': (9.1, -0.1, 0.2, 'GB/s') - }, - 'daint:gpu': { - 'bandwidth': (4.5, -0.1, 0.2, 'GB/s') - }, - 'daint:mc': { - 'bandwidth': (9.1, -0.1, 0.2, 'GB/s') - }, - 'eiger:mc': { - 'bandwidth': (33, -0.1, 0.2, 'GB/s') - }, - } - - 
@rfm.run_before('run') - def set_exec_opts(self): + @rfm.run_after('setup') + def set_num_cpus(self): self.num_cpus = self.system_num_cpus[self.current_partition.fullname] - # 128-byte stride, using 1/8 of every 2nd cacheline - self.executable_opts = ['100000000', '16', '%s' % self.num_cpus] + @rfm.run_before('performance') + def set_references(self): + self.reference = self.reference_per_stride[self.stride] diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py new file mode 100644 index 0000000000..64ffa378c0 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['StridedBandwidth'] + + +class StridedBandwidth(rfm.RegressionTest, pin_prefix=True): + '''Strided bandwith benchmark. + + The executable takes three required arguments. These are the buffer size + (in bytes), the stride (in multiples of 8 bytes) and the number of threads + to run this application with. + + The performance stage measures the bandiwdth in GB/s. + ''' + + #: Parameter that controls the stride access pattern. + #: This parameter must be opverridden by the derived class. + #: + #: :default: ``()`` + stride = parameter() + + #: Set the number of cpus per node. + #: + #: :default: ``required`` + num_cpus = variable(int) + + sourcepath = 'strides.cpp' + build_system = 'SingleSource' + num_tasks = 0 + num_tasks_per_node = 1 + reference = { + '*': { + 'bandwidth': (None, None, None, 'GB/s') + } + } + maintainers = ['SK'] + + @rfm.run_before('run') + def set_exec_opts(self): + '''Set the exec options. + + In order, these are the buffer size, stride and number of threads. 
See + the main docstring above for more info. + ''' + self.executable_opts = ['100000000', f'{self.stride}', f'{self.num_cpus}'] + + @rfm.run_before('sanity') + def set_sanity_patterns(self): + ''' Assert that the bandwidth is reported for all the tasks.''' + + self.sanity_patterns = sn.assert_eq( + sn.count(sn.findall(r'bandwidth:', self.stdout)), + self.job.num_tasks + ) + + @rfm.run_before('performance') + def set_perf_patterns(self): + '''Extract the min bandwidth as a performance metric.''' + + self.perf_patterns = { + 'bandwidth': sn.min( + sn.extractall( + r'bandwidth: (?P<bw>\S+) GB/s', + self.stdout, 'bw', float + ) + ) + } diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp rename to hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp From db483281a2fe531dca17034ace3ba9f90f9a222a Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Thu, 27 May 2021 17:08:12 +0200 Subject: [PATCH 02/20] Port strided bandidth check to a64fx --- config/cscs.py | 15 +++++++++- .../cpu/strided_bandwidth/strides.py | 29 +++++++++++++++---- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/config/cscs.py b/config/cscs.py index 6bb5778a33..c53e062ba0 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -39,7 +39,8 @@ ], 'environs': [ 'builtin', - 'PrgEnv-gnu' + 'PrgEnv-gnu', + 'PrgEnv-fujitsu' ], 'descr': 'Fujitsu A64FX CPUs', 'max_jobs': 100, @@ -815,6 +816,18 @@ 'cxx': 'mpicxx', 'ftn': 'mpif90' }, + { + 'name': 'PrgEnv-fujitsu', + 'target_systems': [ + 'ault' + ], + 'modules': [ + 'a64fxsdk' + ], + 'cc': 'mpifccpx', + 'cxx': 'mpiFCCpx', + 'ftn': 'mpifrtpx' + }, { 'name': 'builtin', 'target_systems': [ diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index 5be27582b0..3428cce169 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth @@ -30,8 +29,15 @@ class strided_bandwidth_check(StridedBandwidth): stride = parameter([1, 8, 16]) valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', - 'eiger:mc', 'pilatus:mc'] + 'eiger:mc', 'pilatus:mc', 'ault:a64fx'] valid_prog_environs = ['PrgEnv-gnu'] + + @rfm.run_after('init') + def set_valid_systems(self): + cp = self.current_system.name + if cp == 'ault': + self.valid_prog_environs = ['PrgEnv-fujitsu'] + system_num_cpus = variable( dict, value={ 'daint:mc': 72, @@ -39,7 +45,8 @@ class strided_bandwidth_check(StridedBandwidth): 'dom:mc': 72, 'dom:gpu': 24, 'eiger:mc': 128, - 'pilatus:mc': 128 + 'pilatus:mc': 128, + 'ault:a64fx': 48, } ) reference_per_stride = variable( @@ -62,7 
+69,10 @@ class strided_bandwidth_check(StridedBandwidth): }, 'pilatus:mc': { 'bandwidth': (270, -0.1, 0.1, 'GB/s') - } + }, + 'ault:a64fx': { + 'bandwidth': (50, -0.1, 0.1, 'GB/s') + }, }, 8: { 'dom:gpu': { @@ -82,7 +92,10 @@ class strided_bandwidth_check(StridedBandwidth): }, 'pilatus:mc': { 'bandwidth': (33, -0.1, 0.2, 'GB/s') - } + }, + 'ault:a64fx': { + 'bandwidth': (45, -0.1, 0.1, 'GB/s') + }, }, 16: { 'dom:gpu': { @@ -100,6 +113,12 @@ class strided_bandwidth_check(StridedBandwidth): 'eiger:mc': { 'bandwidth': (33, -0.1, 0.2, 'GB/s') }, + 'pilatus:mc': { + 'bandwidth': (33, -0.1, 0.2, 'GB/s') + }, + 'ault:a64fx': { + 'bandwidth': (25, -0.1, 0.1, 'GB/s') + }, } } ) From a1873475b97c0db1c7fa4229ac3ece4a4f42bbee Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Mon, 31 May 2021 13:42:53 +0200 Subject: [PATCH 03/20] Port stream benchmark to the hpctestlib --- .../microbenchmarks/cpu/stream/stream.py | 100 +++++++----------- .../microbenchmarks/cpu/stream/__init__.py | 61 +++++++++++ .../microbenchmarks/cpu/stream/src/stream.c | 0 3 files changed, 102 insertions(+), 59 deletions(-) create mode 100644 hpctestlib/microbenchmarks/cpu/stream/__init__.py rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/stream/src/stream.c (100%) diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py index eb2b609582..8180fe8738 100644 --- a/cscs-checks/microbenchmarks/cpu/stream/stream.py +++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py @@ -6,70 +6,40 @@ import reframe as rfm import reframe.utility.sanity as sn +from hpctestlib.microbenchmark.cpu.stream import Stream -@rfm.simple_test -class StreamTest(rfm.RegressionTest): - '''This test checks the stream test: - Function Best Rate MB/s Avg time Min time Max time - Triad: 13991.7 0.017174 0.017153 0.017192 - ''' - - def __init__(self): - self.descr = 'STREAM Benchmark' - self.exclusive_access = True - self.valid_systems = ['daint:gpu', 'daint:mc', 
'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn'] - self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu', - 'PrgEnv-intel', 'PrgEnv-pgi', - 'PrgEnv-cray_classic'] - - self.use_multithreading = False - self.prgenv_flags = { - 'PrgEnv-cray_classic': ['-homp', '-O3'], +@rfm.simple_test +class stream_check(Stream): + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn' + ] + valid_prog_environs = [ + 'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi' + ] + prgenv_flags = variable( + dict, value={ 'PrgEnv-cray': ['-fopenmp', '-O3'], 'PrgEnv-gnu': ['-fopenmp', '-O3'], 'PrgEnv-intel': ['-qopenmp', '-O3'], 'PrgEnv-pgi': ['-mp', '-O3'] } - - if self.current_system.name in ['arolla', 'tsa']: - self.exclusive_access = True - self.valid_prog_environs = ['PrgEnv-gnu'] - - self.sourcepath = 'stream.c' - self.build_system = 'SingleSource' - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.stream_cpus_per_task = { + ) + stream_cpus_per_task = variable( + dict, value={ 'arolla:cn': 16, 'arolla:pn': 16, 'daint:gpu': 12, 'daint:mc': 36, 'dom:gpu': 12, 'dom:mc': 36, - 'leone:normal': 16, - 'monch:compute': 20, 'tsa:cn': 16, 'tsa:pn': 16, } - self.variables = { - 'OMP_PLACES': 'threads', - 'OMP_PROC_BIND': 'spread' - } - self.sanity_patterns = sn.assert_found( - r'Solution Validates: avg error less than', self.stdout) - self.perf_patterns = { - 'triad': sn.extractsingle(r'Triad:\s+(?P<triad>\S+)\s+\S+', - self.stdout, 'triad', float) - } - self.stream_bw_reference = { - 'PrgEnv-cray_classic': { - 'daint:gpu': {'triad': (57000, -0.05, None, 'MB/s')}, - 'daint:mc': {'triad': (117000, -0.05, None, 'MB/s')}, - 'dom:gpu': {'triad': (57000, -0.05, None, 'MB/s')}, - 'dom:mc': {'triad': (117000, -0.05, None, 'MB/s')}, - }, + ) + stream_bw_reference = variable( + dict, value={ 'PrgEnv-cray': { 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')}, 'daint:mc': {'triad': (89000, -0.05, None, 'MB/s')}, @@ 
-95,21 +65,33 @@ def __init__(self): 'dom:mc': {'triad': (88500, -0.05, None, 'MB/s')}, } } - self.tags = {'production', 'craype'} - self.maintainers = ['RS', 'SK'] + ) + num_tasks = 1 + tags = {'production', 'craype'} + + @rfm.run_after('init') + def filter_valid_prog_environs(self): + if self.current_system.name in ['arolla', 'tsa']: + self.valid_prog_environs = ['PrgEnv-gnu'] - @rfm.run_after('setup') - def prepare_test(self): + @rfm.run_after('init') + def set_num_cpus_per_task(self): self.num_cpus_per_task = self.stream_cpus_per_task.get( - self.current_partition.fullname, 1) - self.variables['OMP_NUM_THREADS'] = str(self.num_cpus_per_task) - envname = self.current_environ.name + self.current_partition.fullname, required + ) + @rfm.run_before('compile') + def set_compiler_flags(self): + envname = self.current_environ.name self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3']) - if envname == 'PrgEnv-pgi': + + @rfm.run_before('run') + def set_env_vars(self): + if self.current_environ.name == 'PrgEnv-pgi': self.variables['OMP_PROC_BIND'] = 'true' - try: + @rfm.run_before('performance') + def set_perf_references(self): + envname = self.current_environ.name + if envname in self.stream_bw_reference: self.reference = self.stream_bw_reference[envname] - except KeyError: - self.reference = self.stream_bw_reference['PrgEnv-gnu'] diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py new file mode 100644 index 0000000000..a112e720df --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -0,0 +1,61 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# + # SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Stream'] + + +class Stream(rfm.RegressionTest, pin_prefix=True): + '''This test checks the stream test: + Function Best Rate MB/s Avg time Min time Max time + Triad: 13991.7 0.017174 0.017153 0.017192 + ''' + + descr = 'STREAM Benchmark' + exclusive_access = True + use_multithreading = False + sourcepath = 'stream.c' + build_system = 'SingleSource' + num_tasks_per_node = 1 + variables = { + 'OMP_PLACES': 'threads', + 'OMP_PROC_BIND': 'spread' + } + + num_tasks = required + num_cpus_per_task = required + + reference = { + '*': { + 'triad': (None, None, None, 'MB/s') + } + } + maintainers = ['RS', 'SK'] + + @rfm.run_before('run') + def set_omp_num_threads(self): + '''Set the number of OMP threads to ``num_cpus_per_task``.''' + self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}' + + @rfm.run_before('sanity') + def set_sanity_patterns(self): + '''Set sanity patterns to check the error threshold.''' + + self.sanity_patterns = sn.assert_found( + r'Solution Validates: avg error less than', self.stdout + ) + + @rfm.run_before('performance') + def set_performance_patterns(self): + '''Set performance to track the triad bandwidth.''' + + self.perf_patterns = { + 'triad': sn.min(sn.extractall( + r'Triad:\s+(?P<triad>\S+)\s+\S+', self.stdout, 'triad', float + )) + } diff --git a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c b/hpctestlib/microbenchmarks/cpu/stream/src/stream.c similarity index 100% rename from cscs-checks/microbenchmarks/cpu/stream/src/stream.c rename to hpctestlib/microbenchmarks/cpu/stream/src/stream.c From 363d42ff90fb4befc4bcdc9354dceed97f09ac56 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 31 May 2021 15:53:15 +0200 Subject: [PATCH 04/20] Bugfix stream tests --- .../microbenchmarks/cpu/stream/stream.py | 31 +- .../microbenchmarks/cpu/stream/__init__.py | 37 +- .../microbenchmarks/cpu/stream/src/stream.c | 585 
------------------ .../cpu/strided_bandwidth/__init__.py | 7 +- 4 files changed, 59 insertions(+), 601 deletions(-) delete mode 100644 hpctestlib/microbenchmarks/cpu/stream/src/stream.c diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py index 8180fe8738..be156ece8b 100644 --- a/cscs-checks/microbenchmarks/cpu/stream/stream.py +++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py @@ -6,11 +6,13 @@ import reframe as rfm import reframe.utility.sanity as sn -from hpctestlib.microbenchmark.cpu.stream import Stream +from hpctestlib.microbenchmarks.cpu.stream import Stream @rfm.simple_test class stream_check(Stream): + '''Stream benchmark test.''' + valid_systems = [ 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn' @@ -38,7 +40,7 @@ class stream_check(Stream): 'tsa:pn': 16, } ) - stream_bw_reference = variable( + triad_reference = variable( dict, value={ 'PrgEnv-cray': { 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')}, @@ -71,27 +73,44 @@ class stream_check(Stream): @rfm.run_after('init') def filter_valid_prog_environs(self): + '''Special conditions for arolla and tsa.''' if self.current_system.name in ['arolla', 'tsa']: self.valid_prog_environs = ['PrgEnv-gnu'] - @rfm.run_after('init') + @rfm.run_after('setup') def set_num_cpus_per_task(self): + '''If partition not in ``stream_cpus_per_task``, leave as required.''' self.num_cpus_per_task = self.stream_cpus_per_task.get( - self.current_partition.fullname, required + self.current_partition.fullname, self.required ) @rfm.run_before('compile') def set_compiler_flags(self): + '''Set build flags for the different environments.''' envname = self.current_environ.name self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3']) @rfm.run_before('run') def set_env_vars(self): + '''Special environment treatment for the PrgEnv-pgi.''' if self.current_environ.name == 'PrgEnv-pgi': self.variables['OMP_PROC_BIND'] = 'true' 
@rfm.run_before('performance') def set_perf_references(self): + '''Set performance refs as defined in ``triad_reference``. + + All other perf vars are left as default. + ''' + envname = self.current_environ.name - if envname in self.stream_bw_reference: - self.reference = self.stream_bw_reference[envname] + if envname in self.triad_reference: + extra_refs = { + '*': { + 'scale': (None, None, None, 'MB/s'), + 'add': (None, None, None, 'MB/s'), + 'copy': (None, None, None, 'MB/s'), + } + } + self.reference = self.triad_reference[envname] + self.reference.update(extra_refs) diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py index a112e720df..b4ce530dce 100644 --- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -11,14 +11,25 @@ class Stream(rfm.RegressionTest, pin_prefix=True): - '''This test checks the stream test: - Function Best Rate MB/s Avg time Min time Max time - Triad: 13991.7 0.017174 0.017153 0.017192 + '''Stream benchmark. + + For info on the executable, see the executable sources. + + Derived tests must set the variables ``num_tasks`` and + ``num_cpus_per_task``. 
+ ''' + # Required variables + num_tasks = required + num_cpus_per_task = required + descr = 'STREAM Benchmark' exclusive_access = True use_multithreading = False + prebuild_cmds = [ + 'wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c', + ] sourcepath = 'stream.c' build_system = 'SingleSource' num_tasks_per_node = 1 @@ -26,13 +37,12 @@ class Stream(rfm.RegressionTest, pin_prefix=True): 'OMP_PLACES': 'threads', 'OMP_PROC_BIND': 'spread' } - - num_tasks = required - num_cpus_per_task = required - reference = { '*': { - 'triad': (None, None, None, 'MB/s') + 'triad': (None, None, None, 'MB/s'), + 'add': (None, None, None, 'MB/s'), + 'copy': (None, None, None, 'MB/s'), + 'scale': (None, None, None, 'MB/s') } } maintainers = ['RS', 'SK'] @@ -57,5 +67,14 @@ def set_performance_patterns(self): self.perf_patterns = { 'triad': sn.min(sn.extractall( r'Triad:\s+(?P\S+)\s+\S+', self.stdout, 'triad', float - )) + )), + 'add': sn.min(sn.extractall( + r'Add:\s+(?P\S+)\s+\S+', self.stdout, 'add', float + )), + 'copy': sn.min(sn.extractall( + r'Copy:\s+(?P\S+)\s+\S+', self.stdout, 'copy', float + )), + 'scale': sn.min(sn.extractall( + r'Scale:\s+(?P\S+)\s+\S+', self.stdout, 'scale', float + )), } diff --git a/hpctestlib/microbenchmarks/cpu/stream/src/stream.c b/hpctestlib/microbenchmarks/cpu/stream/src/stream.c deleted file mode 100644 index b9a2cee3b2..0000000000 --- a/hpctestlib/microbenchmarks/cpu/stream/src/stream.c +++ /dev/null @@ -1,585 +0,0 @@ -/*-----------------------------------------------------------------------*/ -/* Program: STREAM */ -/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ -/* Original code developed by John D. McCalpin */ -/* Programmers: John D. McCalpin */ -/* Joe R. Zagar */ -/* */ -/* This program measures memory transfer rates in MB/s for simple */ -/* computational kernels coded in C. */ -/*-----------------------------------------------------------------------*/ -/* Copyright 1991-2013: John D. 
McCalpin */ -/*-----------------------------------------------------------------------*/ -/* License: */ -/* 1. You are free to use this program and/or to redistribute */ -/* this program. */ -/* 2. You are free to modify this program for your own use, */ -/* including commercial use, subject to the publication */ -/* restrictions in item 3. */ -/* 3. You are free to publish results obtained from running this */ -/* program, or from works that you derive from this program, */ -/* with the following limitations: */ -/* 3a. In order to be referred to as "STREAM benchmark results", */ -/* published results must be in conformance to the STREAM */ -/* Run Rules, (briefly reviewed below) published at */ -/* http://www.cs.virginia.edu/stream/ref.html */ -/* and incorporated herein by reference. */ -/* As the copyright holder, John McCalpin retains the */ -/* right to determine conformity with the Run Rules. */ -/* 3b. Results based on modified source code or on runs not in */ -/* accordance with the STREAM Run Rules must be clearly */ -/* labelled whenever they are published. Examples of */ -/* proper labelling include: */ -/* "tuned STREAM benchmark results" */ -/* "based on a variant of the STREAM benchmark code" */ -/* Other comparable, clear, and reasonable labelling is */ -/* acceptable. */ -/* 3c. Submission of results to the STREAM benchmark web site */ -/* is encouraged, but not required. */ -/* 4. Use of this program or creation of derived works based on this */ -/* program constitutes acceptance of these licensing restrictions. */ -/* 5. Absolutely no warranty is expressed or implied. 
*/ -/*-----------------------------------------------------------------------*/ -# include -# include -# include -# include -# include -# include - -/*----------------------------------------------------------------------- - * INSTRUCTIONS: - * - * 1) STREAM requires different amounts of memory to run on different - * systems, depending on both the system cache size(s) and the - * granularity of the system timer. - * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) - * to meet *both* of the following criteria: - * (a) Each array must be at least 4 times the size of the - * available cache memory. I don't worry about the difference - * between 10^6 and 2^20, so in practice the minimum array size - * is about 3.8 times the cache size. - * Example 1: One Xeon E3 with 8 MB L3 cache - * STREAM_ARRAY_SIZE should be >= 4 million, giving - * an array size of 30.5 MB and a total memory requirement - * of 91.5 MB. - * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) - * STREAM_ARRAY_SIZE should be >= 20 million, giving - * an array size of 153 MB and a total memory requirement - * of 458 MB. - * (b) The size should be large enough so that the 'timing calibration' - * output by the program is at least 20 clock-ticks. - * Example: most versions of Windows have a 10 millisecond timer - * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. - * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. - * This means the each array must be at least 1 GB, or 128M elements. - * - * Version 5.10 increases the default array size from 2 million - * elements to 10 million elements in response to the increasing - * size of L3 caches. The new default size is large enough for caches - * up to 20 MB. - * Version 5.10 changes the loop index variables from "register int" - * to "ssize_t", which allows array indices >2^32 (4 billion) - * on properly configured 64-bit systems. 
Additional compiler options - * (such as "-mcmodel=medium") may be required for large memory runs. - * - * Array size can be set at compile time without modifying the source - * code for the (many) compilers that support preprocessor definitions - * on the compile line. E.g., - * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M - * will override the default size of 10M with a new size of 100M elements - * per array. - */ -#ifndef STREAM_ARRAY_SIZE -# define STREAM_ARRAY_SIZE 10000000 -#endif - -/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result - * for any iteration after the first, therefore the minimum value - * for NTIMES is 2. - * There are no rules on maximum allowable values for NTIMES, but - * values larger than the default are unlikely to noticeably - * increase the reported performance. - * NTIMES can also be set on the compile line without changing the source - * code using, for example, "-DNTIMES=7". - */ -#ifdef NTIMES -#if NTIMES<=1 -# define NTIMES 10 -#endif -#endif -#ifndef NTIMES -# define NTIMES 10 -#endif - -/* Users are allowed to modify the "OFFSET" variable, which *may* change the - * relative alignment of the arrays (though compilers may change the - * effective offset by making the arrays non-contiguous on some systems). - * Use of non-zero values for OFFSET can be especially helpful if the - * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. - * OFFSET can also be set on the compile line without changing the source - * code using, for example, "-DOFFSET=56". - */ -#ifndef OFFSET -# define OFFSET 0 -#endif - -/* - * 3) Compile the code with optimization. Many compilers generate - * unreasonably bad code before the optimizer tightens things up. - * If the results are unreasonably good, on the other hand, the - * optimizer might be too smart for me! - * - * For a simple single-core version, try compiling with: - * cc -O stream.c -o stream - * This is known to work on many, many systems.... 
- * - * To use multiple cores, you need to tell the compiler to obey the OpenMP - * directives in the code. This varies by compiler, but a common example is - * gcc -O -fopenmp stream.c -o stream_omp - * The environment variable OMP_NUM_THREADS allows runtime control of the - * number of threads/cores used when the resulting "stream_omp" program - * is executed. - * - * To run with single-precision variables and arithmetic, simply add - * -DSTREAM_TYPE=float - * to the compile line. - * Note that this changes the minimum array sizes required --- see (1) above. - * - * The preprocessor directive "TUNED" does not do much -- it simply causes the - * code to call separate functions to execute each kernel. Trivial versions - * of these functions are provided, but they are *not* tuned -- they just - * provide predefined interfaces to be replaced with tuned code. - * - * - * 4) Optional: Mail the results to mccalpin@cs.virginia.edu - * Be sure to include info that will help me understand: - * a) the computer hardware configuration (e.g., processor model, memory type) - * b) the compiler name/version and compilation flags - * c) any run-time information (such as OMP_NUM_THREADS) - * d) all of the output from the test case. - * - * Thanks! 
- * - *-----------------------------------------------------------------------*/ - -# define HLINE "-------------------------------------------------------------\n" - -# ifndef MIN -# define MIN(x,y) ((x)<(y)?(x):(y)) -# endif -# ifndef MAX -# define MAX(x,y) ((x)>(y)?(x):(y)) -# endif - -#ifndef STREAM_TYPE -#define STREAM_TYPE double -#endif - -static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], - b[STREAM_ARRAY_SIZE+OFFSET], - c[STREAM_ARRAY_SIZE+OFFSET]; - -static double avgtime[4] = {0}, maxtime[4] = {0}, - mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - -static char *label[4] = {"Copy: ", "Scale: ", - "Add: ", "Triad: "}; - -static double bytes[4] = { - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE - }; - -extern double mysecond(); -extern void checkSTREAMresults(); -#ifdef TUNED -extern void tuned_STREAM_Copy(); -extern void tuned_STREAM_Scale(STREAM_TYPE scalar); -extern void tuned_STREAM_Add(); -extern void tuned_STREAM_Triad(STREAM_TYPE scalar); -#endif -#ifdef _OPENMP -extern int omp_get_num_threads(); -#endif -int -main() - { - int quantum, checktick(); - int BytesPerWord; - int k; - ssize_t j; - STREAM_TYPE scalar; - double t, times[4][NTIMES]; - - /* --- SETUP --- determine precision and check timing --- */ - - printf(HLINE); - printf("STREAM version $Revision: 5.10 $\n"); - printf(HLINE); - BytesPerWord = sizeof(STREAM_TYPE); - printf("This system uses %d bytes per array element.\n", - BytesPerWord); - - printf(HLINE); -#ifdef N - printf("***** WARNING: ******\n"); - printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); - printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); - printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); - printf("***** WARNING: ******\n"); 
-#endif - - printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); - printf("Memory per array = %.1f MiB (= %.1f GiB).\n", - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); - printf("Total memory required = %.1f MiB (= %.1f GiB).\n", - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); - printf("Each kernel will be executed %d times.\n", NTIMES); - printf(" The *best* time for each kernel (excluding the first iteration)\n"); - printf(" will be used to compute the reported bandwidth.\n"); - -#ifdef _OPENMP - printf(HLINE); -#pragma omp parallel - { -#pragma omp master - { - k = omp_get_num_threads(); - printf ("Number of Threads requested = %i\n",k); - } - } -#endif - -#ifdef _OPENMP - k = 0; -#pragma omp parallel -#pragma omp atomic - k++; - printf ("Number of Threads counted = %i\n",k); -#endif - - /* Get initial value for system clock. 
*/ -#pragma omp parallel for - for (j=0; j= 1) - printf("Your clock granularity/precision appears to be " - "%d microseconds.\n", quantum); - else { - printf("Your clock granularity appears to be " - "less than one microsecond.\n"); - quantum = 1; - } - - t = mysecond(); -#pragma omp parallel for - for (j = 0; j < STREAM_ARRAY_SIZE; j++) - a[j] = 2.0E0 * a[j]; - t = 1.0E6 * (mysecond() - t); - - printf("Each test below will take on the order" - " of %d microseconds.\n", (int) t ); - printf(" (= %d clock ticks)\n", (int) (t/quantum) ); - printf("Increase the size of the arrays if this shows that\n"); - printf("you are not getting at least 20 clock ticks per test.\n"); - - printf(HLINE); - - printf("WARNING -- The above is only a rough guideline.\n"); - printf("For best results, please be sure you know the\n"); - printf("precision of your system timer.\n"); - printf(HLINE); - - /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ - - scalar = 3.0; - for (k=0; k - -double mysecond() -{ - struct timeval tp; - struct timezone tzp; - int i; - - i = gettimeofday(&tp,&tzp); - return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); -} - -#ifndef abs -#define abs(a) ((a) >= 0 ? 
(a) : -(a)) -#endif -void checkSTREAMresults () -{ - STREAM_TYPE aj,bj,cj,scalar; - STREAM_TYPE aSumErr,bSumErr,cSumErr; - STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; - double epsilon; - ssize_t j; - int k,ierr,err; - - /* reproduce initialization */ - aj = 1.0; - bj = 2.0; - cj = 0.0; - /* a[] is modified during timing check */ - aj = 2.0E0 * aj; - /* now execute timing loop */ - scalar = 3.0; - for (k=0; k epsilon) { - err++; - printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,aj,a[j],abs((aj-a[j])/aAvgErr)); - } -#endif - } - } - printf(" For array a[], %d errors were found.\n",ierr); - } - if (abs(bAvgErr/bj) > epsilon) { - err++; - printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,bj,b[j],abs((bj-b[j])/bAvgErr)); - } -#endif - } - } - printf(" For array b[], %d errors were found.\n",ierr); - } - if (abs(cAvgErr/cj) > epsilon) { - err++; - printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,cj,c[j],abs((cj-c[j])/cAvgErr)); - } -#endif - } - } - printf(" For array c[], %d errors were 
found.\n",ierr); - } - if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); - } -#ifdef VERBOSE - printf ("Results Validation Verbose Results: \n"); - printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); - printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); - printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); -#endif -} - -#ifdef TUNED -/* stubs for "tuned" versions of the kernels */ -void tuned_STREAM_Copy() -{ - ssize_t j; -#pragma omp parallel for - for (j=0; j Date: Mon, 31 May 2021 15:55:39 +0200 Subject: [PATCH 05/20] Remove pin_prefix from stream test --- hpctestlib/microbenchmarks/cpu/stream/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py index b4ce530dce..38da66e447 100644 --- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -10,14 +10,13 @@ __all__ = ['Stream'] -class Stream(rfm.RegressionTest, pin_prefix=True): +class Stream(rfm.RegressionTest): '''Stream benchmark. For info on the executable, see the executable sources. Derived tests must set the variables ``num_tasks`` and ``num_cpus_per_task``. 
- ''' # Required variables From 8be9af24be89320cec7dcfff4d731d9b483d2da6 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Mon, 31 May 2021 19:10:08 +0200 Subject: [PATCH 06/20] Port cpu latency to hpctestlib --- .../microbenchmarks/cpu/latency/latency.py | 108 +++++++----------- .../microbenchmarks/cpu/latency/__init__.py | 89 +++++++++++++++ .../cpu/latency/src/latency.cpp | 0 3 files changed, 132 insertions(+), 65 deletions(-) create mode 100644 hpctestlib/microbenchmarks/cpu/latency/__init__.py rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/latency/src/latency.cpp (100%) diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index fa02138a9b..63221cb1f6 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -6,92 +6,70 @@ import reframe as rfm import reframe.utility.sanity as sn +from hpctestlib.microbenchmarks.cpu.latency import CpuLatency -@rfm.simple_test -class CPULatencyTest(rfm.RegressionTest): - def __init__(self): - self.sourcepath = 'latency.cpp' - self.build_system = 'SingleSource' - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'ault:intel', 'ault:amdvega', 'tave:compute'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.num_tasks = 0 - self.num_tasks_per_node = 1 - - self.build_system.cxxflags = ['-O3'] - self.executable_opts = ['16000', '128000', '8000000', '500000000'] +@rfm.simple_test +class cpu_latency_check(CpuLatency): + buffer_sizes = ['16000', '128000', '8000000', '500000000'] + num_tasks = 0 + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'ault:intel', 'ault:amdvega', 'tave:compute' + ] + valid_prog_environs = ['PrgEnv-gnu'] + tags = {'benchmark', 'diagnostic'} + @rfm.run_after('setup') + def set_modules(self): if self.current_system.name in {'daint', 'dom'}: self.modules = ['craype-hugepages1G'] if self.current_system.name in {'tave'}: self.modules = 
['craype-hugepages512M'] - self.sanity_patterns = sn.assert_eq( - sn.count(sn.findall(r'latency', self.stdout)), - self.num_tasks_assigned * len(self.executable_opts)) - - def lat_pattern(index): - return sn.extractsingle( - r'latency \(ns\) for input size %s: (?P\S+) clocks' % - self.executable_opts[index], self.stdout, 'bw', float) - - self.perf_patterns = { - 'latencyL1': lat_pattern(0), - 'latencyL2': lat_pattern(1), - 'latencyL3': lat_pattern(2), - 'latencyMem': lat_pattern(3), - } - + @rfm.run_before('performance') + def set_references(self): self.reference = { 'dom:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'dom:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 'daint:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'daint:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 
'ault:intel': { - 'latencyL1': (1.08, -0.01, 0.26, 'ns'), - 'latencyL2': (3.8, -0.01, 0.26, 'ns'), - 'latencyL3': (21.5, -0.01, 0.05, 'ns'), - 'latencyMem': (86.5, -0.01, 0.05, 'ns') + 'latencyL1': (1.08, -0.01, 0.26, 'ns'), + 'latencyL2': (3.8, -0.01, 0.26, 'ns'), + 'latencyL3': (21.5, -0.01, 0.05, 'ns'), + 'latencyL4': (86.5, -0.01, 0.05, 'ns') }, 'ault:amdvega': { - 'latencyL1': (1.32, -0.01, 0.26, 'ns'), - 'latencyL2': (4.02, -0.01, 0.26, 'ns'), - 'latencyL3': (14.4, -0.01, 0.26, 'ns'), - 'latencyMem': (90.0, -0.01, 0.05, 'ns') + 'latencyL1': (1.32, -0.01, 0.26, 'ns'), + 'latencyL2': (4.02, -0.01, 0.26, 'ns'), + 'latencyL3': (14.4, -0.01, 0.26, 'ns'), + 'latencyL4': (90.0, -0.01, 0.05, 'ns') }, 'tave:compute': { - 'latencyL1': (2.86, -0.01, 0.05, 'ns'), - 'latencyL2': (12.15, -0.01, 0.05, 'ns'), - 'latencyL3': (137, -0.01, 0.05, 'ns'), - 'latencyMem': (150, -0.05, 0.05, 'ns') + 'latencyL1': (2.86, -0.01, 0.05, 'ns'), + 'latencyL2': (12.15, -0.01, 0.05, 'ns'), + 'latencyL3': (137, -0.01, 0.05, 'ns'), + 'latencyL4': (150, -0.05, 0.05, 'ns') }, } - - self.maintainers = ['SK'] - self.tags = {'benchmark', 'diagnostic'} - - @property - @sn.sanity_function - def num_tasks_assigned(self): - return self.job.num_tasks diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py new file mode 100644 index 0000000000..2ec00d9842 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py @@ -0,0 +1,89 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn +import reframe.utility.typecheck as typ + +__all__ = ['CpuLatency'] + + +class CpuLatency(rfm.RegressionTest, pin_prefix=True): + ''' CPU latency test. + + Derived tests must set the variables ``buffer_size`` and ``num_tasks``. 
+ The variable ``buffer_sizes`` is a list of the different buffer sizes to + be used on this latency test. The executable will return the latency in + ``ns`` for each of the buffer sizes specified in this list. + + This test assumes that the list of buffer sizes is provided in increasing + order, and this test will automatically extract a performance variable for + the latency of each buffer. These performance variables are named + ``latencyL1``, ``latencyL2`` and so on in increasing order. + ''' + + # Required variables + buffer_sizes = variable(typ.List[str]) + num_tasks = required + + sourcepath = 'latency.cpp' + build_system = 'SingleSource' + num_tasks_per_node = 1 + maintainers = ['SK', 'JO'] + + @rfm.run_before('compile') + def set_compiler_flags(self): + self.build_system.cxxflags = ['-O3'] + + @rfm.run_before('run') + def set_exc_opts(self): + self.executable_opts = self.buffer_sizes + + @rfm.run_before('sanity') + def set_sanity_patterns(self): + self.sanity_patterns = sn.assert_eq( + sn.count(sn.findall(r'latency \(ns\)', self.stdout)), + self.num_tasks*sn.count(self.executable_opts) + ) + + @sn.sanity_function + def get_latency(self, buffer_size): + '''Extract the worst latency for a given buffer size.''' + + return sn.max(sn.extractall( + r'latency \(ns\) for input size %s: (?P\S+) clocks' % + buffer_size, self.stdout, 'bw', float + )) + + @rfm.run_before('performance') + def set_references(self): + '''Set dummy references to get the perf values in the perf report. + + This will create as many levels as passed in ``buffer_sizes``. Derived + test must override this hook if they wish to use their own reference + values. + ''' + + refs = {'*': {}} + dummy_ref = (None, None, None, 'ns') + for i, buff in enumerate(self.buffer_sizes): + level = i+1 + refs['*'].update({f'latencyL{level}': dummy_ref}) + + self.reference = refs + + @rfm.run_before('performance') + def set_perf_patterns(self): + '''Set the performance patters to extract all latency levels. 
+ + The levels are named from ``L1`` to ``L(n+1)``, where ``n`` is the + length of ``buffer_sizes``. + ''' + + self.perf_patterns = {} + for i, buff in enumerate(self.buffer_sizes): + level = i+1 + level_name = f'latencyL{level}' + self.perf_patterns.update({level_name: self.get_latency(buff)}) diff --git a/cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp b/hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp rename to hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp From c9400450da3db1a56717e3f5c664eb4915ad1cc6 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 1 Jun 2021 11:10:49 +0200 Subject: [PATCH 07/20] Cleanup performance stage --- .../microbenchmarks/cpu/latency/__init__.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py index 2ec00d9842..3557af0c04 100644 --- a/hpctestlib/microbenchmarks/cpu/latency/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py @@ -39,10 +39,13 @@ def set_compiler_flags(self): @rfm.run_before('run') def set_exc_opts(self): + '''Set the ``buffer_sizes`` as the executable options.''' self.executable_opts = self.buffer_sizes @rfm.run_before('sanity') def set_sanity_patterns(self): + '''Verify the number of reported latency measurements.''' + self.sanity_patterns = sn.assert_eq( sn.count(sn.findall(r'latency \(ns\)', self.stdout)), self.num_tasks*sn.count(self.executable_opts) @@ -61,29 +64,27 @@ def get_latency(self, buffer_size): def set_references(self): '''Set dummy references to get the perf values in the perf report. - This will create as many levels as passed in ``buffer_sizes``. Derived + This will create as many levels as items in ``buffer_sizes``. Derived test must override this hook if they wish to use their own reference values. 
''' - refs = {'*': {}} - dummy_ref = (None, None, None, 'ns') - for i, buff in enumerate(self.buffer_sizes): - level = i+1 - refs['*'].update({f'latencyL{level}': dummy_ref}) - - self.reference = refs + self.reference = { + '*': { + f'latencyL{i+1}': (None, None, None, 'ns') + for i, buff in enumerate(self.buffer_sizes) + } + } @rfm.run_before('performance') def set_perf_patterns(self): '''Set the performance patters to extract all latency levels. - The levels are named from ``L1`` to ``L(n+1)``, where ``n`` is the + The levels are named from ``L1`` to ``L{n}``, where ``n`` is the length of ``buffer_sizes``. ''' - self.perf_patterns = {} - for i, buff in enumerate(self.buffer_sizes): - level = i+1 - level_name = f'latencyL{level}' - self.perf_patterns.update({level_name: self.get_latency(buff)}) + self.perf_patterns = { + f'latencyL{i+1}': self.get_latency(buff) + for i, buff in enumerate(self.buffer_sizes) + } From eef477ae2f63dedd685db7407202377729e55fbe Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 1 Jun 2021 14:08:10 +0200 Subject: [PATCH 08/20] Port CPU DGEMM to hpctestlib --- .../microbenchmarks/cpu/dgemm/dgemm.py | 133 +++++++++--------- .../microbenchmarks/cpu/dgemm/__init__.py | 97 +++++++++++++ .../microbenchmarks/cpu/dgemm/src/dgemm.c | 0 3 files changed, 163 insertions(+), 67 deletions(-) create mode 100644 hpctestlib/microbenchmarks/cpu/dgemm/__init__.py rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/dgemm/src/dgemm.c (100%) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index 3b9fc82b42..92891282d6 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -5,44 +5,60 @@ import reframe as rfm import reframe.utility.sanity as sn +import reframe.utility.osext as osext +from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm @rfm.simple_test -class DGEMMTest(rfm.RegressionTest): - def __init__(self): - self.descr = 'DGEMM 
performance test' - self.sourcepath = 'dgemm.c' - self.sanity_patterns = self.eval_sanity() +class dgemm_check(Dgemm): + '''CSCS DGEMM check. - # the perf patterns are automaticaly generated inside sanity - self.perf_patterns = {} - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', - 'eiger:mc', 'pilatus:mc'] + The matrix dimensions are set in the base class. + Every node reports its performance in Gflops/s. To do so, this class + overrides the performance patterns and references from the base test. + This is done in the ``set_perf_patterns`` pre-performance hook. + ''' + + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', + 'eiger:mc', 'pilatus:mc' + ] + num_tasks = 0 + sys_reference = variable( + dict, value={ + 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), + '*': (None, None, None, 'Gflop/s'), + }, + ) + tags = {'benchmark', 'diagnostic', 'craype'} + + @rfm.run_after('init') + def set_valid_prog_environs(self): if self.current_system.name in ['daint', 'dom']: self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel'] elif self.current_system.name in ['arolla', 'tsa']: self.valid_prog_environs = ['PrgEnv-gnu-nompi'] elif self.current_system.name in ['eiger', 'pilatus']: self.valid_prog_environs = ['PrgEnv-gnu'] - else: - self.valid_prog_environs = [] - self.num_tasks = 0 - self.use_multithreading = False - self.executable_opts = ['6144', '12288', '3072'] - self.build_system = 'SingleSource' - self.build_system.cflags = ['-O3'] - self.sys_reference = { - 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), - 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'dom:mc': 
(1040.0, -0.15, None, 'Gflop/s'), - 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), - 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), - } - self.maintainers = ['AJ', 'VH'] - self.tags = {'benchmark', 'diagnostic', 'craype'} + @rfm.run_after('setup') + def set_num_cpus_per_task(self): + if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: + self.num_cpus_per_task = 12 + elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: + self.num_cpus_per_task = 36 + elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']: + self.num_cpus_per_task = 16 + elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']: + self.num_cpus_per_task = 40 + elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: + self.num_cpus_per_task = 128 @rfm.run_before('compile') def setflags(self): @@ -52,7 +68,7 @@ def setflags(self): self.build_system.cppflags = [ '-DMKL_ILP64', '-I${MKLROOT}/include' ] - self.build_system.cflags = ['-qopenmp'] + self.build_system.cflags += ['-qopenmp'] self.build_system.ldflags = [ '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl' ] @@ -63,46 +79,29 @@ def setflags(self): self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', '-lpthread', '-lgfortran'] - @rfm.run_before('run') - def set_tasks(self): - if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - self.num_cpus_per_task = 12 - elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: - self.num_cpus_per_task = 36 - elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']: - self.num_cpus_per_task = 16 - elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']: - self.num_cpus_per_task = 40 - elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: - self.num_cpus_per_task = 128 + @rfm.run_before('performance') + def set_perf_patterns(self): + '''Override base performance patterns. 
- if self.num_cpus_per_task: - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'OMP_BIND': 'cores', - 'OMP_PROC_BIND': 'spread', - 'OMP_SCHEDULE': 'static' - } + Set each node as a performance variable reporting the Gflop/s. + The ``reference`` values for each node are extracted from the + ``sys_reference`` dict. + ''' - @sn.sanity_function - def eval_sanity(self): - all_tested_nodes = sn.evaluate(sn.extractall( - r'(?P\S+):\s+Time for \d+ DGEMM operations', - self.stdout, 'hostname')) - num_tested_nodes = len(all_tested_nodes) - failure_msg = ('Requested %s node(s), but found %s node(s)' % - (self.job.num_tasks, num_tested_nodes)) - sn.evaluate(sn.assert_eq(num_tested_nodes, self.job.num_tasks, - msg=failure_msg)) + part_name = self.current_partition.fullname + with osext.change_dir(self.stagedir): + node_names = sn.evaluate(self.get_nodenames()) - for hostname in all_tested_nodes: - partition_name = self.current_partition.fullname - ref_name = '%s:%s' % (partition_name, hostname) - self.reference[ref_name] = self.sys_reference.get( - partition_name, (0.0, None, None, 'Gflop/s') - ) - self.perf_patterns[hostname] = sn.extractsingle( - r'%s:\s+Avg\. performance\s+:\s+(?P\S+)' - r'\sGflop/s' % hostname, self.stdout, 'gflops', float) + # If part_name not in sys_reference, default back to '*' + if part_name not in self.sys_reference: + part_name = '*' - return True + # Set references and perf patterns. + self.reference = { + part_name: { + nid: self.sys_reference[part_name] for nid in node_names + } + } + self.perf_patterns = { + nid: self.get_node_performance(nid) for nid in node_names + } diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py new file mode 100644 index 0000000000..d53888118b --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py @@ -0,0 +1,97 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. 
See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Dgemm'] + + +class Dgemm(rfm.RegressionTest, pin_prefix=True): + '''Dgemm benchmark. + + Derived test must specify the variables ``num_tasks`` and + ``num_cpus_per_task``. + + The matrix sizes can be controlled through executable options. By default, + this test sets these as ``m=6144``, ``n=12288`` and ``k=3072``. Derived + tests are free to change these parameters at their convenience. The + performance of this tests is measured by the lowest performing node in + ``Gflops/s``. + ''' + + num_tasks = required + num_cpus_per_task = required + + descr = 'DGEMM performance test' + sourcepath = 'dgemm.c' + use_multithreading = False + executable_opts = ['6144', '12288', '3072'] + build_system = 'SingleSource' + reference = { + '*': { + 'min_perf': (None, None, None, 'Gflops/s') + } + } + maintainers = ['AJ', 'VH'] + + @rfm.run_before('compile') + def set_c_flags(self): + self.build_system.cflags += ['-O3'] + + @rfm.run_before('run') + def set_env_vars(self): + '''Set the environment variables.''' + + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'OMP_BIND': 'cores', + 'OMP_PROC_BIND': 'spread', + 'OMP_SCHEDULE': 'static' + } + + @sn.sanity_function + def get_nodenames(self): + '''Return a set with the participating node IDs.''' + + return set(sn.extractall( + r'(?P\S+):\s+Time for \d+ DGEMM operations', + self.stdout, 'hostname' + )) + + @rfm.run_before('sanity') + def set_sanity_patterns(self): + '''Assert that all requested nodes have completed.''' + + self.sanity_patterns = sn.assert_eq( + self.job.num_tasks, sn.count(self.get_nodenames()), + msg='some nodes did not complete' + ) + + @sn.sanity_function + def get_node_performance(self, nodeid): + '''Get the performance data from a specific ``nodeid``.''' + + return sn.extractsingle( + r'%s:\s+Avg\. 
performance\s+:\s+(?P\S+)\sGflop/s' % nodeid, + self.stdout, 'gflops', float + ) + + @sn.sanity_function + def get_min_performance(self): + '''Get the lowest performance from all nodes.''' + + return sn.min([ + self.get_node_performance(nid) for nid in self.get_nodenames() + ]) + + @rfm.run_before('performance') + def set_perf_patterns(self): + '''Set the perf patterns to check the min performance reported.''' + + self.perf_patterns = { + 'min_perf': self.get_min_performance(), + } diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c b/hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c similarity index 100% rename from cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c rename to hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c From db3b552cfbb4ae75715843243916de727b616d0f Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 1 Jun 2021 15:41:47 +0200 Subject: [PATCH 09/20] PEP8 fixes --- cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index 92891282d6..a57fa2b590 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -9,6 +9,7 @@ from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm + @rfm.simple_test class dgemm_check(Dgemm): '''CSCS DGEMM check. @@ -21,8 +22,8 @@ class dgemm_check(Dgemm): valid_systems = [ 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', - 'eiger:mc', 'pilatus:mc' + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', + 'eiger:mc', 'pilatus:mc' ] num_tasks = 0 sys_reference = variable( From 64d34297cf7a5a183d8efceb1ca725599aa885e8 Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Tue, 1 Jun 2021 17:40:22 +0200 Subject: [PATCH 10/20] Port dgemm to a64fx --- .../microbenchmarks/cpu/dgemm/dgemm.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index a57fa2b590..faddda9e28 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -23,7 +23,7 @@ class dgemm_check(Dgemm): valid_systems = [ 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', - 'eiger:mc', 'pilatus:mc' + 'eiger:mc', 'pilatus:mc', 'ault:a64fx' ] num_tasks = 0 sys_reference = variable( @@ -34,6 +34,7 @@ class dgemm_check(Dgemm): 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'ault:a64fx': (1930.0, -0.15, None, 'Gflop/s'), '*': (None, None, None, 'Gflop/s'), }, ) @@ -47,6 +48,16 @@ def set_valid_prog_environs(self): self.valid_prog_environs = ['PrgEnv-gnu-nompi'] elif self.current_system.name in ['eiger', 'pilatus']: self.valid_prog_environs = ['PrgEnv-gnu'] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] + + @rfm.run_after('setup') + def skip_incompatible_combinations(self): + '''Fujitsu env only available in ault's a64fx partition.''' + if self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.skip_if( + self.current_partition.fullname not in {'ault:a64fx'} + ) @rfm.run_after('setup') def set_num_cpus_per_task(self): @@ -60,9 +71,11 @@ def set_num_cpus_per_task(self): self.num_cpus_per_task = 40 elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: self.num_cpus_per_task = 128 + elif self.current_partition.fullname in ['ault:a64fx']: + self.num_cpus_per_task = 48 @rfm.run_before('compile') - def setflags(self): + def set_flags(self): if 
self.current_environ.name.startswith('PrgEnv-gnu'): self.build_system.cflags += ['-fopenmp'] elif self.current_environ.name.startswith('PrgEnv-intel'): @@ -73,6 +86,9 @@ def setflags(self): self.build_system.ldflags = [ '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl' ] + elif self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.build_system.cflags += ['-fopenmp', '-Nlibomp', '-mt'] + self.build_system.ldflags += ['-SSL2BLAMP', '-mt'] if self.current_partition.fullname in ['arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']: From 53cd1313c5c7284e8759849a11fa2c15178d4bd3 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 1 Jun 2021 18:02:58 +0200 Subject: [PATCH 11/20] Port stream benchmark to a64fx --- .../microbenchmarks/cpu/stream/stream.py | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py index be156ece8b..5b08dc51db 100644 --- a/cscs-checks/microbenchmarks/cpu/stream/stream.py +++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py @@ -15,19 +15,11 @@ class stream_check(Stream): valid_systems = [ 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn' + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', 'ault:a64fx' ] valid_prog_environs = [ 'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi' ] - prgenv_flags = variable( - dict, value={ - 'PrgEnv-cray': ['-fopenmp', '-O3'], - 'PrgEnv-gnu': ['-fopenmp', '-O3'], - 'PrgEnv-intel': ['-qopenmp', '-O3'], - 'PrgEnv-pgi': ['-mp', '-O3'] - } - ) stream_cpus_per_task = variable( dict, value={ 'arolla:cn': 16, @@ -38,6 +30,7 @@ class stream_check(Stream): 'dom:mc': 36, 'tsa:cn': 16, 'tsa:pn': 16, + 'ault:a64fx': 48, } ) triad_reference = variable( @@ -65,7 +58,10 @@ class stream_check(Stream): 'daint:mc': {'triad': (88500, -0.05, None, 'MB/s')}, 'dom:gpu': {'triad': (44500, -0.05, None, 'MB/s')}, 'dom:mc': {'triad': 
(88500, -0.05, None, 'MB/s')}, - } + }, + 'PrgEnv-fujitsu': { + 'ault:a64fx': {'triad': (85500, -0.05, None, 'MB/s')}, + }, } ) num_tasks = 1 @@ -76,6 +72,8 @@ def filter_valid_prog_environs(self): '''Special conditions for arolla and tsa.''' if self.current_system.name in ['arolla', 'tsa']: self.valid_prog_environs = ['PrgEnv-gnu'] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] @rfm.run_after('setup') def set_num_cpus_per_task(self): @@ -87,8 +85,17 @@ def set_num_cpus_per_task(self): @rfm.run_before('compile') def set_compiler_flags(self): '''Set build flags for the different environments.''' + envname = self.current_environ.name - self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3']) + if envname in {'PrgEnv-cray', 'PrgEnv-gnu'}: + self.build_system.cflags += ['-fopenmp', '-O3'] + elif envname in {'PrgEnv-intel'}: + self.build_system.cflags += ['-qopenmp', '-O3'] + elif envname in {'PrgEnv-intel'}: + self.build_system.cflags += ['-mp', '-O3'] + elif envname in {'PrgEnv-fujitsu'}: + self.build_system.cflags += ['-fopenmp', '-mt', '-O3'] + self.build_system.ldflags += ['-mt'] @rfm.run_before('run') def set_env_vars(self): From d7543e211c5d2b3cfafa9b131f9e9155dc8e893b Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Tue, 1 Jun 2021 18:13:03 +0200 Subject: [PATCH 12/20] Port latency benchmark to a64fx --- cscs-checks/microbenchmarks/cpu/latency/latency.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index 63221cb1f6..798a8dd386 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -15,11 +15,16 @@ class cpu_latency_check(CpuLatency): num_tasks = 0 valid_systems = [ 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'ault:intel', 'ault:amdvega', 'tave:compute' + 'ault:intel', 'ault:amdvega', 'tave:compute', 'ault:a64fx' ] valid_prog_environs = ['PrgEnv-gnu'] tags = {'benchmark', 'diagnostic'} + @rfm.run_after('init') + def set_valid_environs(self): + if self.current_system.name in {'ault'}: + self.valid_prog_environs = ['PrgEnv-fujitsu'] + @rfm.run_after('setup') def set_modules(self): if self.current_system.name in {'daint', 'dom'}: @@ -72,4 +77,10 @@ def set_references(self): 'latencyL3': (137, -0.01, 0.05, 'ns'), 'latencyL4': (150, -0.05, 0.05, 'ns') }, + 'ault:a64fx': { + 'latencyL1': (2.78, None, 0.05, 'ns'), + 'latencyL2': (14.3, None, 0.05, 'ns'), + 'latencyL3': (32.1, None, 0.05, 'ns'), + 'latencyL4': (146, None, 0.05, 'ns') + }, } From e7cf2e19ecb5f279d381680de7cac888df610ef7 Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Tue, 1 Jun 2021 18:29:35 +0200 Subject: [PATCH 13/20] PEP8 fixes --- hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py index f66c9b2950..b7bdbce310 100644 --- a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py @@ -54,7 +54,9 @@ def set_exec_opts(self): In order, these are the buffer size, stride and number of threads. See the main docstring above for more info. ''' - self.executable_opts = ['100000000', f'{self.stride}', f'{self.num_cpus}'] + self.executable_opts = [ + '100000000', f'{self.stride}', f'{self.num_cpus}' + ] @rfm.run_before('sanity') def set_sanity_patterns(self): From 235fd0fd18ca809b0f59a7d579ff5c579da21ef6 Mon Sep 17 00:00:00 2001 From: "Javier J. Otero Perez" Date: Tue, 1 Jun 2021 18:34:11 +0200 Subject: [PATCH 14/20] Remove unused imports --- cscs-checks/microbenchmarks/cpu/latency/latency.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index 798a8dd386..aea1ae2dcd 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn from hpctestlib.microbenchmarks.cpu.latency import CpuLatency From 918c14deb84354b071de2af58e393ff48bad7241 Mon Sep 17 00:00:00 2001 From: "Javier J. 
Otero Perez" Date: Tue, 1 Jun 2021 18:44:51 +0200 Subject: [PATCH 15/20] Remove unused imports --- cscs-checks/microbenchmarks/cpu/stream/stream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py index 5b08dc51db..109fae2ef7 100644 --- a/cscs-checks/microbenchmarks/cpu/stream/stream.py +++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn from hpctestlib.microbenchmarks.cpu.stream import Stream From 30060d9ca1fedfdce52942997a390b5a3880129f Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 8 Jun 2021 16:29:11 +0200 Subject: [PATCH 16/20] Add num_tasks for strided bw check --- cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index 3428cce169..ae35f2c286 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -31,6 +31,7 @@ class strided_bandwidth_check(StridedBandwidth): valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', 'eiger:mc', 'pilatus:mc', 'ault:a64fx'] valid_prog_environs = ['PrgEnv-gnu'] + num_tasks = 0 @rfm.run_after('init') def set_valid_systems(self): From 653712bcddf02d4f142cbc271a34de1636a324a0 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 8 Jun 2021 16:51:25 +0200 Subject: [PATCH 17/20] Bump syntax to remove depr warns --- .../microbenchmarks/cpu/dgemm/dgemm.py | 10 +++++----- .../microbenchmarks/cpu/dgemm/dgemm_test.py | 20 +++++++++++++++++++ .../microbenchmarks/cpu/latency/latency.py | 6 +++--- .../microbenchmarks/cpu/stream/stream.py | 10 +++++----- .../microbenchmarks/cpu/stream/stream_test.py | 17 ++++++++++++++++ 
.../cpu/strided_bandwidth/strides.py | 6 +++--- .../microbenchmarks/cpu/dgemm/__init__.py | 8 ++++---- .../microbenchmarks/cpu/latency/__init__.py | 10 +++++----- .../microbenchmarks/cpu/stream/__init__.py | 6 +++--- .../cpu/strided_bandwidth/__init__.py | 6 +++--- 10 files changed, 68 insertions(+), 31 deletions(-) create mode 100644 cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py create mode 100644 cscs-checks/microbenchmarks/cpu/stream/stream_test.py diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index faddda9e28..4f74a56c15 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -40,7 +40,7 @@ class dgemm_check(Dgemm): ) tags = {'benchmark', 'diagnostic', 'craype'} - @rfm.run_after('init') + @run_after('init') def set_valid_prog_environs(self): if self.current_system.name in ['daint', 'dom']: self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel'] @@ -51,7 +51,7 @@ def set_valid_prog_environs(self): elif self.current_system.name in ['ault']: self.valid_prog_environs = ['PrgEnv-fujitsu'] - @rfm.run_after('setup') + @run_after('setup') def skip_incompatible_combinations(self): '''Fujitsu env only available in ault's a64fx partition.''' if self.current_environ.name.startswith('PrgEnv-fujitsu'): @@ -59,7 +59,7 @@ def skip_incompatible_combinations(self): self.current_partition.fullname not in {'ault:a64fx'} ) - @rfm.run_after('setup') + @run_after('setup') def set_num_cpus_per_task(self): if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: self.num_cpus_per_task = 12 @@ -74,7 +74,7 @@ def set_num_cpus_per_task(self): elif self.current_partition.fullname in ['ault:a64fx']: self.num_cpus_per_task = 48 - @rfm.run_before('compile') + @run_before('compile') def set_flags(self): if self.current_environ.name.startswith('PrgEnv-gnu'): self.build_system.cflags += ['-fopenmp'] @@ -96,7 +96,7 @@ def set_flags(self): 
self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', '-lpthread', '-lgfortran'] - @rfm.run_before('performance') + @run_before('performance') def set_perf_patterns(self): '''Override base performance patterns. diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py new file mode 100644 index 0000000000..36fcbf914e --- /dev/null +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py @@ -0,0 +1,20 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm + +@rfm.simple_test +class dgemm_check(Dgemm): + valid_systems = ['dom:mc'] + valid_prog_environs = ['PrgEnv-gnu'] + num_tasks = 0 + num_cpus_per_task = 36 + + @run_before('compile') + def setflags(self): + self.build_system.cflags += ['-fopenmp'] diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index aea1ae2dcd..5d85b2bdb7 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -19,19 +19,19 @@ class cpu_latency_check(CpuLatency): valid_prog_environs = ['PrgEnv-gnu'] tags = {'benchmark', 'diagnostic'} - @rfm.run_after('init') + @run_after('init') def set_valid_environs(self): if self.current_system.name in {'ault'}: self.valid_prog_environs = ['PrgEnv-fujitsu'] - @rfm.run_after('setup') + @run_after('setup') def set_modules(self): if self.current_system.name in {'daint', 'dom'}: self.modules = ['craype-hugepages1G'] if self.current_system.name in {'tave'}: self.modules = ['craype-hugepages512M'] - @rfm.run_before('performance') + @run_before('performance') def set_references(self): self.reference = { 'dom:mc': { diff --git 
a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py index 109fae2ef7..c9f4d14a03 100644 --- a/cscs-checks/microbenchmarks/cpu/stream/stream.py +++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py @@ -66,7 +66,7 @@ class stream_check(Stream): num_tasks = 1 tags = {'production', 'craype'} - @rfm.run_after('init') + @run_after('init') def filter_valid_prog_environs(self): '''Special conditions for arolla and tsa.''' if self.current_system.name in ['arolla', 'tsa']: @@ -74,14 +74,14 @@ def filter_valid_prog_environs(self): elif self.current_system.name in ['ault']: self.valid_prog_environs = ['PrgEnv-fujitsu'] - @rfm.run_after('setup') + @run_after('setup') def set_num_cpus_per_task(self): '''If partition not in ``stream_cpus_per_task``, leave as required.''' self.num_cpus_per_task = self.stream_cpus_per_task.get( self.current_partition.fullname, self.required ) - @rfm.run_before('compile') + @run_before('compile') def set_compiler_flags(self): '''Set build flags for the different environments.''' @@ -96,13 +96,13 @@ def set_compiler_flags(self): self.build_system.cflags += ['-fopenmp', '-mt', '-O3'] self.build_system.ldflags += ['-mt'] - @rfm.run_before('run') + @run_before('run') def set_env_vars(self): '''Special environment treatment for the PrgEnv-pgi.''' if self.current_environ.name == 'PrgEnv-pgi': self.variables['OMP_PROC_BIND'] = 'true' - @rfm.run_before('performance') + @run_before('performance') def set_perf_references(self): '''Set performance refs as defined in ``triad_reference``. diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py new file mode 100644 index 0000000000..87a8509e51 --- /dev/null +++ b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py @@ -0,0 +1,17 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.stream import Stream + + +@rfm.simple_test +class stream_check(Stream): + valid_systems = ['dom:mc'] + valid_prog_environs = ['PrgEnv-cray'] + num_tasks = 2 + num_cpus_per_task = 36 diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index ae35f2c286..58745fc9f3 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -33,7 +33,7 @@ class strided_bandwidth_check(StridedBandwidth): valid_prog_environs = ['PrgEnv-gnu'] num_tasks = 0 - @rfm.run_after('init') + @run_after('init') def set_valid_systems(self): cp = self.current_system.name if cp == 'ault': @@ -125,10 +125,10 @@ def set_valid_systems(self): ) tags = {'benchmark', 'diagnostic'} - @rfm.run_after('setup') + @run_after('setup') def set_num_cpus(self): self.num_cpus = self.system_num_cpus[self.current_partition.fullname] - @rfm.run_before('performance') + @run_before('performance') def set_references(self): self.reference = self.reference_per_stride[self.stride] diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py index d53888118b..0b3cd7c7c9 100644 --- a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py @@ -38,11 +38,11 @@ class Dgemm(rfm.RegressionTest, pin_prefix=True): } maintainers = ['AJ', 'VH'] - @rfm.run_before('compile') + @run_before('compile') def set_c_flags(self): self.build_system.cflags += ['-O3'] - @rfm.run_before('run') + @run_before('run') def set_env_vars(self): '''Set the environment variables.''' @@ -62,7 +62,7 @@ def get_nodenames(self): self.stdout, 'hostname' )) - @rfm.run_before('sanity') + @run_before('sanity') def set_sanity_patterns(self): '''Assert that all 
requested nodes have completed.''' @@ -88,7 +88,7 @@ def get_min_performance(self): self.get_node_performance(nid) for nid in self.get_nodenames() ]) - @rfm.run_before('performance') + @run_before('performance') def set_perf_patterns(self): '''Set the perf patterns to check the min performance reported.''' diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py index 3557af0c04..26235c28a3 100644 --- a/hpctestlib/microbenchmarks/cpu/latency/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py @@ -33,16 +33,16 @@ class CpuLatency(rfm.RegressionTest, pin_prefix=True): num_tasks_per_node = 1 maintainers = ['SK', 'JO'] - @rfm.run_before('compile') + @run_before('compile') def set_compiler_flags(self): self.build_system.cxxflags = ['-O3'] - @rfm.run_before('run') + @run_before('run') def set_exc_opts(self): '''Set the ``buffer_sizes`` as the executable options.''' self.executable_opts = self.buffer_sizes - @rfm.run_before('sanity') + @run_before('sanity') def set_sanity_patterns(self): '''Verify the number of reported latency measurements.''' @@ -60,7 +60,7 @@ def get_latency(self, buffer_size): buffer_size, self.stdout, 'bw', float )) - @rfm.run_before('performance') + @run_before('performance') def set_references(self): '''Set dummy references to get the perf values in the perf report. @@ -76,7 +76,7 @@ def set_references(self): } } - @rfm.run_before('performance') + @run_before('performance') def set_perf_patterns(self): '''Set the performance patters to extract all latency levels. 
diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py index 38da66e447..341b67818c 100644 --- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -46,12 +46,12 @@ class Stream(rfm.RegressionTest): } maintainers = ['RS', 'SK'] - @rfm.run_before('run') + @run_before('run') def set_omp_num_threads(self): '''Set the number of OMP threads to ``num_cpus_per_task``.''' self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}' - @rfm.run_before('sanity') + @run_before('sanity') def set_sanity_patterns(self): '''Set sanity patterns to check the error threshold.''' @@ -59,7 +59,7 @@ def set_sanity_patterns(self): r'Solution Validates: avg error less than', self.stdout ) - @rfm.run_before('performance') + @run_before('performance') def set_performance_patterns(self): '''Set performance to track the triad bandwidth.''' diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py index b7bdbce310..3708964c05 100644 --- a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py @@ -47,7 +47,7 @@ class StridedBandwidth(rfm.RegressionTest, pin_prefix=True): } maintainers = ['SK'] - @rfm.run_before('run') + @run_before('run') def set_exec_opts(self): '''Set the exec options. 
@@ -58,7 +58,7 @@ def set_exec_opts(self): '100000000', f'{self.stride}', f'{self.num_cpus}' ] - @rfm.run_before('sanity') + @run_before('sanity') def set_sanity_patterns(self): ''' Assert that the bandwidth is reported for all the tasks.''' @@ -67,7 +67,7 @@ def set_sanity_patterns(self): self.job.num_tasks ) - @rfm.run_before('performance') + @run_before('performance') def set_perf_patterns(self): '''Extract the min bandwidth as a performance metric.''' From aedc16006b0cfb958cf86a406bcb45a54278bc0d Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Tue, 8 Jun 2021 16:56:13 +0200 Subject: [PATCH 18/20] Remove leaked files --- .../microbenchmarks/cpu/dgemm/dgemm_test.py | 20 ------------------- .../microbenchmarks/cpu/stream/stream_test.py | 17 ---------------- 2 files changed, 37 deletions(-) delete mode 100644 cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py delete mode 100644 cscs-checks/microbenchmarks/cpu/stream/stream_test.py diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py deleted file mode 100644 index 36fcbf914e..0000000000 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) -# ReFrame Project Developers. See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: BSD-3-Clause - -import reframe as rfm -import reframe.utility.sanity as sn - -from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm - -@rfm.simple_test -class dgemm_check(Dgemm): - valid_systems = ['dom:mc'] - valid_prog_environs = ['PrgEnv-gnu'] - num_tasks = 0 - num_cpus_per_task = 36 - - @run_before('compile') - def setflags(self): - self.build_system.cflags += ['-fopenmp'] diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py deleted file mode 100644 index 87a8509e51..0000000000 --- a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) -# ReFrame Project Developers. See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: BSD-3-Clause - -import reframe as rfm -import reframe.utility.sanity as sn - -from hpctestlib.microbenchmarks.cpu.stream import Stream - - -@rfm.simple_test -class stream_check(Stream): - valid_systems = ['dom:mc'] - valid_prog_environs = ['PrgEnv-cray'] - num_tasks = 2 - num_cpus_per_task = 36 From 5826b0f0badad9c2f1d1aedd47878e78b39630a2 Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 28 Oct 2021 18:03:15 +0200 Subject: [PATCH 19/20] Update dgemm --- .../microbenchmarks/cpu/dgemm/dgemm.py | 47 +++---------------- .../microbenchmarks/cpu/dgemm/__init__.py | 27 +++-------- 2 files changed, 14 insertions(+), 60 deletions(-) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index 4f74a56c15..f06e4e8ccb 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -56,23 +56,17 @@ def skip_incompatible_combinations(self): '''Fujitsu env only available in ault's a64fx partition.''' if self.current_environ.name.startswith('PrgEnv-fujitsu'): self.skip_if( - self.current_partition.fullname not in 
{'ault:a64fx'} + self.current_partition.fullname not in ('ault:a64fx',) ) @run_after('setup') def set_num_cpus_per_task(self): - if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - self.num_cpus_per_task = 12 - elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: - self.num_cpus_per_task = 36 - elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']: - self.num_cpus_per_task = 16 - elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']: - self.num_cpus_per_task = 40 - elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: - self.num_cpus_per_task = 128 - elif self.current_partition.fullname in ['ault:a64fx']: - self.num_cpus_per_task = 48 + proc = self.current_partition.processor + pname = self.current_partition.fullname + if not proc.info: + self.skip(f'no topology information found for partition {pname!r}') + + self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core @run_before('compile') def set_flags(self): @@ -95,30 +89,3 @@ def set_flags(self): self.build_system.cflags += ['-I$EBROOTOPENBLAS/include'] self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', '-lpthread', '-lgfortran'] - - @run_before('performance') - def set_perf_patterns(self): - '''Override base performance patterns. - - Set each node as a performance variable reporting the Gflop/s. - The ``reference`` values for each node are extracted from the - ``sys_reference`` dict. - ''' - - part_name = self.current_partition.fullname - with osext.change_dir(self.stagedir): - node_names = sn.evaluate(self.get_nodenames()) - - # If part_name not in sys_reference, default back to '*' - if part_name not in self.sys_reference: - part_name = '*' - - # Set references and perf patterns. 
- self.reference = { - part_name: { - nid: self.sys_reference[part_name] for nid in node_names - } - } - self.perf_patterns = { - nid: self.get_node_performance(nid) for nid in node_names - } diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py index 0b3cd7c7c9..fb95bdeec6 100644 --- a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py +++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py @@ -31,11 +31,6 @@ class Dgemm(rfm.RegressionTest, pin_prefix=True): use_multithreading = False executable_opts = ['6144', '12288', '3072'] build_system = 'SingleSource' - reference = { - '*': { - 'min_perf': (None, None, None, 'Gflops/s') - } - } maintainers = ['AJ', 'VH'] @run_before('compile') @@ -53,7 +48,7 @@ def set_env_vars(self): 'OMP_SCHEDULE': 'static' } - @sn.sanity_function + @deferrable def get_nodenames(self): '''Return a set with the participating node IDs.''' @@ -62,16 +57,16 @@ def get_nodenames(self): self.stdout, 'hostname' )) - @run_before('sanity') - def set_sanity_patterns(self): + @sanity_function + def assert_all_nodes_completed(self): '''Assert that all requested nodes have completed.''' - self.sanity_patterns = sn.assert_eq( + return sn.assert_eq( self.job.num_tasks, sn.count(self.get_nodenames()), msg='some nodes did not complete' ) - @sn.sanity_function + @deferrable def get_node_performance(self, nodeid): '''Get the performance data from a specific ``nodeid``.''' @@ -80,18 +75,10 @@ def get_node_performance(self, nodeid): self.stdout, 'gflops', float ) - @sn.sanity_function - def get_min_performance(self): + @performance_function('Gflops/s') + def min_perf(self): '''Get the lowest performance from all nodes.''' return sn.min([ self.get_node_performance(nid) for nid in self.get_nodenames() ]) - - @run_before('performance') - def set_perf_patterns(self): - '''Set the perf patterns to check the min performance reported.''' - - self.perf_patterns = { - 'min_perf': self.get_min_performance(), - } 
From 0064156a163003de8eb00aa18a05f5d37977edad Mon Sep 17 00:00:00 2001 From: Javier Otero Date: Thu, 28 Oct 2021 18:08:22 +0200 Subject: [PATCH 20/20] Remove unused imports --- cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index f06e4e8ccb..8ea51f62d6 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -4,8 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn -import reframe.utility.osext as osext from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm