From 4d85cb513327065b9e8a6ab3f8c490c08915bb9b Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Thu, 27 May 2021 15:25:47 +0200
Subject: [PATCH 01/20] Port strided bandwidth benchmark to the lib

---
 .../cpu/strided_bandwidth/strides.py          | 210 ++++++++----------
 .../cpu/strided_bandwidth/__init__.py         |  74 ++++++
 .../cpu/strided_bandwidth/src/strides.cpp     |   0
 3 files changed, 164 insertions(+), 120 deletions(-)
 create mode 100644 hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
 rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp (100%)
diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
index 22991adccd..5be27582b0 100644
--- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
+++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
@@ -6,28 +6,34 @@
 import reframe as rfm
 import reframe.utility.sanity as sn
 
+from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth
 
-class StridedBase(rfm.RegressionTest):
-    def __init__(self):
-        self.sourcepath = 'strides.cpp'
-        self.build_system = 'SingleSource'
-        self.valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
-                              'eiger:mc', 'pilatus:mc']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
 
-        self.sanity_patterns = sn.assert_eq(
-            sn.count(sn.findall(r'bandwidth', self.stdout)),
-            self.num_tasks_assigned)
-
-        self.perf_patterns = {
-            'bandwidth': sn.extractsingle(
-                r'bandwidth: (?P<bw>\S+) GB/s',
-                self.stdout, 'bw', float)
-        }
-
-        self.system_num_cpus = {
+@rfm.simple_test
+class strided_bandwidth_check(StridedBandwidth):
+    '''Strided bandwidth check.
+
+    This test is parameterized with the ``stride`` parameter, covering the
+    following scenarios: 8-byte stride using the full cache line, 64-byte
+    stride using 1/8 of the cacheline, and 128-byte using 1/8 of every 2nd
+    cacheline.
+
+    This test requires the ``num_cpus`` variable, which is set in a post-setup
+    hook. The data for each supported system is stored in ``system_num_cpus``
+
+    Since the performance references change with the ``stride`` parameter, the
+    references for each test instance are stored in ``reference_per_stride``.
+    The actual references are then set in a pre-performance hook.
+    '''
+
+    # Define the stride parameter
+    stride = parameter([1, 8, 16])
+
+    valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
+                     'eiger:mc', 'pilatus:mc']
+    valid_prog_environs = ['PrgEnv-gnu']
+    system_num_cpus = variable(
+        dict, value={
             'daint:mc':  72,
             'daint:gpu': 24,
             'dom:mc':  72,
@@ -35,110 +41,74 @@ def __init__(self):
             'eiger:mc': 128,
             'pilatus:mc': 128
         }
-
-        self.maintainers = ['SK']
-        self.tags = {'benchmark', 'diagnostic'}
-
-    @property
-    @sn.sanity_function
-    def num_tasks_assigned(self):
-        return self.job.num_tasks
-
-
-@rfm.simple_test
-class StridedBandwidthTest(StridedBase):
-    def __init__(self):
-        super().__init__()
-
-        self.reference = {
-            'dom:gpu': {
-                'bandwidth': (50, -0.1, 0.1, 'GB/s')
-            },
-            'dom:mc': {
-                'bandwidth': (100, -0.1, 0.1, 'GB/s')
-            },
-            'daint:gpu': {
-                'bandwidth': (50, -0.1, 0.1, 'GB/s')
-            },
-            'daint:mc': {
-                'bandwidth': (100, -0.1, 0.1, 'GB/s')
-            },
-            'eiger:mc': {
-                'bandwidth': (270, -0.1, 0.1, 'GB/s')
-            },
-            'pilatus:mc': {
-                'bandwidth': (270, -0.1, 0.1, 'GB/s')
-            }
-        }
-
-    @rfm.run_before('run')
-    def set_exec_opts(self):
-        self.num_cpus = self.system_num_cpus[self.current_partition.fullname]
-
-        # 8-byte stride, using the full cacheline
-        self.executable_opts = ['100000000', '1', f'{self.num_cpus}']
-
-
-@rfm.simple_test
-class StridedBandwidthTest64(StridedBase):
-    def __init__(self):
-        super().__init__()
-
-        self.reference = {
-            'dom:gpu': {
-                'bandwidth': (6, -0.1, 0.2, 'GB/s')
-            },
-            'dom:mc': {
-                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+    )
+    reference_per_stride = variable(
+        dict, value={
+            1: {
+                'dom:gpu': {
+                    'bandwidth': (50, -0.1, 0.1, 'GB/s')
+                },
+                'dom:mc': {
+                    'bandwidth': (100, -0.1, 0.1, 'GB/s')
+                },
+                'daint:gpu': {
+                    'bandwidth': (50, -0.1, 0.1, 'GB/s')
+                },
+                'daint:mc': {
+                    'bandwidth': (100, -0.1, 0.1, 'GB/s')
+                },
+                'eiger:mc': {
+                    'bandwidth': (270, -0.1, 0.1, 'GB/s')
+                },
+                'pilatus:mc': {
+                    'bandwidth': (270, -0.1, 0.1, 'GB/s')
+                }
             },
-            'daint:gpu': {
-                'bandwidth': (6, -0.05, 0.2, 'GB/s')
+            8: {
+                'dom:gpu': {
+                    'bandwidth': (6, -0.1, 0.2, 'GB/s')
+                },
+                'dom:mc': {
+                    'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+                },
+                'daint:gpu': {
+                    'bandwidth': (6, -0.05, 0.2, 'GB/s')
+                },
+                'daint:mc': {
+                    'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
+                },
+                'eiger:mc': {
+                    'bandwidth': (33, -0.1, 0.2, 'GB/s')
+                },
+                'pilatus:mc': {
+                    'bandwidth': (33, -0.1, 0.2, 'GB/s')
+                }
             },
-            'daint:mc': {
-                'bandwidth': (12.5, -0.1, 0.2, 'GB/s')
-            },
-            'eiger:mc': {
-                'bandwidth': (33, -0.1, 0.2, 'GB/s')
-            },
-            'pilatus:mc': {
-                'bandwidth': (33, -0.1, 0.2, 'GB/s')
+            16: {
+                'dom:gpu': {
+                    'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+                },
+                'dom:mc': {
+                    'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+                },
+                'daint:gpu': {
+                    'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
+                },
+                'daint:mc': {
+                    'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
+                },
+                'eiger:mc': {
+                    'bandwidth': (33, -0.1, 0.2, 'GB/s')
+                },
             }
         }
+    )
+    tags = {'benchmark', 'diagnostic'}
 
-    @rfm.run_before('run')
-    def set_exec_opts(self):
-        self.num_cpus = self.system_num_cpus[self.current_partition.fullname]
-
-        # 64-byte stride, using 1/8 of the cacheline
-        self.executable_opts = ['100000000', '8', '%s' % self.num_cpus]
-
-
-@rfm.simple_test
-class StridedBandwidthTest128(StridedBase):
-    def __init__(self):
-        super().__init__()
-
-        self.reference = {
-            'dom:gpu': {
-                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
-            },
-            'dom:mc': {
-                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
-            },
-            'daint:gpu': {
-                'bandwidth': (4.5, -0.1, 0.2, 'GB/s')
-            },
-            'daint:mc': {
-                'bandwidth': (9.1, -0.1, 0.2, 'GB/s')
-            },
-            'eiger:mc': {
-                'bandwidth': (33, -0.1, 0.2, 'GB/s')
-            },
-        }
-
-    @rfm.run_before('run')
-    def set_exec_opts(self):
+    @rfm.run_after('setup')
+    def set_num_cpus(self):
         self.num_cpus = self.system_num_cpus[self.current_partition.fullname]
 
-        # 128-byte stride, using 1/8 of every 2nd cacheline
-        self.executable_opts = ['100000000', '16', '%s' % self.num_cpus]
+    @rfm.run_before('performance')
+    def set_references(self):
+        self.reference = self.reference_per_stride[self.stride]
diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
new file mode 100644
index 0000000000..64ffa378c0
--- /dev/null
+++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
@@ -0,0 +1,74 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+__all__ = ['StridedBandwidth']
+
+
+class StridedBandwidth(rfm.RegressionTest, pin_prefix=True):
+    '''Strided bandwidth benchmark.
+
+    The executable takes three required arguments. These are the buffer size
+    (in bytes), the stride (in multiples of 8 bytes) and the number of threads
+    to run this application with.
+
+    The performance stage measures the bandwidth in GB/s.
+    '''
+
+    #: Parameter that controls the stride access pattern.
+    #: This parameter must be overridden by the derived class.
+    #:
+    #: :default: ``()``
+    stride = parameter()
+
+    #: Set the number of cpus per node.
+    #:
+    #: :default: ``required``
+    num_cpus = variable(int)
+
+    sourcepath = 'strides.cpp'
+    build_system = 'SingleSource'
+    num_tasks = 0
+    num_tasks_per_node = 1
+    reference = {
+        '*': {
+            'bandwidth': (None, None, None, 'GB/s')
+        }
+    }
+    maintainers = ['SK']
+
+    @rfm.run_before('run')
+    def set_exec_opts(self):
+        '''Set the exec options.
+
+        In order, these are the buffer size, stride and number of threads. See
+        the main docstring above for more info.
+        '''
+        self.executable_opts = ['100000000', f'{self.stride}', f'{self.num_cpus}']
+
+    @rfm.run_before('sanity')
+    def set_sanity_patterns(self):
+        ''' Assert that the bandwidth is reported for all the tasks.'''
+
+        self.sanity_patterns = sn.assert_eq(
+            sn.count(sn.findall(r'bandwidth:', self.stdout)),
+            self.job.num_tasks
+        )
+
+    @rfm.run_before('performance')
+    def set_perf_patterns(self):
+        '''Extract the min bandwidth as a performance metric.'''
+
+        self.perf_patterns = {
+            'bandwidth': sn.min(
+                sn.extractall(
+                    r'bandwidth: (?P<bw>\S+) GB/s',
+                    self.stdout, 'bw', float
+                )
+            )
+        }
diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp
similarity index 100%
rename from cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp
rename to hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp

From db483281a2fe531dca17034ace3ba9f90f9a222a Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Thu, 27 May 2021 17:08:12 +0200
Subject: [PATCH 02/20] Port strided bandwidth check to a64fx

---
 config/cscs.py                                | 15 +++++++++-
 .../cpu/strided_bandwidth/strides.py          | 29 +++++++++++++++----
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/config/cscs.py b/config/cscs.py
index 6bb5778a33..c53e062ba0 100644
--- a/config/cscs.py
+++ b/config/cscs.py
@@ -39,7 +39,8 @@
                     ],
                     'environs': [
                         'builtin',
-                        'PrgEnv-gnu'
+                        'PrgEnv-gnu',
+                        'PrgEnv-fujitsu'
                     ],
                     'descr': 'Fujitsu A64FX CPUs',
                     'max_jobs': 100,
@@ -815,6 +816,18 @@
             'cxx': 'mpicxx',
             'ftn': 'mpif90'
         },
+        {
+            'name': 'PrgEnv-fujitsu',
+            'target_systems': [
+                'ault'
+            ],
+            'modules': [
+                'a64fxsdk'
+            ],
+            'cc': 'mpifccpx',
+            'cxx': 'mpiFCCpx',
+            'ftn': 'mpifrtpx'
+        },
         {
             'name': 'builtin',
             'target_systems': [
diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
index 5be27582b0..3428cce169 100644
--- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
+++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import reframe as rfm
-import reframe.utility.sanity as sn
 
 from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth
 
@@ -30,8 +29,15 @@ class strided_bandwidth_check(StridedBandwidth):
     stride = parameter([1, 8, 16])
 
     valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
-                     'eiger:mc', 'pilatus:mc']
+                     'eiger:mc', 'pilatus:mc', 'ault:a64fx']
     valid_prog_environs = ['PrgEnv-gnu']
+
+    @rfm.run_after('init')
+    def set_valid_systems(self):
+        cp = self.current_system.name
+        if cp == 'ault':
+            self.valid_prog_environs = ['PrgEnv-fujitsu']
+
     system_num_cpus = variable(
         dict, value={
             'daint:mc':  72,
@@ -39,7 +45,8 @@ class strided_bandwidth_check(StridedBandwidth):
             'dom:mc':  72,
             'dom:gpu': 24,
             'eiger:mc': 128,
-            'pilatus:mc': 128
+            'pilatus:mc': 128,
+            'ault:a64fx': 48,
         }
     )
     reference_per_stride = variable(
@@ -62,7 +69,10 @@ class strided_bandwidth_check(StridedBandwidth):
                 },
                 'pilatus:mc': {
                     'bandwidth': (270, -0.1, 0.1, 'GB/s')
-                }
+                },
+                'ault:a64fx': {
+                    'bandwidth': (50, -0.1, 0.1, 'GB/s')
+                },
             },
             8: {
                 'dom:gpu': {
@@ -82,7 +92,10 @@ class strided_bandwidth_check(StridedBandwidth):
                 },
                 'pilatus:mc': {
                     'bandwidth': (33, -0.1, 0.2, 'GB/s')
-                }
+                },
+                'ault:a64fx': {
+                    'bandwidth': (45, -0.1, 0.1, 'GB/s')
+                },
             },
             16: {
                 'dom:gpu': {
@@ -100,6 +113,12 @@ class strided_bandwidth_check(StridedBandwidth):
                 'eiger:mc': {
                     'bandwidth': (33, -0.1, 0.2, 'GB/s')
                 },
+                'pilatus:mc': {
+                    'bandwidth': (33, -0.1, 0.2, 'GB/s')
+                },
+                'ault:a64fx': {
+                    'bandwidth': (25, -0.1, 0.1, 'GB/s')
+                },
             }
         }
     )

From a1873475b97c0db1c7fa4229ac3ece4a4f42bbee Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Mon, 31 May 2021 13:42:53 +0200
Subject: [PATCH 03/20] Port stream benchmark to the hpctestlib

---
 .../microbenchmarks/cpu/stream/stream.py      | 100 +++++++-----------
 .../microbenchmarks/cpu/stream/__init__.py    |  61 +++++++++++
 .../microbenchmarks/cpu/stream/src/stream.c   |   0
 3 files changed, 102 insertions(+), 59 deletions(-)
 create mode 100644 hpctestlib/microbenchmarks/cpu/stream/__init__.py
 rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/stream/src/stream.c (100%)

diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py
index eb2b609582..8180fe8738 100644
--- a/cscs-checks/microbenchmarks/cpu/stream/stream.py
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py
@@ -6,70 +6,40 @@
 import reframe as rfm
 import reframe.utility.sanity as sn
 
+from hpctestlib.microbenchmark.cpu.stream import Stream
 
-@rfm.simple_test
-class StreamTest(rfm.RegressionTest):
-    '''This test checks the stream test:
-       Function    Best Rate MB/s  Avg time     Min time     Max time
-       Triad:          13991.7     0.017174     0.017153     0.017192
-    '''
-
-    def __init__(self):
-        self.descr = 'STREAM Benchmark'
-        self.exclusive_access = True
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']
-        self.valid_prog_environs = ['PrgEnv-cray', 'PrgEnv-gnu',
-                                    'PrgEnv-intel', 'PrgEnv-pgi',
-                                    'PrgEnv-cray_classic']
-
-        self.use_multithreading = False
 
-        self.prgenv_flags = {
-            'PrgEnv-cray_classic': ['-homp', '-O3'],
+@rfm.simple_test
+class stream_check(Stream):
+    valid_systems = [
+        'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+        'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn'
+    ]
+    valid_prog_environs = [
+        'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi'
+    ]
+    prgenv_flags = variable(
+        dict, value={
             'PrgEnv-cray': ['-fopenmp', '-O3'],
             'PrgEnv-gnu': ['-fopenmp', '-O3'],
             'PrgEnv-intel': ['-qopenmp', '-O3'],
             'PrgEnv-pgi': ['-mp', '-O3']
         }
-
-        if self.current_system.name in ['arolla', 'tsa']:
-            self.exclusive_access = True
-            self.valid_prog_environs = ['PrgEnv-gnu']
-
-        self.sourcepath = 'stream.c'
-        self.build_system = 'SingleSource'
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.stream_cpus_per_task = {
+    )
+    stream_cpus_per_task = variable(
+        dict, value={
             'arolla:cn': 16,
             'arolla:pn': 16,
             'daint:gpu': 12,
             'daint:mc': 36,
             'dom:gpu': 12,
             'dom:mc': 36,
-            'leone:normal': 16,
-            'monch:compute': 20,
             'tsa:cn': 16,
             'tsa:pn': 16,
         }
-        self.variables = {
-            'OMP_PLACES': 'threads',
-            'OMP_PROC_BIND': 'spread'
-        }
-        self.sanity_patterns = sn.assert_found(
-            r'Solution Validates: avg error less than', self.stdout)
-        self.perf_patterns = {
-            'triad': sn.extractsingle(r'Triad:\s+(?P<triad>\S+)\s+\S+',
-                                      self.stdout, 'triad', float)
-        }
-        self.stream_bw_reference = {
-            'PrgEnv-cray_classic': {
-                'daint:gpu': {'triad': (57000, -0.05, None, 'MB/s')},
-                'daint:mc': {'triad': (117000, -0.05, None, 'MB/s')},
-                'dom:gpu': {'triad': (57000, -0.05, None, 'MB/s')},
-                'dom:mc': {'triad': (117000, -0.05, None, 'MB/s')},
-            },
+    )
+    stream_bw_reference = variable(
+        dict, value={
             'PrgEnv-cray': {
                 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')},
                 'daint:mc': {'triad': (89000, -0.05, None, 'MB/s')},
@@ -95,21 +65,33 @@ def __init__(self):
                 'dom:mc': {'triad': (88500, -0.05, None, 'MB/s')},
             }
         }
-        self.tags = {'production', 'craype'}
-        self.maintainers = ['RS', 'SK']
+    )
+    num_tasks = 1
+    tags = {'production', 'craype'}
+
+    @rfm.run_after('init')
+    def filter_valid_prog_environs(self):
+        if self.current_system.name in ['arolla', 'tsa']:
+            self.valid_prog_environs = ['PrgEnv-gnu']
 
-    @rfm.run_after('setup')
-    def prepare_test(self):
+    @rfm.run_after('init')
+    def set_num_cpus_per_task(self):
         self.num_cpus_per_task = self.stream_cpus_per_task.get(
-            self.current_partition.fullname, 1)
-        self.variables['OMP_NUM_THREADS'] = str(self.num_cpus_per_task)
-        envname = self.current_environ.name
+            self.current_partition.fullname, required
+        )
 
+    @rfm.run_before('compile')
+    def set_compiler_flags(self):
+        envname = self.current_environ.name
         self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3'])
-        if envname == 'PrgEnv-pgi':
+
+    @rfm.run_before('run')
+    def set_env_vars(self):
+        if self.current_environ.name == 'PrgEnv-pgi':
             self.variables['OMP_PROC_BIND'] = 'true'
 
-        try:
+    @rfm.run_before('performance')
+    def set_perf_references(self):
+        envname = self.current_environ.name
+        if envname in self.stream_bw_reference:
             self.reference = self.stream_bw_reference[envname]
-        except KeyError:
-            self.reference = self.stream_bw_reference['PrgEnv-gnu']
diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
new file mode 100644
index 0000000000..a112e720df
--- /dev/null
+++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
@@ -0,0 +1,61 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+__all__ = ['Stream']
+
+
+class Stream(rfm.RegressionTest, pin_prefix=True):
+    '''This test checks the stream test:
+       Function    Best Rate MB/s  Avg time     Min time     Max time
+       Triad:          13991.7     0.017174     0.017153     0.017192
+    '''
+
+    descr = 'STREAM Benchmark'
+    exclusive_access = True
+    use_multithreading = False
+    sourcepath = 'stream.c'
+    build_system = 'SingleSource'
+    num_tasks_per_node = 1
+    variables = {
+        'OMP_PLACES': 'threads',
+        'OMP_PROC_BIND': 'spread'
+    }
+
+    num_tasks = required
+    num_cpus_per_task = required
+
+    reference = {
+        '*': {
+            'triad': (None, None, None, 'MB/s')
+        }
+    }
+    maintainers = ['RS', 'SK']
+
+    @rfm.run_before('run')
+    def set_omp_num_threads(self):
+        '''Set the number of OMP threads to ``num_cpus_per_task``.'''
+        self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}'
+
+    @rfm.run_before('sanity')
+    def set_sanity_patterns(self):
+        '''Set sanity patterns to check the error threshold.'''
+
+        self.sanity_patterns = sn.assert_found(
+            r'Solution Validates: avg error less than', self.stdout
+        )
+
+    @rfm.run_before('performance')
+    def set_performance_patterns(self):
+        '''Set performance to track the triad bandwidth.'''
+
+        self.perf_patterns = {
+            'triad': sn.min(sn.extractall(
+                r'Triad:\s+(?P<triad>\S+)\s+\S+', self.stdout, 'triad', float
+            ))
+        }
diff --git a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c b/hpctestlib/microbenchmarks/cpu/stream/src/stream.c
similarity index 100%
rename from cscs-checks/microbenchmarks/cpu/stream/src/stream.c
rename to hpctestlib/microbenchmarks/cpu/stream/src/stream.c

From 363d42ff90fb4befc4bcdc9354dceed97f09ac56 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Mon, 31 May 2021 15:53:15 +0200
Subject: [PATCH 04/20] Bugfix stream tests

---
 .../microbenchmarks/cpu/stream/stream.py      |  31 +-
 .../microbenchmarks/cpu/stream/__init__.py    |  37 +-
 .../microbenchmarks/cpu/stream/src/stream.c   | 585 ------------------
 .../cpu/strided_bandwidth/__init__.py         |   7 +-
 4 files changed, 59 insertions(+), 601 deletions(-)
 delete mode 100644 hpctestlib/microbenchmarks/cpu/stream/src/stream.c

diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py
index 8180fe8738..be156ece8b 100644
--- a/cscs-checks/microbenchmarks/cpu/stream/stream.py
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py
@@ -6,11 +6,13 @@
 import reframe as rfm
 import reframe.utility.sanity as sn
 
-from hpctestlib.microbenchmark.cpu.stream import Stream
+from hpctestlib.microbenchmarks.cpu.stream import Stream
 
 
 @rfm.simple_test
 class stream_check(Stream):
+    '''Stream benchmark test.'''
+
     valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
         'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn'
@@ -38,7 +40,7 @@ class stream_check(Stream):
             'tsa:pn': 16,
         }
     )
-    stream_bw_reference = variable(
+    triad_reference = variable(
         dict, value={
             'PrgEnv-cray': {
                 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')},
@@ -71,27 +73,44 @@ class stream_check(Stream):
 
     @rfm.run_after('init')
     def filter_valid_prog_environs(self):
+        '''Special conditions for arolla and tsa.'''
         if self.current_system.name in ['arolla', 'tsa']:
             self.valid_prog_environs = ['PrgEnv-gnu']
 
-    @rfm.run_after('init')
+    @rfm.run_after('setup')
     def set_num_cpus_per_task(self):
+        '''If partition not in ``stream_cpus_per_task``, leave as required.'''
         self.num_cpus_per_task = self.stream_cpus_per_task.get(
-            self.current_partition.fullname, required
+            self.current_partition.fullname, self.required
         )
 
     @rfm.run_before('compile')
     def set_compiler_flags(self):
+        '''Set build flags for the different environments.'''
         envname = self.current_environ.name
         self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3'])
 
     @rfm.run_before('run')
     def set_env_vars(self):
+        '''Special environment treatment for the PrgEnv-pgi.'''
         if self.current_environ.name == 'PrgEnv-pgi':
             self.variables['OMP_PROC_BIND'] = 'true'
 
     @rfm.run_before('performance')
     def set_perf_references(self):
+        '''Set performance refs as defined in ``triad_reference``.
+
+        All other perf vars are left as default.
+        '''
+
         envname = self.current_environ.name
-        if envname in self.stream_bw_reference:
-            self.reference = self.stream_bw_reference[envname]
+        if envname in self.triad_reference:
+            extra_refs = {
+                '*': {
+                    'scale': (None, None, None, 'MB/s'),
+                    'add': (None, None, None, 'MB/s'),
+                    'copy': (None, None, None, 'MB/s'),
+                }
+            }
+            self.reference = self.triad_reference[envname]
+            self.reference.update(extra_refs)
diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
index a112e720df..b4ce530dce 100644
--- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
@@ -11,14 +11,25 @@
 
 
 class Stream(rfm.RegressionTest, pin_prefix=True):
-    '''This test checks the stream test:
-       Function    Best Rate MB/s  Avg time     Min time     Max time
-       Triad:          13991.7     0.017174     0.017153     0.017192
+    '''Stream benchmark.
+
+    For info on the executable, see the executable sources.
+
+    Derived tests must set the variables ``num_tasks`` and
+    ``num_cpus_per_task``.
+
     '''
 
+    # Required variables
+    num_tasks = required
+    num_cpus_per_task = required
+
     descr = 'STREAM Benchmark'
     exclusive_access = True
     use_multithreading = False
+    prebuild_cmds = [
+        'wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c',
+    ]
     sourcepath = 'stream.c'
     build_system = 'SingleSource'
     num_tasks_per_node = 1
@@ -26,13 +37,12 @@ class Stream(rfm.RegressionTest, pin_prefix=True):
         'OMP_PLACES': 'threads',
         'OMP_PROC_BIND': 'spread'
     }
-
-    num_tasks = required
-    num_cpus_per_task = required
-
     reference = {
         '*': {
-            'triad': (None, None, None, 'MB/s')
+            'triad': (None, None, None, 'MB/s'),
+            'add': (None, None, None, 'MB/s'),
+            'copy': (None, None, None, 'MB/s'),
+            'scale': (None, None, None, 'MB/s')
         }
     }
     maintainers = ['RS', 'SK']
@@ -57,5 +67,14 @@ def set_performance_patterns(self):
         self.perf_patterns = {
             'triad': sn.min(sn.extractall(
                 r'Triad:\s+(?P<triad>\S+)\s+\S+', self.stdout, 'triad', float
-            ))
+            )),
+            'add': sn.min(sn.extractall(
+                r'Add:\s+(?P<add>\S+)\s+\S+', self.stdout, 'add', float
+            )),
+            'copy': sn.min(sn.extractall(
+                r'Copy:\s+(?P<copy>\S+)\s+\S+', self.stdout, 'copy', float
+            )),
+            'scale': sn.min(sn.extractall(
+                r'Scale:\s+(?P<scale>\S+)\s+\S+', self.stdout, 'scale', float
+            )),
         }
diff --git a/hpctestlib/microbenchmarks/cpu/stream/src/stream.c b/hpctestlib/microbenchmarks/cpu/stream/src/stream.c
deleted file mode 100644
index b9a2cee3b2..0000000000
--- a/hpctestlib/microbenchmarks/cpu/stream/src/stream.c
+++ /dev/null
@@ -1,585 +0,0 @@
-/*-----------------------------------------------------------------------*/
-/* Program: STREAM                                                       */
-/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
-/* Original code developed by John D. McCalpin                           */
-/* Programmers: John D. McCalpin                                         */
-/*              Joe R. Zagar                                             */
-/*                                                                       */
-/* This program measures memory transfer rates in MB/s for simple        */
-/* computational kernels coded in C.                                     */
-/*-----------------------------------------------------------------------*/
-/* Copyright 1991-2013: John D. McCalpin                                 */
-/*-----------------------------------------------------------------------*/
-/* License:                                                              */
-/*  1. You are free to use this program and/or to redistribute           */
-/*     this program.                                                     */
-/*  2. You are free to modify this program for your own use,             */
-/*     including commercial use, subject to the publication              */
-/*     restrictions in item 3.                                           */
-/*  3. You are free to publish results obtained from running this        */
-/*     program, or from works that you derive from this program,         */
-/*     with the following limitations:                                   */
-/*     3a. In order to be referred to as "STREAM benchmark results",     */
-/*         published results must be in conformance to the STREAM        */
-/*         Run Rules, (briefly reviewed below) published at              */
-/*         http://www.cs.virginia.edu/stream/ref.html                    */
-/*         and incorporated herein by reference.                         */
-/*         As the copyright holder, John McCalpin retains the            */
-/*         right to determine conformity with the Run Rules.             */
-/*     3b. Results based on modified source code or on runs not in       */
-/*         accordance with the STREAM Run Rules must be clearly          */
-/*         labelled whenever they are published.  Examples of            */
-/*         proper labelling include:                                     */
-/*           "tuned STREAM benchmark results"                            */
-/*           "based on a variant of the STREAM benchmark code"           */
-/*         Other comparable, clear, and reasonable labelling is          */
-/*         acceptable.                                                   */
-/*     3c. Submission of results to the STREAM benchmark web site        */
-/*         is encouraged, but not required.                              */
-/*  4. Use of this program or creation of derived works based on this    */
-/*     program constitutes acceptance of these licensing restrictions.   */
-/*  5. Absolutely no warranty is expressed or implied.                   */
-/*-----------------------------------------------------------------------*/
-# include <stdio.h>
-# include <unistd.h>
-# include <math.h>
-# include <float.h>
-# include <limits.h>
-# include <sys/time.h>
-
-/*-----------------------------------------------------------------------
- * INSTRUCTIONS:
- *
- *	1) STREAM requires different amounts of memory to run on different
- *           systems, depending on both the system cache size(s) and the
- *           granularity of the system timer.
- *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
- *           to meet *both* of the following criteria:
- *       (a) Each array must be at least 4 times the size of the
- *           available cache memory. I don't worry about the difference
- *           between 10^6 and 2^20, so in practice the minimum array size
- *           is about 3.8 times the cache size.
- *           Example 1: One Xeon E3 with 8 MB L3 cache
- *               STREAM_ARRAY_SIZE should be >= 4 million, giving
- *               an array size of 30.5 MB and a total memory requirement
- *               of 91.5 MB.  
- *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
- *               STREAM_ARRAY_SIZE should be >= 20 million, giving
- *               an array size of 153 MB and a total memory requirement
- *               of 458 MB.  
- *       (b) The size should be large enough so that the 'timing calibration'
- *           output by the program is at least 20 clock-ticks.  
- *           Example: most versions of Windows have a 10 millisecond timer
- *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
- *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
- *               This means the each array must be at least 1 GB, or 128M elements.
- *
- *      Version 5.10 increases the default array size from 2 million
- *          elements to 10 million elements in response to the increasing
- *          size of L3 caches.  The new default size is large enough for caches
- *          up to 20 MB. 
- *      Version 5.10 changes the loop index variables from "register int"
- *          to "ssize_t", which allows array indices >2^32 (4 billion)
- *          on properly configured 64-bit systems.  Additional compiler options
- *          (such as "-mcmodel=medium") may be required for large memory runs.
- *
- *      Array size can be set at compile time without modifying the source
- *          code for the (many) compilers that support preprocessor definitions
- *          on the compile line.  E.g.,
- *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
- *          will override the default size of 10M with a new size of 100M elements
- *          per array.
- */
-#ifndef STREAM_ARRAY_SIZE
-#   define STREAM_ARRAY_SIZE	10000000
-#endif
-
-/*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
- *         for any iteration after the first, therefore the minimum value
- *         for NTIMES is 2.
- *      There are no rules on maximum allowable values for NTIMES, but
- *         values larger than the default are unlikely to noticeably
- *         increase the reported performance.
- *      NTIMES can also be set on the compile line without changing the source
- *         code using, for example, "-DNTIMES=7".
- */
-#ifdef NTIMES
-#if NTIMES<=1
-#   define NTIMES	10
-#endif
-#endif
-#ifndef NTIMES
-#   define NTIMES	10
-#endif
-
-/*  Users are allowed to modify the "OFFSET" variable, which *may* change the
- *         relative alignment of the arrays (though compilers may change the 
- *         effective offset by making the arrays non-contiguous on some systems). 
- *      Use of non-zero values for OFFSET can be especially helpful if the
- *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
- *      OFFSET can also be set on the compile line without changing the source
- *         code using, for example, "-DOFFSET=56".
- */
-#ifndef OFFSET
-#   define OFFSET	0
-#endif
-
-/*
- *	3) Compile the code with optimization.  Many compilers generate
- *       unreasonably bad code before the optimizer tightens things up.  
- *     If the results are unreasonably good, on the other hand, the
- *       optimizer might be too smart for me!
- *
- *     For a simple single-core version, try compiling with:
- *            cc -O stream.c -o stream
- *     This is known to work on many, many systems....
- *
- *     To use multiple cores, you need to tell the compiler to obey the OpenMP
- *       directives in the code.  This varies by compiler, but a common example is
- *            gcc -O -fopenmp stream.c -o stream_omp
- *       The environment variable OMP_NUM_THREADS allows runtime control of the 
- *         number of threads/cores used when the resulting "stream_omp" program
- *         is executed.
- *
- *     To run with single-precision variables and arithmetic, simply add
- *         -DSTREAM_TYPE=float
- *     to the compile line.
- *     Note that this changes the minimum array sizes required --- see (1) above.
- *
- *     The preprocessor directive "TUNED" does not do much -- it simply causes the 
- *       code to call separate functions to execute each kernel.  Trivial versions
- *       of these functions are provided, but they are *not* tuned -- they just 
- *       provide predefined interfaces to be replaced with tuned code.
- *
- *
- *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
- *	   Be sure to include info that will help me understand:
- *		a) the computer hardware configuration (e.g., processor model, memory type)
- *		b) the compiler name/version and compilation flags
- *      c) any run-time information (such as OMP_NUM_THREADS)
- *		d) all of the output from the test case.
- *
- * Thanks!
- *
- *-----------------------------------------------------------------------*/
-
-# define HLINE "-------------------------------------------------------------\n"
-
-# ifndef MIN
-# define MIN(x,y) ((x)<(y)?(x):(y))
-# endif
-# ifndef MAX
-# define MAX(x,y) ((x)>(y)?(x):(y))
-# endif
-
-#ifndef STREAM_TYPE
-#define STREAM_TYPE double
-#endif
-
-static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],
-			b[STREAM_ARRAY_SIZE+OFFSET],
-			c[STREAM_ARRAY_SIZE+OFFSET];
-
-static double	avgtime[4] = {0}, maxtime[4] = {0},
-		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
-
-static char	*label[4] = {"Copy:      ", "Scale:     ",
-    "Add:       ", "Triad:     "};
-
-static double	bytes[4] = {
-    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
-    2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
-    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
-    3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
-    };
-
-extern double mysecond();
-extern void checkSTREAMresults();
-#ifdef TUNED
-extern void tuned_STREAM_Copy();
-extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
-extern void tuned_STREAM_Add();
-extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
-#endif
-#ifdef _OPENMP
-extern int omp_get_num_threads();
-#endif
-int
-main()
-    {
-    int			quantum, checktick();
-    int			BytesPerWord;
-    int			k;
-    ssize_t		j;
-    STREAM_TYPE		scalar;
-    double		t, times[4][NTIMES];
-
-    /* --- SETUP --- determine precision and check timing --- */
-
-    printf(HLINE);
-    printf("STREAM version $Revision: 5.10 $\n");
-    printf(HLINE);
-    BytesPerWord = sizeof(STREAM_TYPE);
-    printf("This system uses %d bytes per array element.\n",
-	BytesPerWord);
-
-    printf(HLINE);
-#ifdef N
-    printf("*****  WARNING: ******\n");
-    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
-    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
-    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
-    printf("*****  WARNING: ******\n");
-#endif
-
-    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
-    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
-	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
-	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
-    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
-	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
-	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
-    printf("Each kernel will be executed %d times.\n", NTIMES);
-    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
-    printf(" will be used to compute the reported bandwidth.\n");
-
-#ifdef _OPENMP
-    printf(HLINE);
-#pragma omp parallel 
-    {
-#pragma omp master
-	{
-	    k = omp_get_num_threads();
-	    printf ("Number of Threads requested = %i\n",k);
-        }
-    }
-#endif
-
-#ifdef _OPENMP
-	k = 0;
-#pragma omp parallel
-#pragma omp atomic 
-		k++;
-    printf ("Number of Threads counted = %i\n",k);
-#endif
-
-    /* Get initial value for system clock. */
-#pragma omp parallel for
-    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
-	    a[j] = 1.0;
-	    b[j] = 2.0;
-	    c[j] = 0.0;
-	}
-
-    printf(HLINE);
-
-    if  ( (quantum = checktick()) >= 1) 
-	printf("Your clock granularity/precision appears to be "
-	    "%d microseconds.\n", quantum);
-    else {
-	printf("Your clock granularity appears to be "
-	    "less than one microsecond.\n");
-	quantum = 1;
-    }
-
-    t = mysecond();
-#pragma omp parallel for
-    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
-		a[j] = 2.0E0 * a[j];
-    t = 1.0E6 * (mysecond() - t);
-
-    printf("Each test below will take on the order"
-	" of %d microseconds.\n", (int) t  );
-    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
-    printf("Increase the size of the arrays if this shows that\n");
-    printf("you are not getting at least 20 clock ticks per test.\n");
-
-    printf(HLINE);
-
-    printf("WARNING -- The above is only a rough guideline.\n");
-    printf("For best results, please be sure you know the\n");
-    printf("precision of your system timer.\n");
-    printf(HLINE);
-    
-    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
-
-    scalar = 3.0;
-    for (k=0; k<NTIMES; k++)
-	{
-	times[0][k] = mysecond();
-#ifdef TUNED
-        tuned_STREAM_Copy();
-#else
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    c[j] = a[j];
-#endif
-	times[0][k] = mysecond() - times[0][k];
-	
-	times[1][k] = mysecond();
-#ifdef TUNED
-        tuned_STREAM_Scale(scalar);
-#else
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    b[j] = scalar*c[j];
-#endif
-	times[1][k] = mysecond() - times[1][k];
-	
-	times[2][k] = mysecond();
-#ifdef TUNED
-        tuned_STREAM_Add();
-#else
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    c[j] = a[j]+b[j];
-#endif
-	times[2][k] = mysecond() - times[2][k];
-	
-	times[3][k] = mysecond();
-#ifdef TUNED
-        tuned_STREAM_Triad(scalar);
-#else
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    a[j] = b[j]+scalar*c[j];
-#endif
-	times[3][k] = mysecond() - times[3][k];
-	}
-
-    /*	--- SUMMARY --- */
-
-    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
-	{
-	for (j=0; j<4; j++)
-	    {
-	    avgtime[j] = avgtime[j] + times[j][k];
-	    mintime[j] = MIN(mintime[j], times[j][k]);
-	    maxtime[j] = MAX(maxtime[j], times[j][k]);
-	    }
-	}
-    
-    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
-    for (j=0; j<4; j++) {
-		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
-
-		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
-	       1.0E-06 * bytes[j]/mintime[j],
-	       avgtime[j],
-	       mintime[j],
-	       maxtime[j]);
-    }
-    printf(HLINE);
-
-    /* --- Check Results --- */
-    checkSTREAMresults();
-    printf(HLINE);
-
-    return 0;
-}
-
-# define	M	20
-
-int
-checktick()
-    {
-    int		i, minDelta, Delta;
-    double	t1, t2, timesfound[M];
-
-/*  Collect a sequence of M unique time values from the system. */
-
-    for (i = 0; i < M; i++) {
-	t1 = mysecond();
-	while( ((t2=mysecond()) - t1) < 1.0E-6 )
-	    ;
-	timesfound[i] = t1 = t2;
-	}
-
-/*
- * Determine the minimum difference between these M values.
- * This result will be our estimate (in microseconds) for the
- * clock granularity.
- */
-
-    minDelta = 1000000;
-    for (i = 1; i < M; i++) {
-	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
-	minDelta = MIN(minDelta, MAX(Delta,0));
-	}
-
-   return(minDelta);
-    }
-
-
-
-/* A gettimeofday routine to give access to the wall
-   clock timer on most UNIX-like systems.  */
-
-#include <sys/time.h>
-
-double mysecond()
-{
-        struct timeval tp;
-        struct timezone tzp;
-        int i;
-
-        i = gettimeofday(&tp,&tzp);
-        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
-}
-
-#ifndef abs
-#define abs(a) ((a) >= 0 ? (a) : -(a))
-#endif
-void checkSTREAMresults ()
-{
-	STREAM_TYPE aj,bj,cj,scalar;
-	STREAM_TYPE aSumErr,bSumErr,cSumErr;
-	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
-	double epsilon;
-	ssize_t	j;
-	int	k,ierr,err;
-
-    /* reproduce initialization */
-	aj = 1.0;
-	bj = 2.0;
-	cj = 0.0;
-    /* a[] is modified during timing check */
-	aj = 2.0E0 * aj;
-    /* now execute timing loop */
-	scalar = 3.0;
-	for (k=0; k<NTIMES; k++)
-        {
-            cj = aj;
-            bj = scalar*cj;
-            cj = aj+bj;
-            aj = bj+scalar*cj;
-        }
-
-    /* accumulate deltas between observed and expected results */
-	aSumErr = 0.0;
-	bSumErr = 0.0;
-	cSumErr = 0.0;
-	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
-		aSumErr += abs(a[j] - aj);
-		bSumErr += abs(b[j] - bj);
-		cSumErr += abs(c[j] - cj);
-		// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj);	// MCCALPIN
-	}
-	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
-	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
-	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
-
-	if (sizeof(STREAM_TYPE) == 4) {
-		epsilon = 1.e-6;
-	}
-	else if (sizeof(STREAM_TYPE) == 8) {
-		epsilon = 1.e-13;
-	}
-	else {
-		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
-		epsilon = 1.e-6;
-	}
-
-	err = 0;
-	if (abs(aAvgErr/aj) > epsilon) {
-		err++;
-		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
-		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
-		ierr = 0;
-		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
-			if (abs(a[j]/aj-1.0) > epsilon) {
-				ierr++;
-#ifdef VERBOSE
-				if (ierr < 10) {
-					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
-						j,aj,a[j],abs((aj-a[j])/aAvgErr));
-				}
-#endif
-			}
-		}
-		printf("     For array a[], %d errors were found.\n",ierr);
-	}
-	if (abs(bAvgErr/bj) > epsilon) {
-		err++;
-		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
-		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
-		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
-		ierr = 0;
-		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
-			if (abs(b[j]/bj-1.0) > epsilon) {
-				ierr++;
-#ifdef VERBOSE
-				if (ierr < 10) {
-					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
-						j,bj,b[j],abs((bj-b[j])/bAvgErr));
-				}
-#endif
-			}
-		}
-		printf("     For array b[], %d errors were found.\n",ierr);
-	}
-	if (abs(cAvgErr/cj) > epsilon) {
-		err++;
-		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
-		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
-		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
-		ierr = 0;
-		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
-			if (abs(c[j]/cj-1.0) > epsilon) {
-				ierr++;
-#ifdef VERBOSE
-				if (ierr < 10) {
-					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
-						j,cj,c[j],abs((cj-c[j])/cAvgErr));
-				}
-#endif
-			}
-		}
-		printf("     For array c[], %d errors were found.\n",ierr);
-	}
-	if (err == 0) {
-		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
-	}
-#ifdef VERBOSE
-	printf ("Results Validation Verbose Results: \n");
-	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
-	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
-	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
-#endif
-}
-
-#ifdef TUNED
-/* stubs for "tuned" versions of the kernels */
-void tuned_STREAM_Copy()
-{
-	ssize_t j;
-#pragma omp parallel for
-        for (j=0; j<STREAM_ARRAY_SIZE; j++)
-            c[j] = a[j];
-}
-
-void tuned_STREAM_Scale(STREAM_TYPE scalar)
-{
-	ssize_t j;
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    b[j] = scalar*c[j];
-}
-
-void tuned_STREAM_Add()
-{
-	ssize_t j;
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    c[j] = a[j]+b[j];
-}
-
-void tuned_STREAM_Triad(STREAM_TYPE scalar)
-{
-	ssize_t j;
-#pragma omp parallel for
-	for (j=0; j<STREAM_ARRAY_SIZE; j++)
-	    a[j] = b[j]+scalar*c[j];
-}
-/* end of stubs for the "tuned" versions of the kernels */
-#endif
diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
index 64ffa378c0..f66c9b2950 100644
--- a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
@@ -17,6 +17,9 @@ class StridedBandwidth(rfm.RegressionTest, pin_prefix=True):
     (in bytes), the stride (in multiples of 8 bytes) and the number of threads
     to run this application with.
 
+    Derived tests must set the parameter ``stride``, and the variables
+    ``num_cpus`` and ``num_tasks``.
+
     The performance stage measures the bandiwdth in GB/s.
     '''
 
@@ -31,9 +34,11 @@ class StridedBandwidth(rfm.RegressionTest, pin_prefix=True):
     #: :default: ``required``
     num_cpus = variable(int)
 
+    # Required variables
+    num_tasks = required
+
     sourcepath = 'strides.cpp'
     build_system = 'SingleSource'
-    num_tasks = 0
     num_tasks_per_node = 1
     reference = {
         '*': {

From d69863652f61b3a5d747687800611d0676681d25 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Mon, 31 May 2021 15:55:39 +0200
Subject: [PATCH 05/20] Remove pin_prefix from stream test

---
 hpctestlib/microbenchmarks/cpu/stream/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
index b4ce530dce..38da66e447 100644
--- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
@@ -10,14 +10,13 @@
 __all__ = ['Stream']
 
 
-class Stream(rfm.RegressionTest, pin_prefix=True):
+class Stream(rfm.RegressionTest):
     '''Stream benchmark.
 
     For info on the executable, see the executable sources.
 
     Derived tests must set the variables ``num_tasks`` and
     ``num_cpus_per_task``.
-
     '''
 
     # Required variables

From 8be9af24be89320cec7dcfff4d731d9b483d2da6 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Mon, 31 May 2021 19:10:08 +0200
Subject: [PATCH 06/20] Port cpu latency to hpctestlib

---
 .../microbenchmarks/cpu/latency/latency.py    | 108 +++++++-----------
 .../microbenchmarks/cpu/latency/__init__.py   |  89 +++++++++++++++
 .../cpu/latency/src/latency.cpp               |   0
 3 files changed, 132 insertions(+), 65 deletions(-)
 create mode 100644 hpctestlib/microbenchmarks/cpu/latency/__init__.py
 rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/latency/src/latency.cpp (100%)

diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py
index fa02138a9b..63221cb1f6 100644
--- a/cscs-checks/microbenchmarks/cpu/latency/latency.py
+++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py
@@ -6,92 +6,70 @@
 import reframe as rfm
 import reframe.utility.sanity as sn
 
+from hpctestlib.microbenchmarks.cpu.latency import CpuLatency
 
-@rfm.simple_test
-class CPULatencyTest(rfm.RegressionTest):
-    def __init__(self):
-        self.sourcepath = 'latency.cpp'
-        self.build_system = 'SingleSource'
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'ault:intel', 'ault:amdvega', 'tave:compute']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.num_tasks = 0
-        self.num_tasks_per_node = 1
-
-        self.build_system.cxxflags = ['-O3']
 
-        self.executable_opts = ['16000', '128000', '8000000', '500000000']
+@rfm.simple_test
+class cpu_latency_check(CpuLatency):
+    buffer_sizes = ['16000', '128000', '8000000', '500000000']
+    num_tasks = 0
+    valid_systems = [
+        'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+        'ault:intel', 'ault:amdvega', 'tave:compute'
+    ]
+    valid_prog_environs = ['PrgEnv-gnu']
+    tags = {'benchmark', 'diagnostic'}
 
+    @rfm.run_after('setup')
+    def set_modules(self):
         if self.current_system.name in {'daint', 'dom'}:
             self.modules = ['craype-hugepages1G']
         if self.current_system.name in {'tave'}:
             self.modules = ['craype-hugepages512M']
 
-        self.sanity_patterns = sn.assert_eq(
-            sn.count(sn.findall(r'latency', self.stdout)),
-            self.num_tasks_assigned * len(self.executable_opts))
-
-        def lat_pattern(index):
-            return sn.extractsingle(
-                r'latency \(ns\) for input size %s: (?P<bw>\S+) clocks' %
-                self.executable_opts[index], self.stdout, 'bw', float)
-
-        self.perf_patterns = {
-            'latencyL1': lat_pattern(0),
-            'latencyL2': lat_pattern(1),
-            'latencyL3': lat_pattern(2),
-            'latencyMem': lat_pattern(3),
-        }
-
+    @rfm.run_before('performance')
+    def set_references(self):
         self.reference = {
             'dom:mc': {
-                'latencyL1':  (1.21, -0.01, 0.26, 'ns'),
-                'latencyL2':  (3.65, -0.01, 0.26, 'ns'),
-                'latencyL3':  (18.83, -0.01, 0.05, 'ns'),
-                'latencyMem': (76.6, -0.01, 0.05, 'ns')
+                'latencyL1': (1.21, -0.01, 0.26, 'ns'),
+                'latencyL2': (3.65, -0.01, 0.26, 'ns'),
+                'latencyL3': (18.83, -0.01, 0.05, 'ns'),
+                'latencyL4': (76.6, -0.01, 0.05, 'ns')
             },
             'dom:gpu': {
-                'latencyL1':  (1.14, -0.01, 0.26, 'ns'),
-                'latencyL2':  (3.44, -0.01, 0.26, 'ns'),
-                'latencyL3':  (15.65, -0.01, 0.05, 'ns'),
-                'latencyMem': (71.7, -0.01, 0.05, 'ns')
+                'latencyL1': (1.14, -0.01, 0.26, 'ns'),
+                'latencyL2': (3.44, -0.01, 0.26, 'ns'),
+                'latencyL3': (15.65, -0.01, 0.05, 'ns'),
+                'latencyL4': (71.7, -0.01, 0.05, 'ns')
             },
             'daint:mc': {
-                'latencyL1':  (1.21, -0.01, 0.26, 'ns'),
-                'latencyL2':  (3.65, -0.01, 0.26, 'ns'),
-                'latencyL3':  (18.83, -0.01, 0.05, 'ns'),
-                'latencyMem': (76.6, -0.01, 0.05, 'ns')
+                'latencyL1': (1.21, -0.01, 0.26, 'ns'),
+                'latencyL2': (3.65, -0.01, 0.26, 'ns'),
+                'latencyL3': (18.83, -0.01, 0.05, 'ns'),
+                'latencyL4': (76.6, -0.01, 0.05, 'ns')
             },
             'daint:gpu': {
-                'latencyL1':  (1.14, -0.01, 0.26, 'ns'),
-                'latencyL2':  (3.44, -0.01, 0.26, 'ns'),
-                'latencyL3':  (15.65, -0.01, 0.05, 'ns'),
-                'latencyMem': (71.7, -0.01, 0.05, 'ns')
+                'latencyL1': (1.14, -0.01, 0.26, 'ns'),
+                'latencyL2': (3.44, -0.01, 0.26, 'ns'),
+                'latencyL3': (15.65, -0.01, 0.05, 'ns'),
+                'latencyL4': (71.7, -0.01, 0.05, 'ns')
             },
             'ault:intel': {
-                'latencyL1':  (1.08, -0.01, 0.26, 'ns'),
-                'latencyL2':  (3.8, -0.01, 0.26, 'ns'),
-                'latencyL3':  (21.5, -0.01, 0.05, 'ns'),
-                'latencyMem': (86.5, -0.01, 0.05, 'ns')
+                'latencyL1': (1.08, -0.01, 0.26, 'ns'),
+                'latencyL2': (3.8, -0.01, 0.26, 'ns'),
+                'latencyL3': (21.5, -0.01, 0.05, 'ns'),
+                'latencyL4': (86.5, -0.01, 0.05, 'ns')
             },
             'ault:amdvega': {
-                'latencyL1':  (1.32, -0.01, 0.26, 'ns'),
-                'latencyL2':  (4.02, -0.01, 0.26, 'ns'),
-                'latencyL3':  (14.4, -0.01, 0.26, 'ns'),
-                'latencyMem': (90.0, -0.01, 0.05, 'ns')
+                'latencyL1': (1.32, -0.01, 0.26, 'ns'),
+                'latencyL2': (4.02, -0.01, 0.26, 'ns'),
+                'latencyL3': (14.4, -0.01, 0.26, 'ns'),
+                'latencyL4': (90.0, -0.01, 0.05, 'ns')
             },
             'tave:compute': {
-                'latencyL1':  (2.86, -0.01, 0.05, 'ns'),
-                'latencyL2':  (12.15, -0.01, 0.05, 'ns'),
-                'latencyL3':  (137, -0.01, 0.05, 'ns'),
-                'latencyMem': (150, -0.05, 0.05, 'ns')
+                'latencyL1': (2.86, -0.01, 0.05, 'ns'),
+                'latencyL2': (12.15, -0.01, 0.05, 'ns'),
+                'latencyL3': (137, -0.01, 0.05, 'ns'),
+                'latencyL4': (150, -0.05, 0.05, 'ns')
             },
         }
-
-        self.maintainers = ['SK']
-        self.tags = {'benchmark', 'diagnostic'}
-
-    @property
-    @sn.sanity_function
-    def num_tasks_assigned(self):
-        return self.job.num_tasks
diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
new file mode 100644
index 0000000000..2ec00d9842
--- /dev/null
+++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
@@ -0,0 +1,89 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+import reframe.utility.typecheck as typ
+
+__all__ = ['CpuLatency']
+
+
+class CpuLatency(rfm.RegressionTest, pin_prefix=True):
+    ''' CPU latency test.
+
+    Derived tests must set the variables ``buffer_sizes`` and ``num_tasks``.
+    The variable ``buffer_sizes`` is a list of the different buffer sizes to
+    be used on this latency test. The executable will return the latency in
+    ``ns`` for each of the buffer sizes specified in this list.
+
+    This test assumes that the list of buffer sizes is provided in increasing
+    order, and this test will automatically extract a performance variable for
+    the latency of each buffer. These performance variables are named
+    ``latencyL1``, ``latencyL2`` and so on in increasing order.
+    '''
+
+    # Required variables
+    buffer_sizes = variable(typ.List[str])
+    num_tasks = required
+
+    sourcepath = 'latency.cpp'
+    build_system = 'SingleSource'
+    num_tasks_per_node = 1
+    maintainers = ['SK', 'JO']
+
+    @rfm.run_before('compile')
+    def set_compiler_flags(self):
+        self.build_system.cxxflags = ['-O3']
+
+    @rfm.run_before('run')
+    def set_exc_opts(self):
+        self.executable_opts = self.buffer_sizes
+
+    @rfm.run_before('sanity')
+    def set_sanity_patterns(self):
+        self.sanity_patterns = sn.assert_eq(
+            sn.count(sn.findall(r'latency \(ns\)', self.stdout)),
+            self.num_tasks*sn.count(self.executable_opts)
+        )
+
+    @sn.sanity_function
+    def get_latency(self, buffer_size):
+        '''Extract the worst latency for a given buffer size.'''
+
+        return sn.max(sn.extractall(
+            r'latency \(ns\) for input size %s: (?P<bw>\S+) clocks' %
+            buffer_size, self.stdout, 'bw', float
+        ))
+
+    @rfm.run_before('performance')
+    def set_references(self):
+        '''Set dummy references to get the perf values in the perf report.
+
+        This will create as many levels as passed in ``buffer_sizes``. Derived
+        test must override this hook if they wish to use their own reference
+        values.
+        '''
+
+        refs = {'*': {}}
+        dummy_ref = (None, None, None, 'ns')
+        for i, buff in enumerate(self.buffer_sizes):
+            level = i+1
+            refs['*'].update({f'latencyL{level}': dummy_ref})
+
+        self.reference = refs
+
+    @rfm.run_before('performance')
+    def set_perf_patterns(self):
+        '''Set the performance patters to extract all latency levels.
+
+        The levels are named from ``L1`` to ``L(n+1)``, where ``n`` is the
+        length of ``buffer_sizes``.
+        '''
+
+        self.perf_patterns = {}
+        for i, buff in enumerate(self.buffer_sizes):
+            level = i+1
+            level_name = f'latencyL{level}'
+            self.perf_patterns.update({level_name: self.get_latency(buff)})
diff --git a/cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp b/hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp
similarity index 100%
rename from cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp
rename to hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp

From c9400450da3db1a56717e3f5c664eb4915ad1cc6 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 11:10:49 +0200
Subject: [PATCH 07/20] Cleanup performance stage

---
 .../microbenchmarks/cpu/latency/__init__.py   | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
index 2ec00d9842..3557af0c04 100644
--- a/hpctestlib/microbenchmarks/cpu/latency/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
@@ -39,10 +39,13 @@ def set_compiler_flags(self):
 
     @rfm.run_before('run')
     def set_exc_opts(self):
+        '''Set the ``buffer_sizes`` as the executable options.'''
         self.executable_opts = self.buffer_sizes
 
     @rfm.run_before('sanity')
     def set_sanity_patterns(self):
+        '''Verify the number of reported latency measurements.'''
+
         self.sanity_patterns = sn.assert_eq(
             sn.count(sn.findall(r'latency \(ns\)', self.stdout)),
             self.num_tasks*sn.count(self.executable_opts)
@@ -61,29 +64,27 @@ def get_latency(self, buffer_size):
     def set_references(self):
         '''Set dummy references to get the perf values in the perf report.
 
-        This will create as many levels as passed in ``buffer_sizes``. Derived
+        This will create as many levels as items in ``buffer_sizes``. Derived
         test must override this hook if they wish to use their own reference
         values.
         '''
 
-        refs = {'*': {}}
-        dummy_ref = (None, None, None, 'ns')
-        for i, buff in enumerate(self.buffer_sizes):
-            level = i+1
-            refs['*'].update({f'latencyL{level}': dummy_ref})
-
-        self.reference = refs
+        self.reference = {
+            '*': {
+                f'latencyL{i+1}': (None, None, None, 'ns')
+                for i, buff in enumerate(self.buffer_sizes)
+            }
+        }
 
     @rfm.run_before('performance')
     def set_perf_patterns(self):
         '''Set the performance patters to extract all latency levels.
 
-        The levels are named from ``L1`` to ``L(n+1)``, where ``n`` is the
+        The levels are named from ``L1`` to ``L{n}``, where ``n`` is the
         length of ``buffer_sizes``.
         '''
 
-        self.perf_patterns = {}
-        for i, buff in enumerate(self.buffer_sizes):
-            level = i+1
-            level_name = f'latencyL{level}'
-            self.perf_patterns.update({level_name: self.get_latency(buff)})
+        self.perf_patterns = {
+            f'latencyL{i+1}': self.get_latency(buff)
+            for i, buff in enumerate(self.buffer_sizes)
+        }

From eef477ae2f63dedd685db7407202377729e55fbe Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 14:08:10 +0200
Subject: [PATCH 08/20] Port CPU DGEMM to hpctestlib

---
 .../microbenchmarks/cpu/dgemm/dgemm.py        | 133 +++++++++---------
 .../microbenchmarks/cpu/dgemm/__init__.py     |  97 +++++++++++++
 .../microbenchmarks/cpu/dgemm/src/dgemm.c     |   0
 3 files changed, 163 insertions(+), 67 deletions(-)
 create mode 100644 hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
 rename {cscs-checks => hpctestlib}/microbenchmarks/cpu/dgemm/src/dgemm.c (100%)

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index 3b9fc82b42..92891282d6 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -5,44 +5,60 @@
 
 import reframe as rfm
 import reframe.utility.sanity as sn
+import reframe.utility.osext as osext
 
+from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm
 
 @rfm.simple_test
-class DGEMMTest(rfm.RegressionTest):
-    def __init__(self):
-        self.descr = 'DGEMM performance test'
-        self.sourcepath = 'dgemm.c'
-        self.sanity_patterns = self.eval_sanity()
+class dgemm_check(Dgemm):
+    '''CSCS DGEMM check.
 
-        # the perf patterns are automaticaly generated inside sanity
-        self.perf_patterns = {}
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-                              'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn',
-                              'eiger:mc', 'pilatus:mc']
+    The matrix dimensions are set in the base class.
+    Every node reports its performance in Gflop/s. To do so, this class
+    overrides the performance patterns and references from the base test.
+    This is done in the ``set_perf_patterns`` pre-performance hook.
+    '''
+
+    valid_systems = [
+        'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
+         'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn',
+         'eiger:mc', 'pilatus:mc'
+    ]
+    num_tasks = 0
+    sys_reference = variable(
+        dict, value={
+            'daint:gpu':  (300.0, -0.15, None, 'Gflop/s'),
+            'daint:mc':   (1040.0, -0.15, None, 'Gflop/s'),
+            'dom:gpu':    (300.0, -0.15, None, 'Gflop/s'),
+            'dom:mc':     (1040.0, -0.15, None, 'Gflop/s'),
+            'eiger:mc':   (3200.0, -0.15, None, 'Gflop/s'),
+            'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'),
+            '*':          (None, None, None, 'Gflop/s'),
+        },
+    )
+    tags = {'benchmark', 'diagnostic', 'craype'}
+
+    @rfm.run_after('init')
+    def set_valid_prog_environs(self):
         if self.current_system.name in ['daint', 'dom']:
             self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel']
         elif self.current_system.name in ['arolla', 'tsa']:
             self.valid_prog_environs = ['PrgEnv-gnu-nompi']
         elif self.current_system.name in ['eiger', 'pilatus']:
             self.valid_prog_environs = ['PrgEnv-gnu']
-        else:
-            self.valid_prog_environs = []
 
-        self.num_tasks = 0
-        self.use_multithreading = False
-        self.executable_opts = ['6144', '12288', '3072']
-        self.build_system = 'SingleSource'
-        self.build_system.cflags = ['-O3']
-        self.sys_reference = {
-            'daint:gpu': (300.0, -0.15, None, 'Gflop/s'),
-            'daint:mc': (1040.0, -0.15, None, 'Gflop/s'),
-            'dom:gpu': (300.0, -0.15, None, 'Gflop/s'),
-            'dom:mc': (1040.0, -0.15, None, 'Gflop/s'),
-            'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'),
-            'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'),
-        }
-        self.maintainers = ['AJ', 'VH']
-        self.tags = {'benchmark', 'diagnostic', 'craype'}
+    @rfm.run_after('setup')
+    def set_num_cpus_per_task(self):
+        if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']:
+            self.num_cpus_per_task = 12
+        elif self.current_partition.fullname in ['daint:mc', 'dom:mc']:
+            self.num_cpus_per_task = 36
+        elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']:
+            self.num_cpus_per_task = 16
+        elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']:
+            self.num_cpus_per_task = 40
+        elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']:
+            self.num_cpus_per_task = 128
 
     @rfm.run_before('compile')
     def setflags(self):
@@ -52,7 +68,7 @@ def setflags(self):
             self.build_system.cppflags = [
                 '-DMKL_ILP64', '-I${MKLROOT}/include'
             ]
-            self.build_system.cflags = ['-qopenmp']
+            self.build_system.cflags += ['-qopenmp']
             self.build_system.ldflags = [
                 '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl'
             ]
@@ -63,46 +79,29 @@ def setflags(self):
             self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas',
                                          '-lpthread', '-lgfortran']
 
-    @rfm.run_before('run')
-    def set_tasks(self):
-        if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']:
-            self.num_cpus_per_task = 12
-        elif self.current_partition.fullname in ['daint:mc', 'dom:mc']:
-            self.num_cpus_per_task = 36
-        elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']:
-            self.num_cpus_per_task = 16
-        elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']:
-            self.num_cpus_per_task = 40
-        elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']:
-            self.num_cpus_per_task = 128
+    @rfm.run_before('performance')
+    def set_perf_patterns(self):
+        '''Override base performance patterns.
 
-        if self.num_cpus_per_task:
-            self.variables = {
-                'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-                'OMP_BIND': 'cores',
-                'OMP_PROC_BIND': 'spread',
-                'OMP_SCHEDULE': 'static'
-            }
+        Set each node as a performance variable reporting the Gflop/s.
+        The ``reference`` values for each node are extracted from the
+        ``sys_reference`` dict.
+        '''
 
-    @sn.sanity_function
-    def eval_sanity(self):
-        all_tested_nodes = sn.evaluate(sn.extractall(
-            r'(?P<hostname>\S+):\s+Time for \d+ DGEMM operations',
-            self.stdout, 'hostname'))
-        num_tested_nodes = len(all_tested_nodes)
-        failure_msg = ('Requested %s node(s), but found %s node(s)' %
-                       (self.job.num_tasks, num_tested_nodes))
-        sn.evaluate(sn.assert_eq(num_tested_nodes, self.job.num_tasks,
-                                 msg=failure_msg))
+        part_name = self.current_partition.fullname
+        with osext.change_dir(self.stagedir):
+            node_names = sn.evaluate(self.get_nodenames())
 
-        for hostname in all_tested_nodes:
-            partition_name = self.current_partition.fullname
-            ref_name = '%s:%s' % (partition_name, hostname)
-            self.reference[ref_name] = self.sys_reference.get(
-                partition_name, (0.0, None, None, 'Gflop/s')
-            )
-            self.perf_patterns[hostname] = sn.extractsingle(
-                r'%s:\s+Avg\. performance\s+:\s+(?P<gflops>\S+)'
-                r'\sGflop/s' % hostname, self.stdout, 'gflops', float)
+        # If part_name not in sys_reference, default back to '*'
+        if part_name not in self.sys_reference:
+            part_name = '*'
 
-        return True
+        # Set references and perf patterns.
+        self.reference = {
+            part_name: {
+                nid: self.sys_reference[part_name] for nid in node_names
+            }
+        }
+        self.perf_patterns = {
+            nid: self.get_node_performance(nid) for nid in node_names
+        }
diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
new file mode 100644
index 0000000000..d53888118b
--- /dev/null
+++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
@@ -0,0 +1,97 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+__all__ = ['Dgemm']
+
+
+class Dgemm(rfm.RegressionTest, pin_prefix=True):
+    '''Dgemm benchmark.
+
+    Derived tests must specify the variables ``num_tasks`` and
+    ``num_cpus_per_task``.
+
+    The matrix sizes can be controlled through executable options. By default,
+    this test sets these as ``m=6144``, ``n=12288`` and ``k=3072``. Derived
+    tests are free to change these parameters at their convenience. The
+    performance of this test is measured by the lowest performing node in
+    ``Gflops/s``.
+    '''
+
+    num_tasks = required
+    num_cpus_per_task = required
+
+    descr = 'DGEMM performance test'
+    sourcepath = 'dgemm.c'
+    use_multithreading = False
+    executable_opts = ['6144', '12288', '3072']
+    build_system = 'SingleSource'
+    reference = {
+        '*': {
+            'min_perf': (None, None, None, 'Gflops/s')
+        }
+    }
+    maintainers = ['AJ', 'VH']
+
+    @rfm.run_before('compile')
+    def set_c_flags(self):
+        self.build_system.cflags += ['-O3']
+
+    @rfm.run_before('run')
+    def set_env_vars(self):
+        '''Set the environment variables.'''
+
+        self.variables = {
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
+            'OMP_BIND': 'cores',
+            'OMP_PROC_BIND': 'spread',
+            'OMP_SCHEDULE': 'static'
+        }
+
+    @sn.sanity_function
+    def get_nodenames(self):
+        '''Return a set with the participating node IDs.'''
+
+        return set(sn.extractall(
+            r'(?P<hostname>\S+):\s+Time for \d+ DGEMM operations',
+            self.stdout, 'hostname'
+        ))
+
+    @rfm.run_before('sanity')
+    def set_sanity_patterns(self):
+        '''Assert that all requested nodes have completed.'''
+
+        self.sanity_patterns = sn.assert_eq(
+            self.job.num_tasks, sn.count(self.get_nodenames()),
+            msg='some nodes did not complete'
+        )
+
+    @sn.sanity_function
+    def get_node_performance(self, nodeid):
+        '''Get the performance data from a specific ``nodeid``.'''
+
+        return sn.extractsingle(
+            r'%s:\s+Avg\. performance\s+:\s+(?P<gflops>\S+)\sGflop/s' % nodeid,
+            self.stdout, 'gflops', float
+        )
+
+    @sn.sanity_function
+    def get_min_performance(self):
+        '''Get the lowest performance from all nodes.'''
+
+        return sn.min([
+            self.get_node_performance(nid) for nid in self.get_nodenames()
+        ])
+
+    @rfm.run_before('performance')
+    def set_perf_patterns(self):
+        '''Set the perf patterns to check the min performance reported.'''
+
+        self.perf_patterns = {
+            'min_perf': self.get_min_performance(),
+        }
diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c b/hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c
similarity index 100%
rename from cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c
rename to hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c

From db3b552cfbb4ae75715843243916de727b616d0f Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 15:41:47 +0200
Subject: [PATCH 09/20] PEP8 fixes

---
 cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index 92891282d6..a57fa2b590 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -9,6 +9,7 @@
 
 from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm
 
+
 @rfm.simple_test
 class dgemm_check(Dgemm):
     '''CSCS DGEMM check.
@@ -21,8 +22,8 @@ class dgemm_check(Dgemm):
 
     valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-         'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn',
-         'eiger:mc', 'pilatus:mc'
+        'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn',
+        'eiger:mc', 'pilatus:mc'
     ]
     num_tasks = 0
     sys_reference = variable(

From 64d34297cf7a5a183d8efceb1ca725599aa885e8 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 17:40:22 +0200
Subject: [PATCH 10/20] Port dgemm to a64fx

---
 .../microbenchmarks/cpu/dgemm/dgemm.py        | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index a57fa2b590..faddda9e28 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -23,7 +23,7 @@ class dgemm_check(Dgemm):
     valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
         'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn',
-        'eiger:mc', 'pilatus:mc'
+        'eiger:mc', 'pilatus:mc', 'ault:a64fx'
     ]
     num_tasks = 0
     sys_reference = variable(
@@ -34,6 +34,7 @@ class dgemm_check(Dgemm):
             'dom:mc':     (1040.0, -0.15, None, 'Gflop/s'),
             'eiger:mc':   (3200.0, -0.15, None, 'Gflop/s'),
             'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'),
+            'ault:a64fx': (1930.0, -0.15, None, 'Gflop/s'),
             '*':          (None, None, None, 'Gflop/s'),
         },
     )
@@ -47,6 +48,16 @@ def set_valid_prog_environs(self):
             self.valid_prog_environs = ['PrgEnv-gnu-nompi']
         elif self.current_system.name in ['eiger', 'pilatus']:
             self.valid_prog_environs = ['PrgEnv-gnu']
+        elif self.current_system.name in ['ault']:
+            self.valid_prog_environs = ['PrgEnv-fujitsu']
+
+    @rfm.run_after('setup')
+    def skip_incompatible_combinations(self):
+        '''Fujitsu env only available in ault's a64fx partition.'''
+        if self.current_environ.name.startswith('PrgEnv-fujitsu'):
+            self.skip_if(
+                self.current_partition.fullname not in {'ault:a64fx'}
+            )
 
     @rfm.run_after('setup')
     def set_num_cpus_per_task(self):
@@ -60,9 +71,11 @@ def set_num_cpus_per_task(self):
             self.num_cpus_per_task = 40
         elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']:
             self.num_cpus_per_task = 128
+        elif self.current_partition.fullname in ['ault:a64fx']:
+            self.num_cpus_per_task = 48
 
     @rfm.run_before('compile')
-    def setflags(self):
+    def set_flags(self):
         if self.current_environ.name.startswith('PrgEnv-gnu'):
             self.build_system.cflags += ['-fopenmp']
         elif self.current_environ.name.startswith('PrgEnv-intel'):
@@ -73,6 +86,9 @@ def setflags(self):
             self.build_system.ldflags = [
                 '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl'
             ]
+        elif self.current_environ.name.startswith('PrgEnv-fujitsu'):
+            self.build_system.cflags += ['-fopenmp', '-Nlibomp', '-mt']
+            self.build_system.ldflags += ['-SSL2BLAMP', '-mt']
 
         if self.current_partition.fullname in ['arolla:cn', 'arolla:pn',
                                                'tsa:cn', 'tsa:pn']:

From 53cd1313c5c7284e8759849a11fa2c15178d4bd3 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 18:02:58 +0200
Subject: [PATCH 11/20] Port stream benchmark to a64fx

---
 .../microbenchmarks/cpu/stream/stream.py      | 29 ++++++++++++-------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py
index be156ece8b..5b08dc51db 100644
--- a/cscs-checks/microbenchmarks/cpu/stream/stream.py
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py
@@ -15,19 +15,11 @@ class stream_check(Stream):
 
     valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-        'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn'
+        'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', 'ault:a64fx'
     ]
     valid_prog_environs = [
         'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi'
     ]
-    prgenv_flags = variable(
-        dict, value={
-            'PrgEnv-cray': ['-fopenmp', '-O3'],
-            'PrgEnv-gnu': ['-fopenmp', '-O3'],
-            'PrgEnv-intel': ['-qopenmp', '-O3'],
-            'PrgEnv-pgi': ['-mp', '-O3']
-        }
-    )
     stream_cpus_per_task = variable(
         dict, value={
             'arolla:cn': 16,
@@ -38,6 +30,7 @@ class stream_check(Stream):
             'dom:mc': 36,
             'tsa:cn': 16,
             'tsa:pn': 16,
+            'ault:a64fx': 48,
         }
     )
     triad_reference = variable(
@@ -65,7 +58,10 @@ class stream_check(Stream):
                 'daint:mc': {'triad': (88500, -0.05, None, 'MB/s')},
                 'dom:gpu': {'triad': (44500, -0.05, None, 'MB/s')},
                 'dom:mc': {'triad': (88500, -0.05, None, 'MB/s')},
-            }
+            },
+            'PrgEnv-fujitsu': {
+                'ault:a64fx': {'triad': (85500, -0.05, None, 'MB/s')},
+            },
         }
     )
     num_tasks = 1
@@ -76,6 +72,8 @@ def filter_valid_prog_environs(self):
         '''Special conditions for arolla and tsa.'''
         if self.current_system.name in ['arolla', 'tsa']:
             self.valid_prog_environs = ['PrgEnv-gnu']
+        elif self.current_system.name in ['ault']:
+            self.valid_prog_environs = ['PrgEnv-fujitsu']
 
     @rfm.run_after('setup')
     def set_num_cpus_per_task(self):
@@ -87,8 +85,17 @@ def set_num_cpus_per_task(self):
     @rfm.run_before('compile')
     def set_compiler_flags(self):
         '''Set build flags for the different environments.'''
+
         envname = self.current_environ.name
-        self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3'])
+        if envname in {'PrgEnv-cray', 'PrgEnv-gnu'}:
+            self.build_system.cflags += ['-fopenmp', '-O3']
+        elif envname in {'PrgEnv-intel'}:
+            self.build_system.cflags += ['-qopenmp', '-O3']
+        elif envname in {'PrgEnv-pgi'}:
+            self.build_system.cflags += ['-mp', '-O3']
+        elif envname in {'PrgEnv-fujitsu'}:
+            self.build_system.cflags += ['-fopenmp', '-mt', '-O3']
+            self.build_system.ldflags += ['-mt']
 
     @rfm.run_before('run')
     def set_env_vars(self):

From d7543e211c5d2b3cfafa9b131f9e9155dc8e893b Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 18:13:03 +0200
Subject: [PATCH 12/20] Port latency benchmark to a64fx

---
 cscs-checks/microbenchmarks/cpu/latency/latency.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py
index 63221cb1f6..798a8dd386 100644
--- a/cscs-checks/microbenchmarks/cpu/latency/latency.py
+++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py
@@ -15,11 +15,16 @@ class cpu_latency_check(CpuLatency):
     num_tasks = 0
     valid_systems = [
         'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc',
-        'ault:intel', 'ault:amdvega', 'tave:compute'
+        'ault:intel', 'ault:amdvega', 'tave:compute', 'ault:a64fx'
     ]
     valid_prog_environs = ['PrgEnv-gnu']
     tags = {'benchmark', 'diagnostic'}
 
+    @rfm.run_after('init')
+    def set_valid_environs(self):
+        if self.current_system.name in {'ault'}:
+            self.valid_prog_environs = ['PrgEnv-fujitsu']
+
     @rfm.run_after('setup')
     def set_modules(self):
         if self.current_system.name in {'daint', 'dom'}:
@@ -72,4 +77,10 @@ def set_references(self):
                 'latencyL3': (137, -0.01, 0.05, 'ns'),
                 'latencyL4': (150, -0.05, 0.05, 'ns')
             },
+            'ault:a64fx': {
+                'latencyL1': (2.78, None, 0.05, 'ns'),
+                'latencyL2': (14.3, None, 0.05, 'ns'),
+                'latencyL3': (32.1, None, 0.05, 'ns'),
+                'latencyL4': (146,  None, 0.05, 'ns')
+            },
         }

From e7cf2e19ecb5f279d381680de7cac888df610ef7 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 18:29:35 +0200
Subject: [PATCH 13/20] PEP8 fixes

---
 hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
index f66c9b2950..b7bdbce310 100644
--- a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
@@ -54,7 +54,9 @@ def set_exec_opts(self):
         In order, these are the buffer size, stride and number of threads. See
         the main docstring above for more info.
         '''
-        self.executable_opts = ['100000000', f'{self.stride}', f'{self.num_cpus}']
+        self.executable_opts = [
+            '100000000', f'{self.stride}', f'{self.num_cpus}'
+        ]
 
     @rfm.run_before('sanity')
     def set_sanity_patterns(self):

From 235fd0fd18ca809b0f59a7d579ff5c579da21ef6 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 18:34:11 +0200
Subject: [PATCH 14/20] Remove unused imports

---
 cscs-checks/microbenchmarks/cpu/latency/latency.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py
index 798a8dd386..aea1ae2dcd 100644
--- a/cscs-checks/microbenchmarks/cpu/latency/latency.py
+++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import reframe as rfm
-import reframe.utility.sanity as sn
 
 from hpctestlib.microbenchmarks.cpu.latency import CpuLatency
 

From 918c14deb84354b071de2af58e393ff48bad7241 Mon Sep 17 00:00:00 2001
From: "Javier J. Otero Perez" <jotero@cscs.ch>
Date: Tue, 1 Jun 2021 18:44:51 +0200
Subject: [PATCH 15/20] Remove unused imports

---
 cscs-checks/microbenchmarks/cpu/stream/stream.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py
index 5b08dc51db..109fae2ef7 100644
--- a/cscs-checks/microbenchmarks/cpu/stream/stream.py
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import reframe as rfm
-import reframe.utility.sanity as sn
 
 from hpctestlib.microbenchmarks.cpu.stream import Stream
 

From 30060d9ca1fedfdce52942997a390b5a3880129f Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 8 Jun 2021 16:29:11 +0200
Subject: [PATCH 16/20] Add num_tasks for strided bw check

---
 cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
index 3428cce169..ae35f2c286 100644
--- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
+++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
@@ -31,6 +31,7 @@ class strided_bandwidth_check(StridedBandwidth):
     valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc',
                      'eiger:mc', 'pilatus:mc', 'ault:a64fx']
     valid_prog_environs = ['PrgEnv-gnu']
+    num_tasks = 0
 
     @rfm.run_after('init')
     def set_valid_systems(self):

From 653712bcddf02d4f142cbc271a34de1636a324a0 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 8 Jun 2021 16:51:25 +0200
Subject: [PATCH 17/20] Bump syntax to remove depr warns

---
 .../microbenchmarks/cpu/dgemm/dgemm.py        | 10 +++++-----
 .../microbenchmarks/cpu/dgemm/dgemm_test.py   | 20 +++++++++++++++++++
 .../microbenchmarks/cpu/latency/latency.py    |  6 +++---
 .../microbenchmarks/cpu/stream/stream.py      | 10 +++++-----
 .../microbenchmarks/cpu/stream/stream_test.py | 17 ++++++++++++++++
 .../cpu/strided_bandwidth/strides.py          |  6 +++---
 .../microbenchmarks/cpu/dgemm/__init__.py     |  8 ++++----
 .../microbenchmarks/cpu/latency/__init__.py   | 10 +++++-----
 .../microbenchmarks/cpu/stream/__init__.py    |  6 +++---
 .../cpu/strided_bandwidth/__init__.py         |  6 +++---
 10 files changed, 68 insertions(+), 31 deletions(-)
 create mode 100644 cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
 create mode 100644 cscs-checks/microbenchmarks/cpu/stream/stream_test.py

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index faddda9e28..4f74a56c15 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -40,7 +40,7 @@ class dgemm_check(Dgemm):
     )
     tags = {'benchmark', 'diagnostic', 'craype'}
 
-    @rfm.run_after('init')
+    @run_after('init')
     def set_valid_prog_environs(self):
         if self.current_system.name in ['daint', 'dom']:
             self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel']
@@ -51,7 +51,7 @@ def set_valid_prog_environs(self):
         elif self.current_system.name in ['ault']:
             self.valid_prog_environs = ['PrgEnv-fujitsu']
 
-    @rfm.run_after('setup')
+    @run_after('setup')
     def skip_incompatible_combinations(self):
         '''Fujitsu env only available in ault's a64fx partition.'''
         if self.current_environ.name.startswith('PrgEnv-fujitsu'):
@@ -59,7 +59,7 @@ def skip_incompatible_combinations(self):
                 self.current_partition.fullname not in {'ault:a64fx'}
             )
 
-    @rfm.run_after('setup')
+    @run_after('setup')
     def set_num_cpus_per_task(self):
         if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']:
             self.num_cpus_per_task = 12
@@ -74,7 +74,7 @@ def set_num_cpus_per_task(self):
         elif self.current_partition.fullname in ['ault:a64fx']:
             self.num_cpus_per_task = 48
 
-    @rfm.run_before('compile')
+    @run_before('compile')
     def set_flags(self):
         if self.current_environ.name.startswith('PrgEnv-gnu'):
             self.build_system.cflags += ['-fopenmp']
@@ -96,7 +96,7 @@ def set_flags(self):
             self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas',
                                          '-lpthread', '-lgfortran']
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_perf_patterns(self):
         '''Override base performance patterns.
 
diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
new file mode 100644
index 0000000000..36fcbf914e
--- /dev/null
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
@@ -0,0 +1,20 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm
+
+@rfm.simple_test
+class dgemm_check(Dgemm):
+    valid_systems = ['dom:mc']
+    valid_prog_environs = ['PrgEnv-gnu']
+    num_tasks = 0
+    num_cpus_per_task = 36
+
+    @run_before('compile')
+    def setflags(self):
+        self.build_system.cflags += ['-fopenmp']
diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py
index aea1ae2dcd..5d85b2bdb7 100644
--- a/cscs-checks/microbenchmarks/cpu/latency/latency.py
+++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py
@@ -19,19 +19,19 @@ class cpu_latency_check(CpuLatency):
     valid_prog_environs = ['PrgEnv-gnu']
     tags = {'benchmark', 'diagnostic'}
 
-    @rfm.run_after('init')
+    @run_after('init')
     def set_valid_environs(self):
         if self.current_system.name in {'ault'}:
             self.valid_prog_environs = ['PrgEnv-fujitsu']
 
-    @rfm.run_after('setup')
+    @run_after('setup')
     def set_modules(self):
         if self.current_system.name in {'daint', 'dom'}:
             self.modules = ['craype-hugepages1G']
         if self.current_system.name in {'tave'}:
             self.modules = ['craype-hugepages512M']
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_references(self):
         self.reference = {
             'dom:mc': {
diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream.py b/cscs-checks/microbenchmarks/cpu/stream/stream.py
index 109fae2ef7..c9f4d14a03 100644
--- a/cscs-checks/microbenchmarks/cpu/stream/stream.py
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream.py
@@ -66,7 +66,7 @@ class stream_check(Stream):
     num_tasks = 1
     tags = {'production', 'craype'}
 
-    @rfm.run_after('init')
+    @run_after('init')
     def filter_valid_prog_environs(self):
         '''Special conditions for arolla and tsa.'''
         if self.current_system.name in ['arolla', 'tsa']:
@@ -74,14 +74,14 @@ def filter_valid_prog_environs(self):
         elif self.current_system.name in ['ault']:
             self.valid_prog_environs = ['PrgEnv-fujitsu']
 
-    @rfm.run_after('setup')
+    @run_after('setup')
     def set_num_cpus_per_task(self):
         '''If partition not in ``stream_cpus_per_task``, leave as required.'''
         self.num_cpus_per_task = self.stream_cpus_per_task.get(
             self.current_partition.fullname, self.required
         )
 
-    @rfm.run_before('compile')
+    @run_before('compile')
     def set_compiler_flags(self):
         '''Set build flags for the different environments.'''
 
@@ -96,13 +96,13 @@ def set_compiler_flags(self):
             self.build_system.cflags += ['-fopenmp', '-mt', '-O3']
             self.build_system.ldflags += ['-mt']
 
-    @rfm.run_before('run')
+    @run_before('run')
     def set_env_vars(self):
         '''Special environment treatment for the PrgEnv-pgi.'''
         if self.current_environ.name == 'PrgEnv-pgi':
             self.variables['OMP_PROC_BIND'] = 'true'
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_perf_references(self):
         '''Set performance refs as defined in ``triad_reference``.
 
diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py
new file mode 100644
index 0000000000..87a8509e51
--- /dev/null
+++ b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py
@@ -0,0 +1,17 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+from hpctestlib.microbenchmarks.cpu.stream import Stream
+
+
+@rfm.simple_test
+class stream_check(Stream):
+    valid_systems = ['dom:mc']
+    valid_prog_environs = ['PrgEnv-cray']
+    num_tasks = 2
+    num_cpus_per_task = 36
diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
index ae35f2c286..58745fc9f3 100644
--- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
+++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py
@@ -33,7 +33,7 @@ class strided_bandwidth_check(StridedBandwidth):
     valid_prog_environs = ['PrgEnv-gnu']
     num_tasks = 0
 
-    @rfm.run_after('init')
+    @run_after('init')
     def set_valid_systems(self):
         cp = self.current_system.name
         if cp == 'ault':
@@ -125,10 +125,10 @@ def set_valid_systems(self):
     )
     tags = {'benchmark', 'diagnostic'}
 
-    @rfm.run_after('setup')
+    @run_after('setup')
     def set_num_cpus(self):
         self.num_cpus = self.system_num_cpus[self.current_partition.fullname]
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_references(self):
         self.reference = self.reference_per_stride[self.stride]
diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
index d53888118b..0b3cd7c7c9 100644
--- a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
@@ -38,11 +38,11 @@ class Dgemm(rfm.RegressionTest, pin_prefix=True):
     }
     maintainers = ['AJ', 'VH']
 
-    @rfm.run_before('compile')
+    @run_before('compile')
     def set_c_flags(self):
         self.build_system.cflags += ['-O3']
 
-    @rfm.run_before('run')
+    @run_before('run')
     def set_env_vars(self):
         '''Set the environment variables.'''
 
@@ -62,7 +62,7 @@ def get_nodenames(self):
             self.stdout, 'hostname'
         ))
 
-    @rfm.run_before('sanity')
+    @run_before('sanity')
     def set_sanity_patterns(self):
         '''Assert that all requested nodes have completed.'''
 
@@ -88,7 +88,7 @@ def get_min_performance(self):
             self.get_node_performance(nid) for nid in self.get_nodenames()
         ])
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_perf_patterns(self):
         '''Set the perf patterns to check the min performance reported.'''
 
diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
index 3557af0c04..26235c28a3 100644
--- a/hpctestlib/microbenchmarks/cpu/latency/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py
@@ -33,16 +33,16 @@ class CpuLatency(rfm.RegressionTest, pin_prefix=True):
     num_tasks_per_node = 1
     maintainers = ['SK', 'JO']
 
-    @rfm.run_before('compile')
+    @run_before('compile')
     def set_compiler_flags(self):
         self.build_system.cxxflags = ['-O3']
 
-    @rfm.run_before('run')
+    @run_before('run')
     def set_exc_opts(self):
         '''Set the ``buffer_sizes`` as the executable options.'''
         self.executable_opts = self.buffer_sizes
 
-    @rfm.run_before('sanity')
+    @run_before('sanity')
     def set_sanity_patterns(self):
         '''Verify the number of reported latency measurements.'''
 
@@ -60,7 +60,7 @@ def get_latency(self, buffer_size):
             buffer_size, self.stdout, 'bw', float
         ))
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_references(self):
         '''Set dummy references to get the perf values in the perf report.
 
@@ -76,7 +76,7 @@ def set_references(self):
             }
         }
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_perf_patterns(self):
         '''Set the performance patters to extract all latency levels.
 
diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
index 38da66e447..341b67818c 100644
--- a/hpctestlib/microbenchmarks/cpu/stream/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py
@@ -46,12 +46,12 @@ class Stream(rfm.RegressionTest):
     }
     maintainers = ['RS', 'SK']
 
-    @rfm.run_before('run')
+    @run_before('run')
     def set_omp_num_threads(self):
         '''Set the number of OMP threads to ``num_cpus_per_task``.'''
         self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}'
 
-    @rfm.run_before('sanity')
+    @run_before('sanity')
     def set_sanity_patterns(self):
         '''Set sanity patterns to check the error threshold.'''
 
@@ -59,7 +59,7 @@ def set_sanity_patterns(self):
             r'Solution Validates: avg error less than', self.stdout
         )
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_performance_patterns(self):
         '''Set performance to track the triad bandwidth.'''
 
diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
index b7bdbce310..3708964c05 100644
--- a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py
@@ -47,7 +47,7 @@ class StridedBandwidth(rfm.RegressionTest, pin_prefix=True):
     }
     maintainers = ['SK']
 
-    @rfm.run_before('run')
+    @run_before('run')
     def set_exec_opts(self):
         '''Set the exec options.
 
@@ -58,7 +58,7 @@ def set_exec_opts(self):
             '100000000', f'{self.stride}', f'{self.num_cpus}'
         ]
 
-    @rfm.run_before('sanity')
+    @run_before('sanity')
     def set_sanity_patterns(self):
         ''' Assert that the bandwidth is reported for all the tasks.'''
 
@@ -67,7 +67,7 @@ def set_sanity_patterns(self):
             self.job.num_tasks
         )
 
-    @rfm.run_before('performance')
+    @run_before('performance')
     def set_perf_patterns(self):
         '''Extract the min bandwidth as a performance metric.'''
 

From aedc16006b0cfb958cf86a406bcb45a54278bc0d Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Tue, 8 Jun 2021 16:56:13 +0200
Subject: [PATCH 18/20] Remove leaked files

---
 .../microbenchmarks/cpu/dgemm/dgemm_test.py   | 20 -------------------
 .../microbenchmarks/cpu/stream/stream_test.py | 17 ----------------
 2 files changed, 37 deletions(-)
 delete mode 100644 cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
 delete mode 100644 cscs-checks/microbenchmarks/cpu/stream/stream_test.py

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
deleted file mode 100644
index 36fcbf914e..0000000000
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
-# ReFrame Project Developers. See the top-level LICENSE file for details.
-#
-# SPDX-License-Identifier: BSD-3-Clause
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm
-
-@rfm.simple_test
-class dgemm_check(Dgemm):
-    valid_systems = ['dom:mc']
-    valid_prog_environs = ['PrgEnv-gnu']
-    num_tasks = 0
-    num_cpus_per_task = 36
-
-    @run_before('compile')
-    def setflags(self):
-        self.build_system.cflags += ['-fopenmp']
diff --git a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py b/cscs-checks/microbenchmarks/cpu/stream/stream_test.py
deleted file mode 100644
index 87a8509e51..0000000000
--- a/cscs-checks/microbenchmarks/cpu/stream/stream_test.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
-# ReFrame Project Developers. See the top-level LICENSE file for details.
-#
-# SPDX-License-Identifier: BSD-3-Clause
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-from hpctestlib.microbenchmarks.cpu.stream import Stream
-
-
-@rfm.simple_test
-class stream_check(Stream):
-    valid_systems = ['dom:mc']
-    valid_prog_environs = ['PrgEnv-cray']
-    num_tasks = 2
-    num_cpus_per_task = 36

From 5826b0f0badad9c2f1d1aedd47878e78b39630a2 Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Thu, 28 Oct 2021 18:03:15 +0200
Subject: [PATCH 19/20] Update dgemm

---
 .../microbenchmarks/cpu/dgemm/dgemm.py        | 47 +++----------------
 .../microbenchmarks/cpu/dgemm/__init__.py     | 27 +++--------
 2 files changed, 14 insertions(+), 60 deletions(-)

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index 4f74a56c15..f06e4e8ccb 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -56,23 +56,17 @@ def skip_incompatible_combinations(self):
         '''Fujitsu env only available in ault's a64fx partition.'''
         if self.current_environ.name.startswith('PrgEnv-fujitsu'):
             self.skip_if(
-                self.current_partition.fullname not in {'ault:a64fx'}
+                self.current_partition.fullname not in ('ault:a64fx',)
             )
 
     @run_after('setup')
     def set_num_cpus_per_task(self):
-        if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']:
-            self.num_cpus_per_task = 12
-        elif self.current_partition.fullname in ['daint:mc', 'dom:mc']:
-            self.num_cpus_per_task = 36
-        elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']:
-            self.num_cpus_per_task = 16
-        elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']:
-            self.num_cpus_per_task = 40
-        elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']:
-            self.num_cpus_per_task = 128
-        elif self.current_partition.fullname in ['ault:a64fx']:
-            self.num_cpus_per_task = 48
+        proc = self.current_partition.processor
+        pname = self.current_partition.fullname
+        if not proc.info:
+            self.skip(f'no topology information found for partition {pname!r}')
+
+        self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core
 
     @run_before('compile')
     def set_flags(self):
@@ -95,30 +89,3 @@ def set_flags(self):
             self.build_system.cflags += ['-I$EBROOTOPENBLAS/include']
             self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas',
                                          '-lpthread', '-lgfortran']
-
-    @run_before('performance')
-    def set_perf_patterns(self):
-        '''Override base performance patterns.
-
-        Set each node as a performance variable reporting the Gflop/s.
-        The ``reference`` values for each node are extracted from the
-        ``sys_reference`` dict.
-        '''
-
-        part_name = self.current_partition.fullname
-        with osext.change_dir(self.stagedir):
-            node_names = sn.evaluate(self.get_nodenames())
-
-        # If part_name not in sys_reference, default back to '*'
-        if part_name not in self.sys_reference:
-            part_name = '*'
-
-        # Set references and perf patterns.
-        self.reference = {
-            part_name: {
-                nid: self.sys_reference[part_name] for nid in node_names
-            }
-        }
-        self.perf_patterns = {
-            nid: self.get_node_performance(nid) for nid in node_names
-        }
diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
index 0b3cd7c7c9..fb95bdeec6 100644
--- a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
+++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py
@@ -31,11 +31,6 @@ class Dgemm(rfm.RegressionTest, pin_prefix=True):
     use_multithreading = False
     executable_opts = ['6144', '12288', '3072']
     build_system = 'SingleSource'
-    reference = {
-        '*': {
-            'min_perf': (None, None, None, 'Gflops/s')
-        }
-    }
     maintainers = ['AJ', 'VH']
 
     @run_before('compile')
@@ -53,7 +48,7 @@ def set_env_vars(self):
             'OMP_SCHEDULE': 'static'
         }
 
-    @sn.sanity_function
+    @deferrable
     def get_nodenames(self):
         '''Return a set with the participating node IDs.'''
 
@@ -62,16 +57,16 @@ def get_nodenames(self):
             self.stdout, 'hostname'
         ))
 
-    @run_before('sanity')
-    def set_sanity_patterns(self):
+    @sanity_function
+    def assert_all_nodes_completed(self):
         '''Assert that all requested nodes have completed.'''
 
-        self.sanity_patterns = sn.assert_eq(
+        return sn.assert_eq(
             self.job.num_tasks, sn.count(self.get_nodenames()),
             msg='some nodes did not complete'
         )
 
-    @sn.sanity_function
+    @deferrable
     def get_node_performance(self, nodeid):
         '''Get the performance data from a specific ``nodeid``.'''
 
@@ -80,18 +75,10 @@ def get_node_performance(self, nodeid):
             self.stdout, 'gflops', float
         )
 
-    @sn.sanity_function
-    def get_min_performance(self):
+    @performance_function('Gflops/s')
+    def min_perf(self):
         '''Get the lowest performance from all nodes.'''
 
         return sn.min([
             self.get_node_performance(nid) for nid in self.get_nodenames()
         ])
-
-    @run_before('performance')
-    def set_perf_patterns(self):
-        '''Set the perf patterns to check the min performance reported.'''
-
-        self.perf_patterns = {
-            'min_perf': self.get_min_performance(),
-        }

From 0064156a163003de8eb00aa18a05f5d37977edad Mon Sep 17 00:00:00 2001
From: Javier Otero <jotero@cscs.ch>
Date: Thu, 28 Oct 2021 18:08:22 +0200
Subject: [PATCH 20/20] Remove unused imports

---
 cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
index f06e4e8ccb..8ea51f62d6 100644
--- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
+++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py
@@ -4,8 +4,6 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import reframe as rfm
-import reframe.utility.sanity as sn
-import reframe.utility.osext as osext
 
 from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm