diff --git a/config/cscs.py b/config/cscs.py index 70ed35501d..0013afc391 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -39,7 +39,8 @@ ], 'environs': [ 'builtin', - 'PrgEnv-gnu' + 'PrgEnv-gnu', + 'PrgEnv-fujitsu' ], 'descr': 'Fujitsu A64FX CPUs', 'max_jobs': 100, @@ -857,6 +858,18 @@ 'cxx': 'mpicxx', 'ftn': 'mpif90' }, + { + 'name': 'PrgEnv-fujitsu', + 'target_systems': [ + 'ault' + ], + 'modules': [ + 'a64fxsdk' + ], + 'cc': 'mpifccpx', + 'cxx': 'mpiFCCpx', + 'ftn': 'mpifrtpx' + }, { 'name': 'builtin', 'target_systems': [ diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index 632c527f4a..8ea51f62d6 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -4,105 +4,86 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm @rfm.simple_test -class DGEMMTest(rfm.RegressionTest): - def __init__(self): - self.descr = 'DGEMM performance test' - self.sourcepath = 'dgemm.c' - self.sanity_patterns = self.eval_sanity() +class dgemm_check(Dgemm): + '''CSCS DGEMM check. + + The matrix dimensions are set in the base class. + Every node reports its performance in Gflops/s. To do so, this class + overrides the performance patterns and references from the base test. + This is done in the ``set_perf_patterns`` pre-performance hook. 
+ ''' + + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', + 'eiger:mc', 'pilatus:mc', 'ault:a64fx' + ] + num_tasks = 0 + sys_reference = variable( + dict, value={ + 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'ault:a64fx': (1930.0, -0.15, None, 'Gflop/s'), + '*': (None, None, None, 'Gflop/s'), + }, + ) + tags = {'benchmark', 'diagnostic', 'craype'} - # the perf patterns are automaticaly generated inside sanity - self.perf_patterns = {} - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', - 'eiger:mc', 'pilatus:mc'] + @run_after('init') + def set_valid_prog_environs(self): if self.current_system.name in ['daint', 'dom']: self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel'] elif self.current_system.name in ['arolla', 'tsa']: self.valid_prog_environs = ['PrgEnv-gnu-nompi'] elif self.current_system.name in ['eiger', 'pilatus']: self.valid_prog_environs = ['PrgEnv-gnu'] - else: - self.valid_prog_environs = [] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] - self.num_tasks = 0 - self.use_multithreading = False - self.executable_opts = ['6144', '12288', '3072'] - self.build_system = 'SingleSource' - self.build_system.cflags = ['-O3'] - self.sys_reference = { - 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), - 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), - 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), - 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), - } - self.maintainers = ['AJ', 'VH'] - self.tags = {'benchmark', 'diagnostic', 'craype'} + @run_after('setup') + def 
skip_incompatible_combinations(self): + '''Fujitsu env only available in ault's a64fx partition.''' + if self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.skip_if( + self.current_partition.fullname not in ('ault:a64fx') + ) + + @run_after('setup') + def set_num_cpus_per_task(self): + proc = self.current_partition.processor + pname = self.current_partition.fullname + if not proc.info: + self.skip(f'no topology information found for partition {pname!r}') + + self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core @run_before('compile') - def setflags(self): + def set_flags(self): if self.current_environ.name.startswith('PrgEnv-gnu'): self.build_system.cflags += ['-fopenmp'] elif self.current_environ.name.startswith('PrgEnv-intel'): self.build_system.cppflags = [ '-DMKL_ILP64', '-I${MKLROOT}/include' ] - self.build_system.cflags = ['-qopenmp'] + self.build_system.cflags += ['-qopenmp'] self.build_system.ldflags = [ '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl' ] + elif self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.build_system.cflags += ['-fopenmp', '-Nlibomp', '-mt'] + self.build_system.ldflags += ['-SSL2BLAMP', '-mt'] if self.current_partition.fullname in ['arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']: self.build_system.cflags += ['-I$EBROOTOPENBLAS/include'] self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', '-lpthread', '-lgfortran'] - - @run_before('run') - def set_tasks(self): - if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - self.num_cpus_per_task = 12 - elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: - self.num_cpus_per_task = 36 - elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']: - self.num_cpus_per_task = 16 - elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']: - self.num_cpus_per_task = 40 - elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: - self.num_cpus_per_task = 128 - - if self.num_cpus_per_task: - 
self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'OMP_BIND': 'cores', - 'OMP_PROC_BIND': 'spread', - 'OMP_SCHEDULE': 'static' - } - - @sn.sanity_function - def eval_sanity(self): - all_tested_nodes = sn.evaluate(sn.extractall( - r'(?P\S+):\s+Time for \d+ DGEMM operations', - self.stdout, 'hostname')) - num_tested_nodes = len(all_tested_nodes) - failure_msg = ('Requested %s node(s), but found %s node(s)' % - (self.job.num_tasks, num_tested_nodes)) - sn.evaluate(sn.assert_eq(num_tested_nodes, self.job.num_tasks, - msg=failure_msg)) - - for hostname in all_tested_nodes: - partition_name = self.current_partition.fullname - ref_name = '%s:%s' % (partition_name, hostname) - self.reference[ref_name] = self.sys_reference.get( - partition_name, (0.0, None, None, 'Gflop/s') - ) - self.perf_patterns[hostname] = sn.extractsingle( - r'%s:\s+Avg\. performance\s+:\s+(?P\S+)' - r'\sGflop/s' % hostname, self.stdout, 'gflops', float) - - return True diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index a766ee4554..583516dad6 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -4,61 +4,73 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.latency import CpuLatency @rfm.simple_test -class CPULatencyTest(rfm.RegressionTest): - sourcepath = 'latency.cpp' - build_system = 'SingleSource' - valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'ault:intel', 'ault:amdvega', 'tave:compute'] - valid_prog_environs = ['PrgEnv-gnu'] +class cpu_latency_check(CpuLatency): + '''CPU latency check. + + This check performs a latency test for each provided buffer size. 
+ ''' + # Set required variables + buffer_sizes = ['16000', '128000', '8000000', '500000000'] num_tasks = 0 - num_tasks_per_node = 1 - executable_opts = ['16000', '128000', '8000000', '500000000'] + + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'ault:intel', 'ault:amdvega', 'tave:compute', 'ault:a64fx' + ] + valid_prog_environs = ['PrgEnv-gnu'] reference = { 'dom:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'dom:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 'daint:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'daint:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 'ault:intel': { - 'latencyL1': (1.08, -0.01, 0.26, 'ns'), - 'latencyL2': (3.8, -0.01, 0.26, 'ns'), - 'latencyL3': (21.5, -0.01, 0.05, 'ns'), - 'latencyMem': (86.5, -0.01, 0.05, 'ns') + 'latencyL1': 
(1.08, -0.01, 0.26, 'ns'), + 'latencyL2': (3.8, -0.01, 0.26, 'ns'), + 'latencyL3': (21.5, -0.01, 0.05, 'ns'), + 'latencyL4': (86.5, -0.01, 0.05, 'ns') }, 'ault:amdvega': { - 'latencyL1': (1.32, -0.01, 0.26, 'ns'), - 'latencyL2': (4.02, -0.01, 0.26, 'ns'), - 'latencyL3': (14.4, -0.01, 0.26, 'ns'), - 'latencyMem': (90.0, -0.01, 0.05, 'ns') + 'latencyL1': (1.32, -0.01, 0.26, 'ns'), + 'latencyL2': (4.02, -0.01, 0.26, 'ns'), + 'latencyL3': (14.4, -0.01, 0.26, 'ns'), + 'latencyL4': (90.0, -0.01, 0.05, 'ns') }, 'tave:compute': { - 'latencyL1': (2.86, -0.01, 0.05, 'ns'), - 'latencyL2': (12.15, -0.01, 0.05, 'ns'), - 'latencyL3': (137, -0.01, 0.05, 'ns'), - 'latencyMem': (150, -0.05, 0.05, 'ns') + 'latencyL1': (2.86, -0.01, 0.05, 'ns'), + 'latencyL2': (12.15, -0.01, 0.05, 'ns'), + 'latencyL3': (137, -0.01, 0.05, 'ns'), + 'latencyL4': (150, -0.05, 0.05, 'ns') + }, + 'ault:a64fx': { + 'latencyL1': (2.78, None, 0.05, 'ns'), + 'latencyL2': (14.3, None, 0.05, 'ns'), + 'latencyL3': (32.1, None, 0.05, 'ns'), + 'latencyL4': (146, None, 0.05, 'ns') }, } maintainers = ['SK'] @@ -71,34 +83,7 @@ def set_modules(self): if self.current_system.name in {'tave'}: self.modules = ['craype-hugepages512M'] - @run_before('compile') - def set_flags(self): - self.build_system.cxxflags = ['-O3'] - - @sanity_function - def assert_success(self): - return sn.assert_eq( - sn.count(sn.findall(r'latency', self.stdout)), - self.num_tasks * len(self.executable_opts) - ) - - def lat_pattern(self, index): - return sn.extractsingle( - r'latency \(ns\) for input size %s: (?P\S+) clocks' % - self.executable_opts[index], self.stdout, 'bw', float) - - @performance_function('ns') - def latencyL1(self): - return self.lat_pattern(0) - - @performance_function('ns') - def latencyL2(self): - return self.lat_pattern(1) - - @performance_function('ns') - def latencyL3(self): - return self.lat_pattern(2) - - @performance_function('ns') - def latencyMem(self): - return self.lat_pattern(3) + @run_after('init') + def 
set_valid_environs(self): + if self.current_system.name in {'ault'}: + self.valid_prog_environs = ['PrgEnv-fujitsu'] diff --git a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c b/cscs-checks/microbenchmarks/cpu/stream/src/stream.c deleted file mode 100644 index b9a2cee3b2..0000000000 --- a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c +++ /dev/null @@ -1,585 +0,0 @@ -/*-----------------------------------------------------------------------*/ -/* Program: STREAM */ -/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ -/* Original code developed by John D. McCalpin */ -/* Programmers: John D. McCalpin */ -/* Joe R. Zagar */ -/* */ -/* This program measures memory transfer rates in MB/s for simple */ -/* computational kernels coded in C. */ -/*-----------------------------------------------------------------------*/ -/* Copyright 1991-2013: John D. McCalpin */ -/*-----------------------------------------------------------------------*/ -/* License: */ -/* 1. You are free to use this program and/or to redistribute */ -/* this program. */ -/* 2. You are free to modify this program for your own use, */ -/* including commercial use, subject to the publication */ -/* restrictions in item 3. */ -/* 3. You are free to publish results obtained from running this */ -/* program, or from works that you derive from this program, */ -/* with the following limitations: */ -/* 3a. In order to be referred to as "STREAM benchmark results", */ -/* published results must be in conformance to the STREAM */ -/* Run Rules, (briefly reviewed below) published at */ -/* http://www.cs.virginia.edu/stream/ref.html */ -/* and incorporated herein by reference. */ -/* As the copyright holder, John McCalpin retains the */ -/* right to determine conformity with the Run Rules. */ -/* 3b. Results based on modified source code or on runs not in */ -/* accordance with the STREAM Run Rules must be clearly */ -/* labelled whenever they are published. 
Examples of */ -/* proper labelling include: */ -/* "tuned STREAM benchmark results" */ -/* "based on a variant of the STREAM benchmark code" */ -/* Other comparable, clear, and reasonable labelling is */ -/* acceptable. */ -/* 3c. Submission of results to the STREAM benchmark web site */ -/* is encouraged, but not required. */ -/* 4. Use of this program or creation of derived works based on this */ -/* program constitutes acceptance of these licensing restrictions. */ -/* 5. Absolutely no warranty is expressed or implied. */ -/*-----------------------------------------------------------------------*/ -# include -# include -# include -# include -# include -# include - -/*----------------------------------------------------------------------- - * INSTRUCTIONS: - * - * 1) STREAM requires different amounts of memory to run on different - * systems, depending on both the system cache size(s) and the - * granularity of the system timer. - * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) - * to meet *both* of the following criteria: - * (a) Each array must be at least 4 times the size of the - * available cache memory. I don't worry about the difference - * between 10^6 and 2^20, so in practice the minimum array size - * is about 3.8 times the cache size. - * Example 1: One Xeon E3 with 8 MB L3 cache - * STREAM_ARRAY_SIZE should be >= 4 million, giving - * an array size of 30.5 MB and a total memory requirement - * of 91.5 MB. - * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) - * STREAM_ARRAY_SIZE should be >= 20 million, giving - * an array size of 153 MB and a total memory requirement - * of 458 MB. - * (b) The size should be large enough so that the 'timing calibration' - * output by the program is at least 20 clock-ticks. - * Example: most versions of Windows have a 10 millisecond timer - * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. - * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. 
- * This means the each array must be at least 1 GB, or 128M elements. - * - * Version 5.10 increases the default array size from 2 million - * elements to 10 million elements in response to the increasing - * size of L3 caches. The new default size is large enough for caches - * up to 20 MB. - * Version 5.10 changes the loop index variables from "register int" - * to "ssize_t", which allows array indices >2^32 (4 billion) - * on properly configured 64-bit systems. Additional compiler options - * (such as "-mcmodel=medium") may be required for large memory runs. - * - * Array size can be set at compile time without modifying the source - * code for the (many) compilers that support preprocessor definitions - * on the compile line. E.g., - * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M - * will override the default size of 10M with a new size of 100M elements - * per array. - */ -#ifndef STREAM_ARRAY_SIZE -# define STREAM_ARRAY_SIZE 10000000 -#endif - -/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result - * for any iteration after the first, therefore the minimum value - * for NTIMES is 2. - * There are no rules on maximum allowable values for NTIMES, but - * values larger than the default are unlikely to noticeably - * increase the reported performance. - * NTIMES can also be set on the compile line without changing the source - * code using, for example, "-DNTIMES=7". - */ -#ifdef NTIMES -#if NTIMES<=1 -# define NTIMES 10 -#endif -#endif -#ifndef NTIMES -# define NTIMES 10 -#endif - -/* Users are allowed to modify the "OFFSET" variable, which *may* change the - * relative alignment of the arrays (though compilers may change the - * effective offset by making the arrays non-contiguous on some systems). - * Use of non-zero values for OFFSET can be especially helpful if the - * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. 
- * OFFSET can also be set on the compile line without changing the source - * code using, for example, "-DOFFSET=56". - */ -#ifndef OFFSET -# define OFFSET 0 -#endif - -/* - * 3) Compile the code with optimization. Many compilers generate - * unreasonably bad code before the optimizer tightens things up. - * If the results are unreasonably good, on the other hand, the - * optimizer might be too smart for me! - * - * For a simple single-core version, try compiling with: - * cc -O stream.c -o stream - * This is known to work on many, many systems.... - * - * To use multiple cores, you need to tell the compiler to obey the OpenMP - * directives in the code. This varies by compiler, but a common example is - * gcc -O -fopenmp stream.c -o stream_omp - * The environment variable OMP_NUM_THREADS allows runtime control of the - * number of threads/cores used when the resulting "stream_omp" program - * is executed. - * - * To run with single-precision variables and arithmetic, simply add - * -DSTREAM_TYPE=float - * to the compile line. - * Note that this changes the minimum array sizes required --- see (1) above. - * - * The preprocessor directive "TUNED" does not do much -- it simply causes the - * code to call separate functions to execute each kernel. Trivial versions - * of these functions are provided, but they are *not* tuned -- they just - * provide predefined interfaces to be replaced with tuned code. - * - * - * 4) Optional: Mail the results to mccalpin@cs.virginia.edu - * Be sure to include info that will help me understand: - * a) the computer hardware configuration (e.g., processor model, memory type) - * b) the compiler name/version and compilation flags - * c) any run-time information (such as OMP_NUM_THREADS) - * d) all of the output from the test case. - * - * Thanks! 
- * - *-----------------------------------------------------------------------*/ - -# define HLINE "-------------------------------------------------------------\n" - -# ifndef MIN -# define MIN(x,y) ((x)<(y)?(x):(y)) -# endif -# ifndef MAX -# define MAX(x,y) ((x)>(y)?(x):(y)) -# endif - -#ifndef STREAM_TYPE -#define STREAM_TYPE double -#endif - -static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], - b[STREAM_ARRAY_SIZE+OFFSET], - c[STREAM_ARRAY_SIZE+OFFSET]; - -static double avgtime[4] = {0}, maxtime[4] = {0}, - mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - -static char *label[4] = {"Copy: ", "Scale: ", - "Add: ", "Triad: "}; - -static double bytes[4] = { - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE - }; - -extern double mysecond(); -extern void checkSTREAMresults(); -#ifdef TUNED -extern void tuned_STREAM_Copy(); -extern void tuned_STREAM_Scale(STREAM_TYPE scalar); -extern void tuned_STREAM_Add(); -extern void tuned_STREAM_Triad(STREAM_TYPE scalar); -#endif -#ifdef _OPENMP -extern int omp_get_num_threads(); -#endif -int -main() - { - int quantum, checktick(); - int BytesPerWord; - int k; - ssize_t j; - STREAM_TYPE scalar; - double t, times[4][NTIMES]; - - /* --- SETUP --- determine precision and check timing --- */ - - printf(HLINE); - printf("STREAM version $Revision: 5.10 $\n"); - printf(HLINE); - BytesPerWord = sizeof(STREAM_TYPE); - printf("This system uses %d bytes per array element.\n", - BytesPerWord); - - printf(HLINE); -#ifdef N - printf("***** WARNING: ******\n"); - printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); - printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); - printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); - printf("***** WARNING: ******\n"); 
-#endif - - printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); - printf("Memory per array = %.1f MiB (= %.1f GiB).\n", - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); - printf("Total memory required = %.1f MiB (= %.1f GiB).\n", - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); - printf("Each kernel will be executed %d times.\n", NTIMES); - printf(" The *best* time for each kernel (excluding the first iteration)\n"); - printf(" will be used to compute the reported bandwidth.\n"); - -#ifdef _OPENMP - printf(HLINE); -#pragma omp parallel - { -#pragma omp master - { - k = omp_get_num_threads(); - printf ("Number of Threads requested = %i\n",k); - } - } -#endif - -#ifdef _OPENMP - k = 0; -#pragma omp parallel -#pragma omp atomic - k++; - printf ("Number of Threads counted = %i\n",k); -#endif - - /* Get initial value for system clock. 
*/ -#pragma omp parallel for - for (j=0; j= 1) - printf("Your clock granularity/precision appears to be " - "%d microseconds.\n", quantum); - else { - printf("Your clock granularity appears to be " - "less than one microsecond.\n"); - quantum = 1; - } - - t = mysecond(); -#pragma omp parallel for - for (j = 0; j < STREAM_ARRAY_SIZE; j++) - a[j] = 2.0E0 * a[j]; - t = 1.0E6 * (mysecond() - t); - - printf("Each test below will take on the order" - " of %d microseconds.\n", (int) t ); - printf(" (= %d clock ticks)\n", (int) (t/quantum) ); - printf("Increase the size of the arrays if this shows that\n"); - printf("you are not getting at least 20 clock ticks per test.\n"); - - printf(HLINE); - - printf("WARNING -- The above is only a rough guideline.\n"); - printf("For best results, please be sure you know the\n"); - printf("precision of your system timer.\n"); - printf(HLINE); - - /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ - - scalar = 3.0; - for (k=0; k - -double mysecond() -{ - struct timeval tp; - struct timezone tzp; - int i; - - i = gettimeofday(&tp,&tzp); - return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); -} - -#ifndef abs -#define abs(a) ((a) >= 0 ? 
(a) : -(a)) -#endif -void checkSTREAMresults () -{ - STREAM_TYPE aj,bj,cj,scalar; - STREAM_TYPE aSumErr,bSumErr,cSumErr; - STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; - double epsilon; - ssize_t j; - int k,ierr,err; - - /* reproduce initialization */ - aj = 1.0; - bj = 2.0; - cj = 0.0; - /* a[] is modified during timing check */ - aj = 2.0E0 * aj; - /* now execute timing loop */ - scalar = 3.0; - for (k=0; k epsilon) { - err++; - printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,aj,a[j],abs((aj-a[j])/aAvgErr)); - } -#endif - } - } - printf(" For array a[], %d errors were found.\n",ierr); - } - if (abs(bAvgErr/bj) > epsilon) { - err++; - printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,bj,b[j],abs((bj-b[j])/bAvgErr)); - } -#endif - } - } - printf(" For array b[], %d errors were found.\n",ierr); - } - if (abs(cAvgErr/cj) > epsilon) { - err++; - printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,cj,c[j],abs((cj-c[j])/cAvgErr)); - } -#endif - } - } - printf(" For array c[], %d errors were 
found.\n",ierr); - } - if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); - } -#ifdef VERBOSE - printf ("Results Validation Verbose Results: \n"); - printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); - printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); - printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); -#endif -} - -#ifdef TUNED -/* stubs for "tuned" versions of the kernels */ -void tuned_STREAM_Copy() -{ - ssize_t j; -#pragma omp parallel for - for (j=0; j\S+)\s+\S+', - self.stdout, 'triad', float) - } - self.stream_bw_reference = { + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', 'ault:a64fx' + ] + valid_prog_environs = [ + 'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi', + 'PrgEnv-nvidia' + ] + triad_reference = variable( + dict, value={ 'PrgEnv-cray': { 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')}, 'daint:mc': {'triad': (89000, -0.05, None, 'MB/s')}, @@ -87,23 +45,57 @@ def __init__(self): 'daint:mc': {'triad': (88500, -0.05, None, 'MB/s')}, 'dom:gpu': {'triad': (44500, -0.05, None, 'MB/s')}, 'dom:mc': {'triad': (88500, -0.05, None, 'MB/s')}, - } + }, + 'PrgEnv-fujitsu': { + 'ault:a64fx': {'triad': (85500, -0.05, None, 'MB/s')}, + }, } - self.tags = {'production', 'craype'} - self.maintainers = ['RS', 'SK'] + ) + num_tasks = 1 + tags = {'production', 'craype'} + + @run_after('init') + def filter_valid_prog_environs(self): + '''Special conditions for arolla and tsa.''' + if self.current_system.name in ['arolla', 'tsa']: + self.valid_prog_environs = ['PrgEnv-gnu'] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] @run_after('setup') - def prepare_test(self): - self.num_cpus_per_task = self.stream_cpus_per_task.get( - self.current_partition.fullname, 1) - self.variables['OMP_NUM_THREADS'] = 
str(self.num_cpus_per_task) + def set_num_cpus_per_task(self): + '''Set the num cpus based on the autodetected topology.''' + proc = self.current_partition.processor + pname = self.current_partition.fullname + if not proc.info: + self.skip(f'no topology information found for partition {pname!r}') + + self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core + + @run_before('compile') + def set_compiler_flags(self): + '''Set build flags for the different environments.''' + envname = self.current_environ.name + if envname in ('PrgEnv-cray', 'PrgEnv-gnu'): + self.build_system.cflags += ['-fopenmp', '-O3'] + elif envname in ('PrgEnv-intel'): + self.build_system.cflags += ['-qopenmp', '-O3'] + elif envname in ('PrgEnv-pgi', 'PrgEnv-nvidia'): + self.build_system.cflags += ['-mp', '-O3'] + elif envname in ('PrgEnv-fujitsu'): + self.build_system.cflags += ['-fopenmp', '-mt', '-O3'] + self.build_system.ldflags += ['-mt'] - self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3']) - if envname == 'PrgEnv-pgi': + @run_before('run') + def set_env_vars(self): + '''Special environment treatment for the PrgEnv-pgi.''' + if self.current_environ.name == 'PrgEnv-pgi': self.variables['OMP_PROC_BIND'] = 'true' - try: - self.reference = self.stream_bw_reference[envname] - except KeyError: - self.reference = self.stream_bw_reference['PrgEnv-gnu'] + @run_before('performance') + def set_triad_references(self): + '''Set performance refs as defined in ``triad_reference``.''' + envname = self.current_environ.name + if envname in self.triad_reference: + self.reference = self.triad_reference[envname] diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index cc6377b165..276ebfbff0 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -4,39 +4,59 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as 
rfm -import reframe.utility.sanity as sn +from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth @rfm.simple_test -class StridedBandwidthTest(rfm.RegressionTest): - sourcepath = 'strides.cpp' - build_system = 'SingleSource' +class strided_bandwidth_check(StridedBandwidth): + '''Strided bandwidth check. + + This test is parameterized with the ``stride_bytes`` parameter, covering + the following scenarios: 8-byte, 64-byte and 128-byte strides. + + This test requires the ``num_cpus`` variable, which is set in a post-setup + hook. + ''' + valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', - 'eiger:mc', 'pilatus:mc'] + 'eiger:mc', 'pilatus:mc', 'ault:a64fx'] valid_prog_environs = ['PrgEnv-gnu'] + + # Define the stride parameter + stride_bytes = parameter([8, 64, 128]) + + # Set required variables num_tasks = 1 - num_tasks_per_node = 1 - maintainers = ['SK'] tags = {'benchmark', 'diagnostic'} - stride_bytes = parameter([8, 64, 128]) + + # Bandwidth references reference_bw = { 8: { 'haswell': (50, -0.1, 0.1, 'GB/s'), 'broadwell': (100, -0.1, 0.1, 'GB/s'), - 'zen2': (270, -0.1, 0.1, 'GB/s') + 'zen2': (270, -0.1, 0.1, 'GB/s'), + 'a64fx': (50, -0.1, 0.1, 'GB/s') }, 64: { 'haswell': (6, -0.1, 0.2, 'GB/s'), 'broadwell': (12.5, -0.1, 0.2, 'GB/s'), - 'zen2': (33, -0.1, 0.2, 'GB/s') + 'zen2': (33, -0.1, 0.2, 'GB/s'), + 'a64fx': (45, -0.1, 0.1, 'GB/s') }, 128: { 'haswell': (4.5, -0.1, 0.2, 'GB/s'), 'broadwell': (9.1, -0.1, 0.2, 'GB/s'), - 'zen2': (33, -0.1, 0.2, 'GB/s') + 'zen2': (33, -0.1, 0.2, 'GB/s'), + 'a64fx': (25, -0.1, 0.1, 'GB/s') }, } + @run_after('init') + def set_valid_systems(self): + cp = self.current_system.name + if cp == 'ault': + self.valid_prog_environs = ['PrgEnv-fujitsu'] + @run_after('setup') def skip_if_no_topo(self): proc = self.current_partition.processor @@ -44,16 +64,6 @@ def skip_if_no_topo(self): if not proc.info: self.skip(f'no topology information found for partition {pname!r}') - @sanity_function - def 
assert_num_tasks(self): - return sn.assert_eq(sn.count(sn.findall(r'bandwidth', self.stdout)), - self.num_tasks) - - @performance_function('GB/s') - def bandwidth(self): - return sn.extractsingle(r'bandwidth: (?P\S+) GB/s', - self.stdout, 'bw', float) - @run_before('run') def set_exec_opts(self): proc = self.current_partition.processor diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py new file mode 100644 index 0000000000..fb95bdeec6 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py @@ -0,0 +1,84 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Dgemm'] + + +class Dgemm(rfm.RegressionTest, pin_prefix=True): + '''Dgemm benchmark. + + Derived test must specify the variables ``num_tasks`` and + ``num_cpus_per_task``. + + The matrix sizes can be controlled through executable options. By default, + this test sets these as ``m=6144``, ``n=12288`` and ``k=3072``. Derived + tests are free to change these parameters at their convenience. The + performance of this tests is measured by the lowest performing node in + ``Gflops/s``. 
+ ''' + + num_tasks = required + num_cpus_per_task = required + + descr = 'DGEMM performance test' + sourcepath = 'dgemm.c' + use_multithreading = False + executable_opts = ['6144', '12288', '3072'] + build_system = 'SingleSource' + maintainers = ['AJ', 'VH'] + + @run_before('compile') + def set_c_flags(self): + self.build_system.cflags += ['-O3'] + + @run_before('run') + def set_env_vars(self): + '''Set the environment variables.''' + + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'OMP_BIND': 'cores', + 'OMP_PROC_BIND': 'spread', + 'OMP_SCHEDULE': 'static' + } + + @deferrable + def get_nodenames(self): + '''Return a set with the participating node IDs.''' + + return set(sn.extractall( + r'(?P\S+):\s+Time for \d+ DGEMM operations', + self.stdout, 'hostname' + )) + + @sanity_function + def assert_all_nodes_completed(self): + '''Assert that all requested nodes have completed.''' + + return sn.assert_eq( + self.job.num_tasks, sn.count(self.get_nodenames()), + msg='some nodes did not complete' + ) + + @deferrable + def get_node_performance(self, nodeid): + '''Get the performance data from a specific ``nodeid``.''' + + return sn.extractsingle( + r'%s:\s+Avg\. 
performance\s+:\s+(?P\S+)\sGflop/s' % nodeid, + self.stdout, 'gflops', float + ) + + @performance_function('Gflops/s') + def min_perf(self): + '''Get the lowest performance from all nodes.''' + + return sn.min([ + self.get_node_performance(nid) for nid in self.get_nodenames() + ]) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c b/hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c similarity index 100% rename from cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c rename to hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py new file mode 100644 index 0000000000..4ba107d803 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py @@ -0,0 +1,75 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn +import reframe.utility.typecheck as typ + +__all__ = ['CpuLatency'] + + +class CpuLatency(rfm.RegressionTest, pin_prefix=True): + ''' CPU latency test. + + Derived tests must set the variables ``buffer_sizes`` and ``num_tasks``. + The variable ``buffer_sizes`` is a list of the different buffer sizes to + be used on this latency test. The executable will return the latency in + ``ns`` for each of the buffer sizes specified in this list. + + This test assumes that the list of buffer sizes is provided in increasing + order, and this test will automatically extract a performance variable for + the latency of each buffer. These performance variables are named + ``latencyL1``, ``latencyL2`` and so on in increasing order. 
+ ''' + + # Required variables + buffer_sizes = variable(typ.List[str]) + num_tasks = required + + sourcepath = 'latency.cpp' + build_system = 'SingleSource' + num_tasks_per_node = 1 + maintainers = ['SK', 'JO'] + + @run_before('compile') + def set_compiler_flags(self): + self.build_system.cxxflags = ['-O3'] + + @run_before('run') + def set_exc_opts(self): + '''Set the ``buffer_sizes`` as the executable options.''' + self.executable_opts = self.buffer_sizes + + @sanity_function + def count_reported_latencies(self): + '''Verify the number of reported latency measurements.''' + + return sn.assert_eq( + sn.count(sn.findall(r'latency \(ns\)', self.stdout)), + self.num_tasks*sn.count(self.executable_opts) + ) + + @deferrable + def get_latency(self, buffer_size): + '''Extract the worst latency for a given buffer size.''' + + return sn.max(sn.extractall( + r'latency \(ns\) for input size %s: (?P\S+) clocks' % + buffer_size, self.stdout, 'bw', float + )) + + @run_before('performance') + def set_perf_patterns(self): + '''Set the performance functions to extract all latency levels. + + The levels are named from ``L1`` to ``L{n}``, where ``n`` is the + length of ``buffer_sizes``. + ''' + + self.perf_variables = { + f'latencyL{i+1}': sn.make_performance_function( + self.get_latency(buff), 'ns' + ) for i, buff in enumerate(self.buffer_sizes) + } diff --git a/cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp b/hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp rename to hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py new file mode 100644 index 0000000000..d4c5f893a3 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. 
See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Stream'] + + +class Stream(rfm.RegressionTest): + '''Stream benchmark. + + For info on the executable, see the executable sources. + + Derived tests must set the variables ``num_tasks`` and + ``num_cpus_per_task``. + ''' + + # Required variables + num_tasks = required + num_cpus_per_task = required + + descr = 'STREAM Benchmark' + exclusive_access = True + use_multithreading = False + prebuild_cmds = [ + 'wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c', + ] + sourcepath = 'stream.c' + build_system = 'SingleSource' + num_tasks_per_node = 1 + variables = { + 'OMP_PLACES': 'threads', + 'OMP_PROC_BIND': 'spread' + } + maintainers = ['RS', 'SK'] + + @run_before('run') + def set_omp_num_threads(self): + '''Set the number of OMP threads to ``num_cpus_per_task``.''' + self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}' + + @sanity_function + def assert_solution_is_validated(self): + return sn.assert_found( + r'Solution Validates: avg error less than', self.stdout + ) + + @performance_function('MB/s', perf_key='triad') + def extract_min_triad(self): + return sn.min(sn.extractall( + r'Triad:\s+(?P\S+)\s+\S+', self.stdout, 'triad', float + )) + + @performance_function('MB/s', perf_key='add') + def extract_min_add(self): + return sn.min(sn.extractall( + r'Add:\s+(?P\S+)\s+\S+', self.stdout, 'add', float + )) + + @performance_function('MB/s', perf_key='copy') + def extract_min_copy(self): + return sn.min(sn.extractall( + r'Copy:\s+(?P\S+)\s+\S+', self.stdout, 'copy', float + )) + + @performance_function('MB/s', perf_key='scale') + def extract_min_scale(self): + return sn.min(sn.extractall( + r'Scale:\s+(?P\S+)\s+\S+', self.stdout, 'scale', float + )) diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py new 
file mode 100644 index 0000000000..f8af9a8ee5 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['StridedBandwidth'] + + +class StridedBandwidth(rfm.RegressionTest, pin_prefix=True): + '''Strided bandwidth benchmark. + + The executable takes three required arguments. These are the buffer size + (in bytes), the stride (in multiples of 8 bytes) and the number of threads + to run this application with. + + Derived tests must set the parameter ``stride_bytes``, and the variables + ``num_cpus`` and ``num_tasks``. + + The performance stage measures the bandwidth in GB/s. + ''' + + #: Parameter that controls the stride access pattern. + #: This parameter must be overridden by the derived class. + #: + #: :default: ``()`` + stride_bytes = parameter() + + #: Set the number of cpus per node. + #: + #: :default: ``required`` + num_cpus = variable(int) + + # Required variables + num_tasks = required + + sourcepath = 'strides.cpp' + build_system = 'SingleSource' + num_tasks_per_node = 1 + maintainers = ['SK'] + + @run_before('run') + def set_exec_opts(self): + '''Set the exec options. + + In order, these are the buffer size, stride and number of threads. See + the main docstring above for more info. 
+ ''' + self.executable_opts = [ + '100000000', f'{self.stride_bytes}', f'{self.num_cpus}' + ] + + @sanity_function + def assert_bandwidth_is_reported(self): + ''' Assert that the bandwidth is reported for all the tasks.''' + + return sn.assert_eq( + sn.count(sn.findall(r'bandwidth:', self.stdout)), + self.job.num_tasks + ) + + @performance_function('GB/s') + def bandwidth(self): + '''Extract the min bandwidth as a performance metric.''' + + return sn.min( + sn.extractall( + r'bandwidth: (?P\S+) GB/s', + self.stdout, 'bw', float + ) + ) diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp rename to hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp