diff --git a/config/cscs.py b/config/cscs.py index 70ed35501d..0013afc391 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -39,7 +39,8 @@ ], 'environs': [ 'builtin', - 'PrgEnv-gnu' + 'PrgEnv-gnu', + 'PrgEnv-fujitsu' ], 'descr': 'Fujitsu A64FX CPUs', 'max_jobs': 100, @@ -857,6 +858,18 @@ 'cxx': 'mpicxx', 'ftn': 'mpif90' }, + { + 'name': 'PrgEnv-fujitsu', + 'target_systems': [ + 'ault' + ], + 'modules': [ + 'a64fxsdk' + ], + 'cc': 'mpifccpx', + 'cxx': 'mpiFCCpx', + 'ftn': 'mpifrtpx' + }, { 'name': 'builtin', 'target_systems': [ diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py index 632c527f4a..8ea51f62d6 100644 --- a/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py +++ b/cscs-checks/microbenchmarks/cpu/dgemm/dgemm.py @@ -4,105 +4,86 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.dgemm import Dgemm @rfm.simple_test -class DGEMMTest(rfm.RegressionTest): - def __init__(self): - self.descr = 'DGEMM performance test' - self.sourcepath = 'dgemm.c' - self.sanity_patterns = self.eval_sanity() +class dgemm_check(Dgemm): + '''CSCS DGEMM check. + + The matrix dimensions are set in the base class. + Every node reports its performance in Gflops/s. To do so, this class + overrides the performance patterns and references from the base test. + This is done in the ``set_perf_patterns`` pre-performance hook. 
+ ''' + + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', + 'eiger:mc', 'pilatus:mc', 'ault:a64fx' + ] + num_tasks = 0 + sys_reference = variable( + dict, value={ + 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), + 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), + 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), + 'ault:a64fx': (1930.0, -0.15, None, 'Gflop/s'), + '*': (None, None, None, 'Gflop/s'), + }, + ) + tags = {'benchmark', 'diagnostic', 'craype'} - # the perf patterns are automaticaly generated inside sanity - self.perf_patterns = {} - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', - 'eiger:mc', 'pilatus:mc'] + @run_after('init') + def set_valid_prog_environs(self): if self.current_system.name in ['daint', 'dom']: self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel'] elif self.current_system.name in ['arolla', 'tsa']: self.valid_prog_environs = ['PrgEnv-gnu-nompi'] elif self.current_system.name in ['eiger', 'pilatus']: self.valid_prog_environs = ['PrgEnv-gnu'] - else: - self.valid_prog_environs = [] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] - self.num_tasks = 0 - self.use_multithreading = False - self.executable_opts = ['6144', '12288', '3072'] - self.build_system = 'SingleSource' - self.build_system.cflags = ['-O3'] - self.sys_reference = { - 'daint:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'daint:mc': (1040.0, -0.15, None, 'Gflop/s'), - 'dom:gpu': (300.0, -0.15, None, 'Gflop/s'), - 'dom:mc': (1040.0, -0.15, None, 'Gflop/s'), - 'eiger:mc': (3200.0, -0.15, None, 'Gflop/s'), - 'pilatus:mc': (3200.0, -0.15, None, 'Gflop/s'), - } - self.maintainers = ['AJ', 'VH'] - self.tags = {'benchmark', 'diagnostic', 'craype'} + @run_after('setup') + def 
skip_incompatible_combinations(self): + '''Fujitsu env only available in ault's a64fx partition.''' + if self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.skip_if( + self.current_partition.fullname not in ('ault:a64fx') + ) + + @run_after('setup') + def set_num_cpus_per_task(self): + proc = self.current_partition.processor + pname = self.current_partition.fullname + if not proc.info: + self.skip(f'no topology information found for partition {pname!r}') + + self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core @run_before('compile') - def setflags(self): + def set_flags(self): if self.current_environ.name.startswith('PrgEnv-gnu'): self.build_system.cflags += ['-fopenmp'] elif self.current_environ.name.startswith('PrgEnv-intel'): self.build_system.cppflags = [ '-DMKL_ILP64', '-I${MKLROOT}/include' ] - self.build_system.cflags = ['-qopenmp'] + self.build_system.cflags += ['-qopenmp'] self.build_system.ldflags = [ '-mkl', '-static-intel', '-liomp5', '-lpthread', '-lm', '-ldl' ] + elif self.current_environ.name.startswith('PrgEnv-fujitsu'): + self.build_system.cflags += ['-fopenmp', '-Nlibomp', '-mt'] + self.build_system.ldflags += ['-SSL2BLAMP', '-mt'] if self.current_partition.fullname in ['arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn']: self.build_system.cflags += ['-I$EBROOTOPENBLAS/include'] self.build_system.ldflags = ['-L$EBROOTOPENBLAS/lib', '-lopenblas', '-lpthread', '-lgfortran'] - - @run_before('run') - def set_tasks(self): - if self.current_partition.fullname in ['daint:gpu', 'dom:gpu']: - self.num_cpus_per_task = 12 - elif self.current_partition.fullname in ['daint:mc', 'dom:mc']: - self.num_cpus_per_task = 36 - elif self.current_partition.fullname in ['arolla:cn', 'tsa:cn']: - self.num_cpus_per_task = 16 - elif self.current_partition.fullname in ['arolla:pn', 'tsa:pn']: - self.num_cpus_per_task = 40 - elif self.current_partition.fullname in ['eiger:mc', 'pilatus:mc']: - self.num_cpus_per_task = 128 - - if self.num_cpus_per_task: - 
self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'OMP_BIND': 'cores', - 'OMP_PROC_BIND': 'spread', - 'OMP_SCHEDULE': 'static' - } - - @sn.sanity_function - def eval_sanity(self): - all_tested_nodes = sn.evaluate(sn.extractall( - r'(?P\S+):\s+Time for \d+ DGEMM operations', - self.stdout, 'hostname')) - num_tested_nodes = len(all_tested_nodes) - failure_msg = ('Requested %s node(s), but found %s node(s)' % - (self.job.num_tasks, num_tested_nodes)) - sn.evaluate(sn.assert_eq(num_tested_nodes, self.job.num_tasks, - msg=failure_msg)) - - for hostname in all_tested_nodes: - partition_name = self.current_partition.fullname - ref_name = '%s:%s' % (partition_name, hostname) - self.reference[ref_name] = self.sys_reference.get( - partition_name, (0.0, None, None, 'Gflop/s') - ) - self.perf_patterns[hostname] = sn.extractsingle( - r'%s:\s+Avg\. performance\s+:\s+(?P\S+)' - r'\sGflop/s' % hostname, self.stdout, 'gflops', float) - - return True diff --git a/cscs-checks/microbenchmarks/cpu/latency/latency.py b/cscs-checks/microbenchmarks/cpu/latency/latency.py index a766ee4554..583516dad6 100644 --- a/cscs-checks/microbenchmarks/cpu/latency/latency.py +++ b/cscs-checks/microbenchmarks/cpu/latency/latency.py @@ -4,61 +4,73 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -import reframe.utility.sanity as sn + +from hpctestlib.microbenchmarks.cpu.latency import CpuLatency @rfm.simple_test -class CPULatencyTest(rfm.RegressionTest): - sourcepath = 'latency.cpp' - build_system = 'SingleSource' - valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', - 'ault:intel', 'ault:amdvega', 'tave:compute'] - valid_prog_environs = ['PrgEnv-gnu'] +class cpu_latency_check(CpuLatency): + '''CPU latency check. + + This check performs a latency test for each provided buffer size. 
+ ''' + # Set required variables + buffer_sizes = ['16000', '128000', '8000000', '500000000'] num_tasks = 0 - num_tasks_per_node = 1 - executable_opts = ['16000', '128000', '8000000', '500000000'] + + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'ault:intel', 'ault:amdvega', 'tave:compute', 'ault:a64fx' + ] + valid_prog_environs = ['PrgEnv-gnu'] reference = { 'dom:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'dom:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 'daint:mc': { - 'latencyL1': (1.21, -0.01, 0.26, 'ns'), - 'latencyL2': (3.65, -0.01, 0.26, 'ns'), - 'latencyL3': (18.83, -0.01, 0.05, 'ns'), - 'latencyMem': (76.6, -0.01, 0.05, 'ns') + 'latencyL1': (1.21, -0.01, 0.26, 'ns'), + 'latencyL2': (3.65, -0.01, 0.26, 'ns'), + 'latencyL3': (18.83, -0.01, 0.05, 'ns'), + 'latencyL4': (76.6, -0.01, 0.05, 'ns') }, 'daint:gpu': { - 'latencyL1': (1.14, -0.01, 0.26, 'ns'), - 'latencyL2': (3.44, -0.01, 0.26, 'ns'), - 'latencyL3': (15.65, -0.01, 0.05, 'ns'), - 'latencyMem': (71.7, -0.01, 0.05, 'ns') + 'latencyL1': (1.14, -0.01, 0.26, 'ns'), + 'latencyL2': (3.44, -0.01, 0.26, 'ns'), + 'latencyL3': (15.65, -0.01, 0.05, 'ns'), + 'latencyL4': (71.7, -0.01, 0.05, 'ns') }, 'ault:intel': { - 'latencyL1': (1.08, -0.01, 0.26, 'ns'), - 'latencyL2': (3.8, -0.01, 0.26, 'ns'), - 'latencyL3': (21.5, -0.01, 0.05, 'ns'), - 'latencyMem': (86.5, -0.01, 0.05, 'ns') + 'latencyL1': 
(1.08, -0.01, 0.26, 'ns'), + 'latencyL2': (3.8, -0.01, 0.26, 'ns'), + 'latencyL3': (21.5, -0.01, 0.05, 'ns'), + 'latencyL4': (86.5, -0.01, 0.05, 'ns') }, 'ault:amdvega': { - 'latencyL1': (1.32, -0.01, 0.26, 'ns'), - 'latencyL2': (4.02, -0.01, 0.26, 'ns'), - 'latencyL3': (14.4, -0.01, 0.26, 'ns'), - 'latencyMem': (90.0, -0.01, 0.05, 'ns') + 'latencyL1': (1.32, -0.01, 0.26, 'ns'), + 'latencyL2': (4.02, -0.01, 0.26, 'ns'), + 'latencyL3': (14.4, -0.01, 0.26, 'ns'), + 'latencyL4': (90.0, -0.01, 0.05, 'ns') }, 'tave:compute': { - 'latencyL1': (2.86, -0.01, 0.05, 'ns'), - 'latencyL2': (12.15, -0.01, 0.05, 'ns'), - 'latencyL3': (137, -0.01, 0.05, 'ns'), - 'latencyMem': (150, -0.05, 0.05, 'ns') + 'latencyL1': (2.86, -0.01, 0.05, 'ns'), + 'latencyL2': (12.15, -0.01, 0.05, 'ns'), + 'latencyL3': (137, -0.01, 0.05, 'ns'), + 'latencyL4': (150, -0.05, 0.05, 'ns') + }, + 'ault:a64fx': { + 'latencyL1': (2.78, None, 0.05, 'ns'), + 'latencyL2': (14.3, None, 0.05, 'ns'), + 'latencyL3': (32.1, None, 0.05, 'ns'), + 'latencyL4': (146, None, 0.05, 'ns') }, } maintainers = ['SK'] @@ -71,34 +83,7 @@ def set_modules(self): if self.current_system.name in {'tave'}: self.modules = ['craype-hugepages512M'] - @run_before('compile') - def set_flags(self): - self.build_system.cxxflags = ['-O3'] - - @sanity_function - def assert_success(self): - return sn.assert_eq( - sn.count(sn.findall(r'latency', self.stdout)), - self.num_tasks * len(self.executable_opts) - ) - - def lat_pattern(self, index): - return sn.extractsingle( - r'latency \(ns\) for input size %s: (?P\S+) clocks' % - self.executable_opts[index], self.stdout, 'bw', float) - - @performance_function('ns') - def latencyL1(self): - return self.lat_pattern(0) - - @performance_function('ns') - def latencyL2(self): - return self.lat_pattern(1) - - @performance_function('ns') - def latencyL3(self): - return self.lat_pattern(2) - - @performance_function('ns') - def latencyMem(self): - return self.lat_pattern(3) + @run_after('init') + def 
set_valid_environs(self): + if self.current_system.name in {'ault'}: + self.valid_prog_environs = ['PrgEnv-fujitsu'] diff --git a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c b/cscs-checks/microbenchmarks/cpu/stream/src/stream.c deleted file mode 100644 index b9a2cee3b2..0000000000 --- a/cscs-checks/microbenchmarks/cpu/stream/src/stream.c +++ /dev/null @@ -1,585 +0,0 @@ -/*-----------------------------------------------------------------------*/ -/* Program: STREAM */ -/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ -/* Original code developed by John D. McCalpin */ -/* Programmers: John D. McCalpin */ -/* Joe R. Zagar */ -/* */ -/* This program measures memory transfer rates in MB/s for simple */ -/* computational kernels coded in C. */ -/*-----------------------------------------------------------------------*/ -/* Copyright 1991-2013: John D. McCalpin */ -/*-----------------------------------------------------------------------*/ -/* License: */ -/* 1. You are free to use this program and/or to redistribute */ -/* this program. */ -/* 2. You are free to modify this program for your own use, */ -/* including commercial use, subject to the publication */ -/* restrictions in item 3. */ -/* 3. You are free to publish results obtained from running this */ -/* program, or from works that you derive from this program, */ -/* with the following limitations: */ -/* 3a. In order to be referred to as "STREAM benchmark results", */ -/* published results must be in conformance to the STREAM */ -/* Run Rules, (briefly reviewed below) published at */ -/* http://www.cs.virginia.edu/stream/ref.html */ -/* and incorporated herein by reference. */ -/* As the copyright holder, John McCalpin retains the */ -/* right to determine conformity with the Run Rules. */ -/* 3b. Results based on modified source code or on runs not in */ -/* accordance with the STREAM Run Rules must be clearly */ -/* labelled whenever they are published. 
Examples of */ -/* proper labelling include: */ -/* "tuned STREAM benchmark results" */ -/* "based on a variant of the STREAM benchmark code" */ -/* Other comparable, clear, and reasonable labelling is */ -/* acceptable. */ -/* 3c. Submission of results to the STREAM benchmark web site */ -/* is encouraged, but not required. */ -/* 4. Use of this program or creation of derived works based on this */ -/* program constitutes acceptance of these licensing restrictions. */ -/* 5. Absolutely no warranty is expressed or implied. */ -/*-----------------------------------------------------------------------*/ -# include -# include -# include -# include -# include -# include - -/*----------------------------------------------------------------------- - * INSTRUCTIONS: - * - * 1) STREAM requires different amounts of memory to run on different - * systems, depending on both the system cache size(s) and the - * granularity of the system timer. - * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) - * to meet *both* of the following criteria: - * (a) Each array must be at least 4 times the size of the - * available cache memory. I don't worry about the difference - * between 10^6 and 2^20, so in practice the minimum array size - * is about 3.8 times the cache size. - * Example 1: One Xeon E3 with 8 MB L3 cache - * STREAM_ARRAY_SIZE should be >= 4 million, giving - * an array size of 30.5 MB and a total memory requirement - * of 91.5 MB. - * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) - * STREAM_ARRAY_SIZE should be >= 20 million, giving - * an array size of 153 MB and a total memory requirement - * of 458 MB. - * (b) The size should be large enough so that the 'timing calibration' - * output by the program is at least 20 clock-ticks. - * Example: most versions of Windows have a 10 millisecond timer - * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. - * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. 
- * This means the each array must be at least 1 GB, or 128M elements. - * - * Version 5.10 increases the default array size from 2 million - * elements to 10 million elements in response to the increasing - * size of L3 caches. The new default size is large enough for caches - * up to 20 MB. - * Version 5.10 changes the loop index variables from "register int" - * to "ssize_t", which allows array indices >2^32 (4 billion) - * on properly configured 64-bit systems. Additional compiler options - * (such as "-mcmodel=medium") may be required for large memory runs. - * - * Array size can be set at compile time without modifying the source - * code for the (many) compilers that support preprocessor definitions - * on the compile line. E.g., - * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M - * will override the default size of 10M with a new size of 100M elements - * per array. - */ -#ifndef STREAM_ARRAY_SIZE -# define STREAM_ARRAY_SIZE 10000000 -#endif - -/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result - * for any iteration after the first, therefore the minimum value - * for NTIMES is 2. - * There are no rules on maximum allowable values for NTIMES, but - * values larger than the default are unlikely to noticeably - * increase the reported performance. - * NTIMES can also be set on the compile line without changing the source - * code using, for example, "-DNTIMES=7". - */ -#ifdef NTIMES -#if NTIMES<=1 -# define NTIMES 10 -#endif -#endif -#ifndef NTIMES -# define NTIMES 10 -#endif - -/* Users are allowed to modify the "OFFSET" variable, which *may* change the - * relative alignment of the arrays (though compilers may change the - * effective offset by making the arrays non-contiguous on some systems). - * Use of non-zero values for OFFSET can be especially helpful if the - * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. 
- * OFFSET can also be set on the compile line without changing the source - * code using, for example, "-DOFFSET=56". - */ -#ifndef OFFSET -# define OFFSET 0 -#endif - -/* - * 3) Compile the code with optimization. Many compilers generate - * unreasonably bad code before the optimizer tightens things up. - * If the results are unreasonably good, on the other hand, the - * optimizer might be too smart for me! - * - * For a simple single-core version, try compiling with: - * cc -O stream.c -o stream - * This is known to work on many, many systems.... - * - * To use multiple cores, you need to tell the compiler to obey the OpenMP - * directives in the code. This varies by compiler, but a common example is - * gcc -O -fopenmp stream.c -o stream_omp - * The environment variable OMP_NUM_THREADS allows runtime control of the - * number of threads/cores used when the resulting "stream_omp" program - * is executed. - * - * To run with single-precision variables and arithmetic, simply add - * -DSTREAM_TYPE=float - * to the compile line. - * Note that this changes the minimum array sizes required --- see (1) above. - * - * The preprocessor directive "TUNED" does not do much -- it simply causes the - * code to call separate functions to execute each kernel. Trivial versions - * of these functions are provided, but they are *not* tuned -- they just - * provide predefined interfaces to be replaced with tuned code. - * - * - * 4) Optional: Mail the results to mccalpin@cs.virginia.edu - * Be sure to include info that will help me understand: - * a) the computer hardware configuration (e.g., processor model, memory type) - * b) the compiler name/version and compilation flags - * c) any run-time information (such as OMP_NUM_THREADS) - * d) all of the output from the test case. - * - * Thanks! 
- * - *-----------------------------------------------------------------------*/ - -# define HLINE "-------------------------------------------------------------\n" - -# ifndef MIN -# define MIN(x,y) ((x)<(y)?(x):(y)) -# endif -# ifndef MAX -# define MAX(x,y) ((x)>(y)?(x):(y)) -# endif - -#ifndef STREAM_TYPE -#define STREAM_TYPE double -#endif - -static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], - b[STREAM_ARRAY_SIZE+OFFSET], - c[STREAM_ARRAY_SIZE+OFFSET]; - -static double avgtime[4] = {0}, maxtime[4] = {0}, - mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; - -static char *label[4] = {"Copy: ", "Scale: ", - "Add: ", "Triad: "}; - -static double bytes[4] = { - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, - 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE - }; - -extern double mysecond(); -extern void checkSTREAMresults(); -#ifdef TUNED -extern void tuned_STREAM_Copy(); -extern void tuned_STREAM_Scale(STREAM_TYPE scalar); -extern void tuned_STREAM_Add(); -extern void tuned_STREAM_Triad(STREAM_TYPE scalar); -#endif -#ifdef _OPENMP -extern int omp_get_num_threads(); -#endif -int -main() - { - int quantum, checktick(); - int BytesPerWord; - int k; - ssize_t j; - STREAM_TYPE scalar; - double t, times[4][NTIMES]; - - /* --- SETUP --- determine precision and check timing --- */ - - printf(HLINE); - printf("STREAM version $Revision: 5.10 $\n"); - printf(HLINE); - BytesPerWord = sizeof(STREAM_TYPE); - printf("This system uses %d bytes per array element.\n", - BytesPerWord); - - printf(HLINE); -#ifdef N - printf("***** WARNING: ******\n"); - printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); - printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); - printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); - printf("***** WARNING: ******\n"); 
-#endif - - printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); - printf("Memory per array = %.1f MiB (= %.1f GiB).\n", - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), - BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); - printf("Total memory required = %.1f MiB (= %.1f GiB).\n", - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), - (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); - printf("Each kernel will be executed %d times.\n", NTIMES); - printf(" The *best* time for each kernel (excluding the first iteration)\n"); - printf(" will be used to compute the reported bandwidth.\n"); - -#ifdef _OPENMP - printf(HLINE); -#pragma omp parallel - { -#pragma omp master - { - k = omp_get_num_threads(); - printf ("Number of Threads requested = %i\n",k); - } - } -#endif - -#ifdef _OPENMP - k = 0; -#pragma omp parallel -#pragma omp atomic - k++; - printf ("Number of Threads counted = %i\n",k); -#endif - - /* Get initial value for system clock. 
*/ -#pragma omp parallel for - for (j=0; j= 1) - printf("Your clock granularity/precision appears to be " - "%d microseconds.\n", quantum); - else { - printf("Your clock granularity appears to be " - "less than one microsecond.\n"); - quantum = 1; - } - - t = mysecond(); -#pragma omp parallel for - for (j = 0; j < STREAM_ARRAY_SIZE; j++) - a[j] = 2.0E0 * a[j]; - t = 1.0E6 * (mysecond() - t); - - printf("Each test below will take on the order" - " of %d microseconds.\n", (int) t ); - printf(" (= %d clock ticks)\n", (int) (t/quantum) ); - printf("Increase the size of the arrays if this shows that\n"); - printf("you are not getting at least 20 clock ticks per test.\n"); - - printf(HLINE); - - printf("WARNING -- The above is only a rough guideline.\n"); - printf("For best results, please be sure you know the\n"); - printf("precision of your system timer.\n"); - printf(HLINE); - - /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ - - scalar = 3.0; - for (k=0; k - -double mysecond() -{ - struct timeval tp; - struct timezone tzp; - int i; - - i = gettimeofday(&tp,&tzp); - return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); -} - -#ifndef abs -#define abs(a) ((a) >= 0 ? 
(a) : -(a)) -#endif -void checkSTREAMresults () -{ - STREAM_TYPE aj,bj,cj,scalar; - STREAM_TYPE aSumErr,bSumErr,cSumErr; - STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; - double epsilon; - ssize_t j; - int k,ierr,err; - - /* reproduce initialization */ - aj = 1.0; - bj = 2.0; - cj = 0.0; - /* a[] is modified during timing check */ - aj = 2.0E0 * aj; - /* now execute timing loop */ - scalar = 3.0; - for (k=0; k epsilon) { - err++; - printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,aj,a[j],abs((aj-a[j])/aAvgErr)); - } -#endif - } - } - printf(" For array a[], %d errors were found.\n",ierr); - } - if (abs(bAvgErr/bj) > epsilon) { - err++; - printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,bj,b[j],abs((bj-b[j])/bAvgErr)); - } -#endif - } - } - printf(" For array b[], %d errors were found.\n",ierr); - } - if (abs(cAvgErr/cj) > epsilon) { - err++; - printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); - ierr = 0; - for (j=0; j epsilon) { - ierr++; -#ifdef VERBOSE - if (ierr < 10) { - printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", - j,cj,c[j],abs((cj-c[j])/cAvgErr)); - } -#endif - } - } - printf(" For array c[], %d errors were 
found.\n",ierr); - } - if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); - } -#ifdef VERBOSE - printf ("Results Validation Verbose Results: \n"); - printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); - printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); - printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); -#endif -} - -#ifdef TUNED -/* stubs for "tuned" versions of the kernels */ -void tuned_STREAM_Copy() -{ - ssize_t j; -#pragma omp parallel for - for (j=0; j\S+)\s+\S+', - self.stdout, 'triad', float) - } - self.stream_bw_reference = { + valid_systems = [ + 'daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', + 'arolla:cn', 'arolla:pn', 'tsa:cn', 'tsa:pn', 'ault:a64fx' + ] + valid_prog_environs = [ + 'PrgEnv-cray', 'PrgEnv-gnu', 'PrgEnv-intel', 'PrgEnv-pgi', + 'PrgEnv-nvidia' + ] + triad_reference = variable( + dict, value={ 'PrgEnv-cray': { 'daint:gpu': {'triad': (44000, -0.05, None, 'MB/s')}, 'daint:mc': {'triad': (89000, -0.05, None, 'MB/s')}, @@ -87,23 +45,57 @@ def __init__(self): 'daint:mc': {'triad': (88500, -0.05, None, 'MB/s')}, 'dom:gpu': {'triad': (44500, -0.05, None, 'MB/s')}, 'dom:mc': {'triad': (88500, -0.05, None, 'MB/s')}, - } + }, + 'PrgEnv-fujitsu': { + 'ault:a64fx': {'triad': (85500, -0.05, None, 'MB/s')}, + }, } - self.tags = {'production', 'craype'} - self.maintainers = ['RS', 'SK'] + ) + num_tasks = 1 + tags = {'production', 'craype'} + + @run_after('init') + def filter_valid_prog_environs(self): + '''Special conditions for arolla and tsa.''' + if self.current_system.name in ['arolla', 'tsa']: + self.valid_prog_environs = ['PrgEnv-gnu'] + elif self.current_system.name in ['ault']: + self.valid_prog_environs = ['PrgEnv-fujitsu'] @run_after('setup') - def prepare_test(self): - self.num_cpus_per_task = self.stream_cpus_per_task.get( - self.current_partition.fullname, 1) - self.variables['OMP_NUM_THREADS'] = 
str(self.num_cpus_per_task) + def set_num_cpus_per_task(self): + '''Set the num cpus based on the autodetected topology.''' + proc = self.current_partition.processor + pname = self.current_partition.fullname + if not proc.info: + self.skip(f'no topology information found for partition {pname!r}') + + self.num_cpus_per_task = proc.num_cpus // proc.num_cpus_per_core + + @run_before('compile') + def set_compiler_flags(self): + '''Set build flags for the different environments.''' + envname = self.current_environ.name + if envname in ('PrgEnv-cray', 'PrgEnv-gnu'): + self.build_system.cflags += ['-fopenmp', '-O3'] + elif envname in ('PrgEnv-intel'): + self.build_system.cflags += ['-qopenmp', '-O3'] + elif envname in ('PrgEnv-pgi', 'PrgEnv-nvidia'): + self.build_system.cflags += ['-mp', '-O3'] + elif envname in ('PrgEnv-fujitsu'): + self.build_system.cflags += ['-fopenmp', '-mt', '-O3'] + self.build_system.ldflags += ['-mt'] - self.build_system.cflags = self.prgenv_flags.get(envname, ['-O3']) - if envname == 'PrgEnv-pgi': + @run_before('run') + def set_env_vars(self): + '''Special environment treatment for the PrgEnv-pgi.''' + if self.current_environ.name == 'PrgEnv-pgi': self.variables['OMP_PROC_BIND'] = 'true' - try: - self.reference = self.stream_bw_reference[envname] - except KeyError: - self.reference = self.stream_bw_reference['PrgEnv-gnu'] + @run_before('performance') + def set_triad_references(self): + '''Set performance refs as defined in ``triad_reference``.''' + envname = self.current_environ.name + if envname in self.triad_reference: + self.reference = self.triad_reference[envname] diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py index cc6377b165..276ebfbff0 100644 --- a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py +++ b/cscs-checks/microbenchmarks/cpu/strided_bandwidth/strides.py @@ -4,39 +4,59 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as 
rfm -import reframe.utility.sanity as sn +from hpctestlib.microbenchmarks.cpu.strided_bandwidth import StridedBandwidth @rfm.simple_test -class StridedBandwidthTest(rfm.RegressionTest): - sourcepath = 'strides.cpp' - build_system = 'SingleSource' +class strided_bandwidth_check(StridedBandwidth): + '''Strided bandwidth check. + + This test is parameterized with the ``stride_bytes`` parameter, covering + the following scenarios: 8-byte, 64-byte and 128-byte strides. + + This test requires the ``num_cpus`` variable, which is set in a post-setup + hook. + ''' + valid_systems = ['daint:gpu', 'dom:gpu', 'daint:mc', 'dom:mc', - 'eiger:mc', 'pilatus:mc'] + 'eiger:mc', 'pilatus:mc', 'ault:a64fx'] valid_prog_environs = ['PrgEnv-gnu'] + + # Define the stride parameter + stride_bytes = parameter([8, 64, 128]) + + # Set required variables num_tasks = 1 - num_tasks_per_node = 1 - maintainers = ['SK'] tags = {'benchmark', 'diagnostic'} - stride_bytes = parameter([8, 64, 128]) + + # Bandwidth references reference_bw = { 8: { 'haswell': (50, -0.1, 0.1, 'GB/s'), 'broadwell': (100, -0.1, 0.1, 'GB/s'), - 'zen2': (270, -0.1, 0.1, 'GB/s') + 'zen2': (270, -0.1, 0.1, 'GB/s'), + 'a64fx': (50, -0.1, 0.1, 'GB/s') }, 64: { 'haswell': (6, -0.1, 0.2, 'GB/s'), 'broadwell': (12.5, -0.1, 0.2, 'GB/s'), - 'zen2': (33, -0.1, 0.2, 'GB/s') + 'zen2': (33, -0.1, 0.2, 'GB/s'), + 'a64fx': (45, -0.1, 0.1, 'GB/s') }, 128: { 'haswell': (4.5, -0.1, 0.2, 'GB/s'), 'broadwell': (9.1, -0.1, 0.2, 'GB/s'), - 'zen2': (33, -0.1, 0.2, 'GB/s') + 'zen2': (33, -0.1, 0.2, 'GB/s'), + 'a64fx': (25, -0.1, 0.1, 'GB/s') }, } + @run_after('init') + def set_valid_systems(self): + cp = self.current_system.name + if cp == 'ault': + self.valid_prog_environs = ['PrgEnv-fujitsu'] + @run_after('setup') def skip_if_no_topo(self): proc = self.current_partition.processor @@ -44,16 +64,6 @@ def skip_if_no_topo(self): if not proc.info: self.skip(f'no topology information found for partition {pname!r}') - @sanity_function - def 
assert_num_tasks(self): - return sn.assert_eq(sn.count(sn.findall(r'bandwidth', self.stdout)), - self.num_tasks) - - @performance_function('GB/s') - def bandwidth(self): - return sn.extractsingle(r'bandwidth: (?P\S+) GB/s', - self.stdout, 'bw', float) - @run_before('run') def set_exec_opts(self): proc = self.current_partition.processor diff --git a/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py new file mode 100644 index 0000000000..fb95bdeec6 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/dgemm/__init__.py @@ -0,0 +1,84 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Dgemm'] + + +class Dgemm(rfm.RegressionTest, pin_prefix=True): + '''Dgemm benchmark. + + Derived test must specify the variables ``num_tasks`` and + ``num_cpus_per_task``. + + The matrix sizes can be controlled through executable options. By default, + this test sets these as ``m=6144``, ``n=12288`` and ``k=3072``. Derived + tests are free to change these parameters at their convenience. The + performance of this tests is measured by the lowest performing node in + ``Gflops/s``. 
+ ''' + + num_tasks = required + num_cpus_per_task = required + + descr = 'DGEMM performance test' + sourcepath = 'dgemm.c' + use_multithreading = False + executable_opts = ['6144', '12288', '3072'] + build_system = 'SingleSource' + maintainers = ['AJ', 'VH'] + + @run_before('compile') + def set_c_flags(self): + self.build_system.cflags += ['-O3'] + + @run_before('run') + def set_env_vars(self): + '''Set the environment variables.''' + + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'OMP_BIND': 'cores', + 'OMP_PROC_BIND': 'spread', + 'OMP_SCHEDULE': 'static' + } + + @deferrable + def get_nodenames(self): + '''Return a set with the participating node IDs.''' + + return set(sn.extractall( + r'(?P\S+):\s+Time for \d+ DGEMM operations', + self.stdout, 'hostname' + )) + + @sanity_function + def assert_all_nodes_completed(self): + '''Assert that all requested nodes have completed.''' + + return sn.assert_eq( + self.job.num_tasks, sn.count(self.get_nodenames()), + msg='some nodes did not complete' + ) + + @deferrable + def get_node_performance(self, nodeid): + '''Get the performance data from a specific ``nodeid``.''' + + return sn.extractsingle( + r'%s:\s+Avg\. 
performance\s+:\s+(?P\S+)\sGflop/s' % nodeid, + self.stdout, 'gflops', float + ) + + @performance_function('Gflops/s') + def min_perf(self): + '''Get the lowest performance from all nodes.''' + + return sn.min([ + self.get_node_performance(nid) for nid in self.get_nodenames() + ]) diff --git a/cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c b/hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c similarity index 100% rename from cscs-checks/microbenchmarks/cpu/dgemm/src/dgemm.c rename to hpctestlib/microbenchmarks/cpu/dgemm/src/dgemm.c diff --git a/hpctestlib/microbenchmarks/cpu/latency/__init__.py b/hpctestlib/microbenchmarks/cpu/latency/__init__.py new file mode 100644 index 0000000000..4ba107d803 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/latency/__init__.py @@ -0,0 +1,75 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn +import reframe.utility.typecheck as typ + +__all__ = ['CpuLatency'] + + +class CpuLatency(rfm.RegressionTest, pin_prefix=True): + ''' CPU latency test. + + Derived tests must set the variables ``buffer_sizes`` and ``num_tasks``. + The variable ``buffer_sizes`` is a list of the different buffer sizes to + be used on this latency test. The executable will return the latency in + ``ns`` for each of the buffer sizes specified in this list. + + This test assumes that the list of buffer sizes is provided in increasing + order, and this test will automatically extract a performance variable for + the latency of each buffer. These performance variables are named + ``latencyL1``, ``latencyL2`` and so on in increasing order. 
+ ''' + + # Required variables + buffer_sizes = variable(typ.List[str]) + num_tasks = required + + sourcepath = 'latency.cpp' + build_system = 'SingleSource' + num_tasks_per_node = 1 + maintainers = ['SK', 'JO'] + + @run_before('compile') + def set_compiler_flags(self): + self.build_system.cxxflags = ['-O3'] + + @run_before('run') + def set_exc_opts(self): + '''Set the ``buffer_sizes`` as the executable options.''' + self.executable_opts = self.buffer_sizes + + @sanity_function + def count_reported_latencies(self): + '''Verify the number of reported latency measurements.''' + + return sn.assert_eq( + sn.count(sn.findall(r'latency \(ns\)', self.stdout)), + self.num_tasks*sn.count(self.executable_opts) + ) + + @deferrable + def get_latency(self, buffer_size): + '''Extract the worst latency for a given buffer size.''' + + return sn.max(sn.extractall( + r'latency \(ns\) for input size %s: (?P\S+) clocks' % + buffer_size, self.stdout, 'bw', float + )) + + @run_before('performance') + def set_perf_patterns(self): + '''Set the performance functions to extract all latency levels. + + The levels are named from ``L1`` to ``L{n}``, where ``n`` is the + length of ``buffer_sizes``. + ''' + + self.perf_variables = { + f'latencyL{i+1}': sn.make_performance_function( + self.get_latency(buff), 'ns' + ) for i, buff in enumerate(self.buffer_sizes) + } diff --git a/cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp b/hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/latency/src/latency.cpp rename to hpctestlib/microbenchmarks/cpu/latency/src/latency.cpp diff --git a/hpctestlib/microbenchmarks/cpu/stream/__init__.py b/hpctestlib/microbenchmarks/cpu/stream/__init__.py new file mode 100644 index 0000000000..d4c5f893a3 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/stream/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. 
See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['Stream'] + + +class Stream(rfm.RegressionTest): + '''Stream benchmark. + + For info on the executable, see the executable sources. + + Derived tests must set the variables ``num_tasks`` and + ``num_cpus_per_task``. + ''' + + # Required variables + num_tasks = required + num_cpus_per_task = required + + descr = 'STREAM Benchmark' + exclusive_access = True + use_multithreading = False + prebuild_cmds = [ + 'wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c', + ] + sourcepath = 'stream.c' + build_system = 'SingleSource' + num_tasks_per_node = 1 + variables = { + 'OMP_PLACES': 'threads', + 'OMP_PROC_BIND': 'spread' + } + maintainers = ['RS', 'SK'] + + @run_before('run') + def set_omp_num_threads(self): + '''Set the number of OMP threads to ``num_cpus_per_task``.''' + self.variables['OMP_NUM_THREADS'] = f'{self.num_cpus_per_task}' + + @sanity_function + def assert_solution_is_validated(self): + return sn.assert_found( + r'Solution Validates: avg error less than', self.stdout + ) + + @performance_function('MB/s', perf_key='triad') + def extract_min_triad(self): + return sn.min(sn.extractall( + r'Triad:\s+(?P\S+)\s+\S+', self.stdout, 'triad', float + )) + + @performance_function('MB/s', perf_key='add') + def extract_min_add(self): + return sn.min(sn.extractall( + r'Add:\s+(?P\S+)\s+\S+', self.stdout, 'add', float + )) + + @performance_function('MB/s', perf_key='copy') + def extract_min_copy(self): + return sn.min(sn.extractall( + r'Copy:\s+(?P\S+)\s+\S+', self.stdout, 'copy', float + )) + + @performance_function('MB/s', perf_key='scale') + def extract_min_scale(self): + return sn.min(sn.extractall( + r'Scale:\s+(?P\S+)\s+\S+', self.stdout, 'scale', float + )) diff --git a/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py new 
file mode 100644 index 0000000000..f8af9a8ee5 --- /dev/null +++ b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn + + +__all__ = ['StridedBandwidth'] + + +class StridedBandwidth(rfm.RegressionTest, pin_prefix=True): + '''Strided bandwidth benchmark. + + The executable takes three required arguments. These are the buffer size + (in bytes), the stride (in multiples of 8 bytes) and the number of threads + to run this application with. + + Derived tests must set the parameter ``stride_bytes``, and the variables + ``num_cpus`` and ``num_tasks``. + + The performance stage measures the bandwidth in GB/s. + ''' + + #: Parameter that controls the stride access pattern. + #: This parameter must be overridden by the derived class. + #: + #: :default: ``()`` + stride_bytes = parameter() + + #: Set the number of cpus per node. + #: + #: :default: ``required`` + num_cpus = variable(int) + + # Required variables + num_tasks = required + + sourcepath = 'strides.cpp' + build_system = 'SingleSource' + num_tasks_per_node = 1 + maintainers = ['SK'] + + @run_before('run') + def set_exec_opts(self): + '''Set the exec options. + + In order, these are the buffer size, stride and number of threads. See + the main docstring above for more info. 
+ ''' + self.executable_opts = [ + '100000000', f'{self.stride_bytes}', f'{self.num_cpus}' + ] + + @sanity_function + def assert_bandwidth_is_reported(self): + ''' Assert that the bandwidth is reported for all the tasks.''' + + return sn.assert_eq( + sn.count(sn.findall(r'bandwidth:', self.stdout)), + self.job.num_tasks + ) + + @performance_function('GB/s') + def bandwidth(self): + '''Extract the min bandwidth as a performance metric.''' + + return sn.min( + sn.extractall( + r'bandwidth: (?P\S+) GB/s', + self.stdout, 'bw', float + ) + ) diff --git a/cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp b/hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp similarity index 100% rename from cscs-checks/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp rename to hpctestlib/microbenchmarks/cpu/strided_bandwidth/src/strides.cpp