From db3758e42458b870403acfda90eeedef203520cd Mon Sep 17 00:00:00 2001
From: casparl <caspar.vanleeuwen@surf.nl>
Date: Mon, 9 Nov 2020 10:33:36 +0100
Subject: [PATCH 01/12] Initial try of mpi hello world test

---
 tests/reframe/config/settings.py              | 66 +++++++++++++++++++
 tests/reframe/config/system_properties.py     |  2 +
 tests/reframe/eessi-checks/prgenv/mpi.py      | 20 ++++++
 .../eessi-checks/prgenv/src/mpi_hello_world.c | 43 ++++++++++++
 4 files changed, 131 insertions(+)
 create mode 100644 tests/reframe/config/settings.py
 create mode 100644 tests/reframe/config/system_properties.py
 create mode 100644 tests/reframe/eessi-checks/prgenv/mpi.py
 create mode 100644 tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c

diff --git a/tests/reframe/config/settings.py b/tests/reframe/config/settings.py
new file mode 100644
index 0000000000..978bc4552b
--- /dev/null
+++ b/tests/reframe/config/settings.py
@@ -0,0 +1,66 @@
+site_configuration = {
+    'systems': [
+        {
+            'name': 'example_system',
+            'descr': 'This is just an example system',
+            'modules_system': 'tmod4',
+	    'hostnames': ['login'],
+	    'partitions': [
+		{
+                    'name': 'normal',
+                    'scheduler': 'slurm',
+	            'launcher': 'srun',
+                    'access':  ['-p normal'],
+                    'environs': ['foss'],
+                    'descr': 'normal partition'
+                },
+             ]
+         },
+    ],
+    'environments': [
+        {
+            'name': 'foss',
+	    'modules': ['foss-2020a'],
+            'cc': 'mpicc',
+            'cxx': 'mpicxx',
+            'ftn': 'mpifort',
+        },
+     ],
+     'logging': [
+        {
+            'level': 'debug',
+            'handlers': [
+                {
+                    'type': 'stream',
+                    'name': 'stdout',
+                    'level': 'info',
+                    'format': '%(message)s'
+                },
+                {
+                    'type': 'file',
+                    'name': 'reframe.log',
+                    'level': 'debug',
+                    'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s',   # noqa: E501
+                    'append': False
+                }
+            ],
+            'handlers_perflog': [
+                {
+                    'type': 'filelog',
+                    'prefix': '%(check_system)s/%(check_partition)s',
+                    'level': 'info',
+                    'format': (
+                        '%(check_job_completion_time)s|reframe %(version)s|'
+                        '%(check_info)s|jobid=%(check_jobid)s|'
+                        '%(check_perf_var)s=%(check_perf_value)s|'
+                        'ref=%(check_perf_ref)s '
+                        '(l=%(check_perf_lower_thres)s, '
+                        'u=%(check_perf_upper_thres)s)|'
+                        '%(check_perf_unit)s'
+                    ),
+                    'append': True
+                }
+            ]
+        }
+    ],
+}
diff --git a/tests/reframe/config/system_properties.py b/tests/reframe/config/system_properties.py
new file mode 100644
index 0000000000..b27bc7c56c
--- /dev/null
+++ b/tests/reframe/config/system_properties.py
@@ -0,0 +1,2 @@
+
+ncorespernode=16
diff --git a/tests/reframe/eessi-checks/prgenv/mpi.py b/tests/reframe/eessi-checks/prgenv/mpi.py
new file mode 100644
index 0000000000..df1a06483c
--- /dev/null
+++ b/tests/reframe/eessi-checks/prgenv/mpi.py
@@ -0,0 +1,20 @@
+import os
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+# Try to use an import to define all site-specific things
+# import system_properties
+
+@rfm.simple_test
+class MpiHelloWorld(rfm.RegressionTest):
+    """MPI hello-world check built from mpi_hello_world.c."""
+    def __init__(self):
+        # We don't define these here to keep tests generic
+        # Sensible systems & programming environments should be defined in your site configuration file
+        self.valid_systems = ['*']
+        self.valid_prog_environs = ['*']
+
+        self.sourcepath = 'mpi_hello_world.c'
+        self.maintainers = ['casparvl']
+        # Hard-coded to match ncorespernode = 16 in config/system_properties.py; that import is still disabled.
+        self.num_tasks_per_node = 16
diff --git a/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c b/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c
new file mode 100644
index 0000000000..35458d5d68
--- /dev/null
+++ b/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <string.h>
+#include <mpi.h>
+
+#define MSG_SIZE_MAX 255
+
+
+int main(int argc, char **argv)
+{
+    const char *msg = "Hello, World!";
+    char msg_buff[MSG_SIZE_MAX+1];
+    size_t msg_len = strnlen(msg, MSG_SIZE_MAX);
+    int rank, num_tasks, i;
+    int dest = 0;
+    int tag  = 0;
+    int nr_correct = 0;
+    MPI_Status status;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (num_tasks < 2) {
+        fprintf(stderr, "Not enough tasks to run the test.\n");
+        MPI_Finalize();
+        return 1;
+    }
+
+    if (rank != 0) {
+        strncpy(msg_buff, msg, MSG_SIZE_MAX);
+        MPI_Send(msg_buff, msg_len+1, MPI_CHAR, dest, tag, MPI_COMM_WORLD);
+    } else {
+        for (i = 1; i < num_tasks; i++) {
+            MPI_Recv(msg_buff, msg_len+1, MPI_CHAR,
+                     i, tag, MPI_COMM_WORLD, &status);
+            if (!strncmp(msg, msg_buff, MSG_SIZE_MAX))
+                nr_correct++;
+        }
+        printf("Received correct messages from %d processes.\n", nr_correct);
+    }
+
+    MPI_Finalize();
+    return 0;
+}

From 5c6929d8daebeb1d756399a6ce022c4eec410627 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Mon, 29 Mar 2021 12:34:41 +0200
Subject: [PATCH 02/12] Added sources for TF2 synthetic benchmark

---
 .../src/tensorflow2_synthetic_benchmark.py    | 154 ++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py

diff --git a/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
new file mode 100644
index 0000000000..e78a551a10
--- /dev/null
+++ b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
@@ -0,0 +1,154 @@
+# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import os
+import numpy as np
+import timeit
+
+import tensorflow as tf
+from tensorflow.keras import applications
+from tensorflow.keras import mixed_precision
+
+def log(s, nl=True):
+    if args.use_horovod:
+        if hvd.rank() != 0:
+            return
+    print(s, end='\n' if nl else '')
+
+# Benchmark settings
+parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+parser.add_argument('--mixed-prec', action='store_true', default=False,
+                    help='Use mixed precision for training')
+
+parser.add_argument('--model', type=str, default='ResNet50',
+                    help='model to benchmark')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='input batch size')
+
+parser.add_argument('--num-warmup-batches', type=int, default=2,
+                    help='number of warm-up batches that don\'t count towards benchmark')
+parser.add_argument('--num-batches-per-iter', type=int, default=10,
+                    help='number of batches per benchmark iteration')
+parser.add_argument('--num-iters', type=int, default=10,
+                    help='number of benchmark iterations')
+
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+parser.add_argument('--use-horovod', action='store_true', default=False)
+
+args = parser.parse_args()
+args.cuda = not args.no_cuda
+
+# Horovod: initialize Horovod.
+if args.use_horovod:
+    import horovod.tensorflow as hvd
+    hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+if args.cuda:
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    if gpus and args.use_horovod:
+        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+else:
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# Inter-op parallelism is pinned to one thread for both CPU and GPU runs.
+tf.config.threading.set_inter_op_parallelism_threads(1)
+
+# Intra-op threads follow OMP_NUM_THREADS; 0 (variable unset) lets TensorFlow
+# choose a default instead of aborting with a KeyError.
+tf.config.threading.set_intra_op_parallelism_threads(int(os.environ.get('OMP_NUM_THREADS', 0)))
+
+if args.mixed_prec:
+    log('Running with mixed_float16 as global policy for the precision')
+    mixed_precision.set_global_policy('mixed_float16')
+
+# Set up standard model.
+model = getattr(applications, args.model)(weights=None)
+opt = tf.optimizers.SGD(0.01)
+
+data = tf.random.uniform([args.batch_size, 224, 224, 3])
+target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)
+
+
+@tf.function
+def benchmark_step(first_batch):
+    # Horovod: (optional) compression algorithm.
+    if args.use_horovod:
+        compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+    # Horovod: use DistributedGradientTape
+    with tf.GradientTape() as tape:
+        probs = model(data, training=True)
+        loss = tf.losses.sparse_categorical_crossentropy(target, probs)
+
+    # Horovod: add Horovod Distributed GradientTape.
+    if args.use_horovod:
+        tape = hvd.DistributedGradientTape(tape, compression=compression)
+
+    gradients = tape.gradient(loss, model.trainable_variables)
+    opt.apply_gradients(zip(gradients, model.trainable_variables))
+
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    #
+    # Note: broadcast should be done after the first gradient step to ensure optimizer
+    # initialization.
+    if args.use_horovod and first_batch:
+        hvd.broadcast_variables(model.variables, root_rank=0)
+        hvd.broadcast_variables(opt.variables(), root_rank=0)
+
+log('Model: %s' % args.model)
+log('Batch size: %d' % args.batch_size)
+device = 'GPU' if args.cuda else 'CPU'
+if args.use_horovod:
+    log('Number of %ss: %d' % (device, hvd.size()))
+else:
+    log('Number of %ss: %s' % (device, 1))
+
+
+with tf.device(device):
+    # Warm-up
+    log('Running warmup...')
+    benchmark_step(first_batch=True)
+
+    timeit.timeit(lambda: benchmark_step(first_batch=False),
+                  number=args.num_warmup_batches)
+
+    # Benchmark
+    log('Running benchmark...')
+    img_secs = []
+    for x in range(args.num_iters):
+        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
+                             number=args.num_batches_per_iter)
+        img_sec = args.batch_size * args.num_batches_per_iter / time
+        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
+        img_secs.append(img_sec)
+
+    # Results
+    img_sec_mean = np.mean(img_secs)
+    img_sec_conf = 1.96 * np.std(img_secs)
+    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
+    if args.use_horovod:
+        log('Total img/sec on %d %s(s): %.1f +-%.1f' %
+            (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))

From 9c345828fd38e285db7be1ff80d0b9d8b3911e21 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Mon, 29 Mar 2021 18:26:48 +0200
Subject: [PATCH 03/12] Updated example config and benchmark output for the TF2 test. WIP...

---
 tests/reframe/config/settings.py              | 36 ++++++++++++++-----
 .../src/tensorflow2_synthetic_benchmark.py    |  8 +++--
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/tests/reframe/config/settings.py b/tests/reframe/config/settings.py
index 978bc4552b..7c1beed2c7 100644
--- a/tests/reframe/config/settings.py
+++ b/tests/reframe/config/settings.py
@@ -3,28 +3,48 @@
         {
             'name': 'example_system',
             'descr': 'This is just an example system',
-            'modules_system': 'tmod4',
-	    'hostnames': ['login'],
+            'modules_system': 'tmod',
+	    'hostnames': ['login', 'int'],
 	    'partitions': [
 		{
-                    'name': 'normal',
+                    'name': 'short',
                     'scheduler': 'slurm',
-	            'launcher': 'srun',
-                    'access':  ['-p normal'],
-                    'environs': ['foss'],
+	            'launcher': 'mpirun',
+                    'access':  ['-p gpu_short'],
+                    'environs': ['foss', 'container'],
+                    'container_platforms': [
+                        {
+                            'type': 'Singularity',
+                            'modules': [],
+                            'variables': [['SLURM_MPI_TYPE', 'pmix']]
+                        }
+                    ],
+                    'processor': {
+                        'num_cpus': 16,
+                    },
+                    'devices': [
+                        {
+                            'type': 'gpu',
+                            'num_devices': 2,
+                        },
+                    ],
                     'descr': 'normal partition'
                 },
              ]
          },
-    ],
+     ],
     'environments': [
         {
             'name': 'foss',
-	    'modules': ['foss-2020a'],
+	    'modules': ['fosscuda/2020a'],
             'cc': 'mpicc',
             'cxx': 'mpicxx',
             'ftn': 'mpifort',
         },
+        {
+            'name': 'container',
+            'modules': [],
+        },
      ],
      'logging': [
         {
diff --git a/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
index e78a551a10..827b84a3ef 100644
--- a/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
+++ b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
@@ -150,5 +150,9 @@ def benchmark_step(first_batch):
     img_sec_conf = 1.96 * np.std(img_secs)
     log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
     if args.use_horovod:
-        log('Total img/sec on %d %s(s): %.1f +-%.1f' %
-            (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
+        ndevices = hvd.size()
+    else:
+        ndevices = 1
+    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
+        (ndevices, device, ndevices * img_sec_mean, ndevices * img_sec_conf))
+    log('Benchmark completed')

From 83db9d61800ed3afb257bee6d74a09bfdee1660f Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Mon, 29 Mar 2021 18:27:17 +0200
Subject: [PATCH 04/12] Added initial version of TF2 test. WIP...

---
 .../eessi-checks/applications/tensorflow2.py  | 151 ++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 tests/reframe/eessi-checks/applications/tensorflow2.py

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
new file mode 100644
index 0000000000..136af1d34d
--- /dev/null
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -0,0 +1,151 @@
+# This TensorFlow2 test is intended for single node, single GPU only
+# For multigpu and multinode tests, we use Horovod
+
+import os
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+class TensorFlow2Base(rfm.RunOnlyRegressionTest):
+
+    device = parameter(['cpu', 'gpu'])
+
+    def __init__(self):
+        self.valid_systems = ['*']
+
+        self.script = 'tensorflow2_synthetic_benchmark.py'
+        self.model = 'ResNet50'
+        self.batch_size = 32
+
+        self.sanity_patterns = sn.all([
+            sn.assert_found('Benchmark completed', self.stdout),
+        ])
+
+        self.perf_patterns = {
+            'throughput': sn.extractsingle(
+                rf'Total img/sec on {self.num_tasks} {self.device.upper()}\(s\): '
+                rf'(?P<throughput>\S+) \S+',
+                self.stdout, 'throughput', float),
+            f'throughput_per_{self.device}': sn.extractsingle(
+                rf'Img/sec per CPU: (?P<throughput_per_{self.device}>\S+) \S+',
+                self.stdout, f'throughput_per_{self.device}', float)
+        }
+        self.reference = {
+            '*': {
+                'throughput': (None, None, None, 'img/sec'),
+                f'throughput_per_{self.device}': (None, None, None, 'img/sec')
+            }
+        }
+
+        self.tags = {f'{self.device}'}
+
+        self.maintainers = ['casparvl']
+
+@rfm.simple_test
+class TensorFlow2Native(TensorFlow2Base):
+    def __init__(self):
+        super().__init__()
+
+        self.descr = 'TensorFlow 2.X single gpu test. Based on the Horovod tensorflow2_synthetic_benchmark.py example.'
+
+        self.tags.add('native')
+        self.valid_prog_environs = ['*']
+
+        self.modules = ['TensorFlow']
+        self.executable = 'python'
+
+        self.executable_opts = [
+            f'{self.script}',
+            f'--model {self.model}',
+            f'--batch-size {self.batch_size}',
+            '--num-iters 5',
+            '--num-batches-per-iter 5',
+            '--num-warmup-batches 5',
+        ]
+        if self.device == 'cpu':
+            self.executable_opts.append('--no-cuda')
+
+        self.tags.add('singlenode')
+
+class HorovodTensorFlow2Base(TensorFlow2Base):
+
+    scale = parameter(['singlenode', 'small', 'large'])
+
+    def __init__(self):
+        super().__init__()
+
+        if self.scale == 'singlenode':
+            self.num_nodes = 1
+        elif self.scale == 'small':
+            self.num_nodes = 4
+        elif self.scale == 'large':
+            self.num_nodes = 10
+        self.tags.add(self.scale)
+
+    @rfm.run_before('run')
+    def set_num_tasks(self):
+        if self.device == 'cpu':
+            # For now, keep it simple.
+            # In the future, we may want to launch 1 task per socket,
+            # and bind these tasks to their respective sockets.
+            self.num_tasks_per_node = 1
+        elif self.device == 'gpu':
+            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
+            #print(f'self.current_partition.devices: {self.current_partition.devices}')
+            #for dev in self.current_partition.devices:
+            #    print(dev)
+            #    print(dir(dev))
+            #    print(dev.info)
+            device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
+
+            assert(len(device_count) == 1)
+            # print(dir(self.current_partition.devices))
+            self.num_tasks_per_node = device_count[0]
+            self.num_tasks = self.num_tasks_per_node * self.num_nodes
+
+@rfm.simple_test
+class HorovodTensorFlow2Native(HorovodTensorFlow2Base):
+
+    def __init__(self):
+        super().__init__()
+
+        self.descr = 'TensorFlow 2.X with Horovod multi-node and multi-GPU test. Based on the Horovod tensorflow2_synthetic_benchmark.py example.'
+
+        self.tags.add('native')
+        self.valid_prog_environs = ['*']
+
+        self.modules = ['Horovod', 'TensorFlow']
+        self.executable = 'python'
+
+        self.executable_opts = [
+            f'{self.script}',
+            f'--model {self.model}',
+            f'--batch-size {self.batch_size}',
+            '--num-iters 5',
+            '--num-batches-per-iter 5',
+            '--num-warmup-batches 5',
+        ]
+
+
+# @rfm.parametrized_test(['cpu'], ['gpu'])
+# class TensorFlow2Container(TensorFlow2Base):
+#     def __init__(self, device):
+#         super().__init__(device)
+# 
+#         self.tags.add('container')
+#         self.valid_prog_environs = ['*']
+# 
+#         self.prerun_cmds = ['source shared_alien_cache_minimal.sh > /dev/null']
+# 
+#         self.container_platform = 'Singularity'
+#         self.container_platform.image = 'docker://eessi/client-pilot:centos7-$(uname -m)'
+#         self.container_platform.options = [
+#             '--fusemount "container:cvmfs2 cvmfs-config.eessi-hpc.org /cvmfs/cvmfs-config.eessi-hpc.org"',
+#             '--fusemount "container:cvmfs2 pilot.eessi-hpc.org /cvmfs/pilot.eessi-hpc.org"'
+#         ]
+# 
+#         self.container_platform.commands = [
+#             'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash',
+#             'module load TensorFlow',
+#             'python {self.script} --model {self.model} --batch-size {self.batch_size} --num-iters 5 --num-batches-per-iter 5 --num-warmup-batches 5'
+#         ]
+#         self.tags.add('singlenode')

From ac10f3d53b8539c31397c99f35d080323538209d Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Tue, 30 Mar 2021 16:20:16 +0200
Subject: [PATCH 05/12] Changed TF2 test so that it now runs with the correct
 number of tasks, threads, and uses the parameter builtin method of ReFrame

---
 .../eessi-checks/applications/tensorflow2.py   | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index 136af1d34d..56cc094479 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -22,11 +22,11 @@ def __init__(self):
 
         self.perf_patterns = {
             'throughput': sn.extractsingle(
-                rf'Total img/sec on {self.num_tasks} {self.device.upper()}\(s\): '
+                rf'Total img\/sec on [0-9]+ {self.device.upper()}\(s\): '
                 rf'(?P<throughput>\S+) \S+',
                 self.stdout, 'throughput', float),
             f'throughput_per_{self.device}': sn.extractsingle(
-                rf'Img/sec per CPU: (?P<throughput_per_{self.device}>\S+) \S+',
+                rf'Img\/sec per {self.device.upper()}: (?P<throughput_per_{self.device}>\S+) \S+',
                 self.stdout, f'throughput_per_{self.device}', float)
         }
         self.reference = {
@@ -90,17 +90,14 @@ def set_num_tasks(self):
             self.num_tasks_per_node = 1
         elif self.device == 'gpu':
             # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
-            #print(f'self.current_partition.devices: {self.current_partition.devices}')
-            #for dev in self.current_partition.devices:
-            #    print(dev)
-            #    print(dir(dev))
-            #    print(dev.info)
             device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
-
             assert(len(device_count) == 1)
-            # print(dir(self.current_partition.devices))
             self.num_tasks_per_node = device_count[0]
             self.num_tasks = self.num_tasks_per_node * self.num_nodes
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        self.variables = {
+            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+        }
 
 @rfm.simple_test
 class HorovodTensorFlow2Native(HorovodTensorFlow2Base):
@@ -123,7 +120,10 @@ def __init__(self):
             '--num-iters 5',
             '--num-batches-per-iter 5',
             '--num-warmup-batches 5',
+            '--use-horovod',
         ]
+        if self.device == 'cpu':
+            self.executable_opts.append('--no-cuda')
 
 
 # @rfm.parametrized_test(['cpu'], ['gpu'])

From 7b05b2e383246564e2fa3f51353841d185957a96 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surf.nl>
Date: Tue, 30 Mar 2021 15:13:16 +0000
Subject: [PATCH 06/12] Moved the setting of the number of tasks, threads, etc.
 to the base test, since it also applies to TensorFlow-only tests

---
 .../eessi-checks/applications/tensorflow2.py  | 49 ++++++++++---------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index 56cc094479..b27a0af97c 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -40,6 +40,27 @@ def __init__(self):
 
         self.maintainers = ['casparvl']
 
+    # Set number of tasks and threads (OMP_NUM_THREADS) based on current partition properties
+    @rfm.run_before('run')
+    def set_num_tasks(self):
+        if self.device == 'cpu':
+            # For now, keep it simple.
+            # In the future, we may want to launch 1 task per socket,
+            # and bind these tasks to their respective sockets.
+            self.num_tasks_per_node = 1
+        elif self.device == 'gpu':
+            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
+            device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
+            assert(len(device_count) == 1)
+            self.num_tasks_per_node = device_count[0]
+            self.num_tasks = self.num_tasks_per_node * self.num_nodes
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        self.variables = {
+            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+        }
+#        if self.current_partition.launcher == 'mpirun':
+#            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+
 @rfm.simple_test
 class TensorFlow2Native(TensorFlow2Base):
     def __init__(self):
@@ -64,6 +85,8 @@ def __init__(self):
         if self.device == 'cpu':
             self.executable_opts.append('--no-cuda')
 
+        self.num_nodes = 1
+
         self.tags.add('singlenode')
 
 class HorovodTensorFlow2Base(TensorFlow2Base):
@@ -81,24 +104,6 @@ def __init__(self):
             self.num_nodes = 10
         self.tags.add(self.scale)
 
-    @rfm.run_before('run')
-    def set_num_tasks(self):
-        if self.device == 'cpu':
-            # For now, keep it simple.
-            # In the future, we may want to launch 1 task per socket,
-            # and bind these tasks to their respective sockets.
-            self.num_tasks_per_node = 1
-        elif self.device == 'gpu':
-            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
-            device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
-            assert(len(device_count) == 1)
-            self.num_tasks_per_node = device_count[0]
-            self.num_tasks = self.num_tasks_per_node * self.num_nodes
-        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
-        self.variables = {
-            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
-        }
-
 @rfm.simple_test
 class HorovodTensorFlow2Native(HorovodTensorFlow2Base):
 
@@ -130,19 +135,19 @@ def __init__(self):
 # class TensorFlow2Container(TensorFlow2Base):
 #     def __init__(self, device):
 #         super().__init__(device)
-# 
+#
 #         self.tags.add('container')
 #         self.valid_prog_environs = ['*']
-# 
+#
 #         self.prerun_cmds = ['source shared_alien_cache_minimal.sh > /dev/null']
-# 
+#
 #         self.container_platform = 'Singularity'
 #         self.container_platform.image = 'docker://eessi/client-pilot:centos7-$(uname -m)'
 #         self.container_platform.options = [
 #             '--fusemount "container:cvmfs2 cvmfs-config.eessi-hpc.org /cvmfs/cvmfs-config.eessi-hpc.org"',
 #             '--fusemount "container:cvmfs2 pilot.eessi-hpc.org /cvmfs/pilot.eessi-hpc.org"'
 #         ]
-# 
+#
 #         self.container_platform.commands = [
 #             'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash',
 #             'module load TensorFlow',

From ac1a52d69e984e0a9eae2492ef7e34958b0c03a4 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Tue, 30 Mar 2021 17:16:04 +0200
Subject: [PATCH 07/12] Set total number of tasks for CPUs also, just take it
 outside of the if statement

---
 tests/reframe/eessi-checks/applications/tensorflow2.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index b27a0af97c..0010a0e555 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -43,6 +43,7 @@ def __init__(self):
     # Set number of tasks and threads (OMP_NUM_THREADS) based on current partition properties
     @rfm.run_before('run')
     def set_num_tasks(self):
+        # On CPU nodes, start 1 task per node. On GPU nodes, start 1 task per GPU.
         if self.device == 'cpu':
             # For now, keep it simple.
             # In the future, we may want to launch 1 task per socket,
@@ -53,13 +54,13 @@ def set_num_tasks(self):
             device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
             assert(len(device_count) == 1)
             self.num_tasks_per_node = device_count[0]
-            self.num_tasks = self.num_tasks_per_node * self.num_nodes
+        self.num_tasks = self.num_tasks_per_node * self.num_nodes
         self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
         self.variables = {
             'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
         }
-#        if self.current_partition.launcher == 'mpirun':
-#            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+        if self.current_partition.launcher == 'mpirun':
+            self.job.launcher.options = ['-x OMP_NUM_THREADS']
 
 @rfm.simple_test
 class TensorFlow2Native(TensorFlow2Base):

From a8468518364ed416487af2712f33a75dc78c8829 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Wed, 31 Mar 2021 10:37:57 +0200
Subject: [PATCH 08/12] Moved setting task count back to child class - for
 native TF on GPU it should only launch a single task

---
 .../eessi-checks/applications/tensorflow2.py  | 55 ++++++++++++-------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index 0010a0e555..eb920595f4 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -40,27 +40,6 @@ def __init__(self):
 
         self.maintainers = ['casparvl']
 
-    # Set number of tasks and threads (OMP_NUM_THREADS) based on current partition properties
-    @rfm.run_before('run')
-    def set_num_tasks(self):
-        # On CPU nodes, start 1 task per node. On GPU nodes, start 1 task per GPU.
-        if self.device == 'cpu':
-            # For now, keep it simple.
-            # In the future, we may want to launch 1 task per socket,
-            # and bind these tasks to their respective sockets.
-            self.num_tasks_per_node = 1
-        elif self.device == 'gpu':
-            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
-            device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
-            assert(len(device_count) == 1)
-            self.num_tasks_per_node = device_count[0]
-        self.num_tasks = self.num_tasks_per_node * self.num_nodes
-        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
-        self.variables = {
-            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
-        }
-        if self.current_partition.launcher == 'mpirun':
-            self.job.launcher.options = ['-x OMP_NUM_THREADS']
 
 @rfm.simple_test
 class TensorFlow2Native(TensorFlow2Base):
@@ -87,9 +66,20 @@ def __init__(self):
             self.executable_opts.append('--no-cuda')
 
         self.num_nodes = 1
+        self.num_tasks_per_node = 1
 
         self.tags.add('singlenode')
 
+    # Set OMP_NUM_THREADS based on current partition properties
+    @rfm.run_before('run')
+    def set_num_threads(self):
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        self.variables = {
+            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+        }
+        if self.current_partition.launcher_type == 'mpirun':
+            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+
 class HorovodTensorFlow2Base(TensorFlow2Base):
 
     scale = parameter(['singlenode', 'small', 'large'])
@@ -105,6 +95,29 @@ def __init__(self):
             self.num_nodes = 10
         self.tags.add(self.scale)
 
+    # Set number of tasks and threads (OMP_NUM_THREADS) based on current partition properties
+    @rfm.run_before('run')
+    def set_num_tasks(self):
+        # On CPU nodes, start 1 task per node. On GPU nodes, start 1 task per GPU.
+        if self.device == 'cpu':
+            # For now, keep it simple.
+            # In the future, we may want to launch 1 task per socket,
+            # and bind these tasks to their respective sockets.
+            self.num_tasks_per_node = 1
+        elif self.device == 'gpu':
+            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
+            device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
+            assert(len(device_count) == 1)
+            self.num_tasks_per_node = device_count[0]
+        self.num_tasks = self.num_tasks_per_node * self.num_nodes
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        self.variables = {
+            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+        }
+        if self.current_partition.launcher_type == 'mpirun':
+            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+
+
 @rfm.simple_test
 class HorovodTensorFlow2Native(HorovodTensorFlow2Base):
 

From bbaa9a66ee0fc2c6f0a1edef6cf040c5b220c623 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Fri, 9 Apr 2021 18:53:28 +0200
Subject: [PATCH 09/12] Added processor and devices statements to the system
 description

---
 tests/reframe/config/settings.py | 41 ++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/tests/reframe/config/settings.py b/tests/reframe/config/settings.py
index 7c1beed2c7..d3d1c90ad5 100644
--- a/tests/reframe/config/settings.py
+++ b/tests/reframe/config/settings.py
@@ -1,7 +1,7 @@
 site_configuration = {
     'systems': [
         {
-            'name': 'example_system',
+            'name': 'Example_system',
             'descr': 'This is just an example system',
             'modules_system': 'tmod',
 	    'hostnames': ['login', 'int'],
@@ -9,8 +9,8 @@
 		{
                     'name': 'short',
                     'scheduler': 'slurm',
-	            'launcher': 'mpirun',
-                    'access':  ['-p gpu_short'],
+	            'launcher': 'srun',
+                    'access':  ['-p short'],
                     'environs': ['foss', 'container'],
                     'container_platforms': [
                         {
@@ -20,7 +20,7 @@
                         }
                     ],
                     'processor': {
-                        'num_cpus': 16,
+                        'num_cpus': 24,
                     },
                     'devices': [
                         {
@@ -30,13 +30,44 @@
                     ],
                     'descr': 'normal partition'
                 },
+                {
+                    'name': 'gpu_short',
+                    'scheduler': 'slurm',
+                    'launcher': 'srun',
+                    'access':  ['-p gpu_short'],
+                    'environs': ['fosscuda', 'container'],
+                    'container_platforms': [
+                        {
+                            'type': 'Singularity',
+                            'modules': [],
+                            'variables': [['SLURM_MPI_TYPE', 'pmix']]
+                        }
+                    ],
+                    'processor': {
+                        'num_cpus': 16,
+                    },
+                    'devices': [
+                        {
+                            'type': 'gpu',
+                            'num_devices': 2,
+                        },
+                    ],
+                    'descr': 'gpu partition'
+                },
              ]
          },
      ],
     'environments': [
         {
             'name': 'foss',
-	    'modules': ['fosscuda/2020a'],
+	    'modules': ['foss/2020a'],
+            'cc': 'mpicc',
+            'cxx': 'mpicxx',
+            'ftn': 'mpifort',
+        },
+        {
+            'name': 'fosscuda',
+            'modules': ['fosscuda/2020a'],
             'cc': 'mpicc',
             'cxx': 'mpicxx',
             'ftn': 'mpifort',

From 83dcb1285ebec5b1f85c7b0c37e3dc7bb3e1e738 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Mon, 12 Apr 2021 18:27:04 +0200
Subject: [PATCH 10/12] Added extra_resources. That way, sites can put a
 'resources' section in their ReFrame config if they need to specify
 additional flags in order to request a certain number of GPUs per node. This
 may depend on the allocation manager and its default configuration

---
 tests/reframe/eessi-checks/applications/tensorflow2.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index eb920595f4..46aa8d0934 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -107,8 +107,16 @@ def set_num_tasks(self):
         elif self.device == 'gpu':
             # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
             device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
+            # This test doesn't know what to do if multiple GPU devices are present in a single partition, so assert:
             assert(len(device_count) == 1)
             self.num_tasks_per_node = device_count[0]
+            # On some resource schedules, you may need to request GPUs explicitely (e.g. --gpus-per-node=4).
+            # The extra_resources allows that to be put in the ReFrame settings file.
+            # See: https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html?highlight=num_gpus_per_node#reframe.core.pipeline.RegressionTest.extra_resources
+            # If the partition in the reframe settings file doesn't contain a resource with the name 'gpu', the self.extra_resources will be ignored.
+            self.extra_resources = {
+                'gpu': {'num_gpus_per_node': device_count[0]}
+            }
         self.num_tasks = self.num_tasks_per_node * self.num_nodes
         self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
         self.variables = {

From cf9b9e10f14266f248a793284b2d118eab768793 Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Mon, 3 May 2021 16:35:32 +0200
Subject: [PATCH 11/12] Updated test to use one fewer thread than the max. This
 gives more reliable performance numbers due to less context switching with
 the Horovod thread. Additionally, for the Horovod test, only require the
 Horovod module - TensorFlow will be loaded as a dependency automatically.

---
 .../reframe/eessi-checks/applications/tensorflow2.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index 46aa8d0934..d45338e7af 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -105,9 +105,8 @@ def set_num_tasks(self):
             # and bind these tasks to their respective sockets.
             self.num_tasks_per_node = 1
         elif self.device == 'gpu':
-            # This should really be reading out something like 'self.current_partition.devices.num_devices_per_node', but that doesn't exist...
             device_count = [ dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu' ]
-            # This test doesn't know what to do if multiple GPU devices are present in a single partition, so assert:
+            # This test doesn't know what to do if multiple DIFFERENT GPU devices are present in a single partition, so assert that we only found one in the ReFrame config:
             assert(len(device_count) == 1)
             self.num_tasks_per_node = device_count[0]
             # On some resource schedules, you may need to request GPUs explicitely (e.g. --gpus-per-node=4).
@@ -119,8 +118,13 @@ def set_num_tasks(self):
             }
         self.num_tasks = self.num_tasks_per_node * self.num_nodes
         self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        # If test runs on CPU, leave one thread idle for Horovod. See https://github.com/horovod/horovod/issues/2804
+        if self.device == 'cpu': 
+            num_threads = max(self.num_cpus_per_task-1, 1)
+        elif self.device == 'gpu':
+            num_threads = self.num_cpus_per_task
         self.variables = {
-            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+            'OMP_NUM_THREADS': f'{num_threads}',
         }
         if self.current_partition.launcher_type == 'mpirun':
             self.job.launcher.options = ['-x OMP_NUM_THREADS']
@@ -137,7 +141,7 @@ def __init__(self):
         self.tags.add('native')
         self.valid_prog_environs = ['*']
 
-        self.modules = ['Horovod', 'TensorFlow']
+        self.modules = ['Horovod']
         self.executable = 'python'
 
         self.executable_opts = [

From 346a5f5f03dd4db7bc262557c4b924e7a8c82d0c Mon Sep 17 00:00:00 2001
From: Caspar van Leeuwen <caspar.vanleeuwen@surfsara.nl>
Date: Tue, 4 May 2021 17:41:37 +0200
Subject: [PATCH 12/12] Let's not do containerized tests... Native is
 complicated enough

---
 .../eessi-checks/applications/tensorflow2.py  | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
index d45338e7af..01e2bc511b 100644
--- a/tests/reframe/eessi-checks/applications/tensorflow2.py
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -156,27 +156,3 @@ def __init__(self):
         if self.device == 'cpu':
             self.executable_opts.append('--no-cuda')
 
-
-# @rfm.parametrized_test(['cpu'], ['gpu'])
-# class TensorFlow2Container(TensorFlow2Base):
-#     def __init__(self, device):
-#         super().__init__(device)
-#
-#         self.tags.add('container')
-#         self.valid_prog_environs = ['*']
-#
-#         self.prerun_cmds = ['source shared_alien_cache_minimal.sh > /dev/null']
-#
-#         self.container_platform = 'Singularity'
-#         self.container_platform.image = 'docker://eessi/client-pilot:centos7-$(uname -m)'
-#         self.container_platform.options = [
-#             '--fusemount "container:cvmfs2 cvmfs-config.eessi-hpc.org /cvmfs/cvmfs-config.eessi-hpc.org"',
-#             '--fusemount "container:cvmfs2 pilot.eessi-hpc.org /cvmfs/pilot.eessi-hpc.org"'
-#         ]
-#
-#         self.container_platform.commands = [
-#             'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash',
-#             'module load TensorFlow',
-#             'python {self.script} --model {self.model} --batch-size {self.batch_size} --num-iters 5 --num-batches-per-iter 5 --num-warmup-batches 5'
-#         ]
-#         self.tags.add('singlenode')