diff --git a/tests/reframe/config/settings.py b/tests/reframe/config/settings.py new file mode 100644 index 0000000000..9afc5021f3 --- /dev/null +++ b/tests/reframe/config/settings.py @@ -0,0 +1,90 @@ +site_configuration = { + 'systems': [ + { + 'name': 'Example_system', + 'descr': 'This is just an example system', + 'modules_system': 'tmod', + 'hostnames': ['login', 'int'], + 'partitions': [ + { + 'name': 'cpu', + 'scheduler': 'slurm', + 'launcher': 'srun', + 'access': ['-p cpu'], + 'environs': ['builtin'], + 'processor': { + 'num_cpus': 24, + }, + 'descr': 'normal CPU partition' + }, + { + 'name': 'gpu', + 'descr': 'GPU partition', + 'scheduler': 'slurm', + 'access': ['-p gpu'], + 'environs': ['builtin'], + 'max_jobs': 100, + 'launcher': 'srun', + 'processor': { + 'num_cpus': 24, + }, + 'devices': [ + { + 'type': 'gpu', + 'num_devices': 2, + }, + ], + }, + ] + }, + ], + 'environments': [ + { + 'name': 'builtin', + 'cc': 'cc', + 'cxx': '', + 'ftn': '', + }, + { + 'name': 'container', + 'modules': [], + }, + ], + 'logging': [ + { + 'level': 'debug', + 'handlers': [ + { + 'type': 'stream', + 'name': 'stdout', + 'level': 'info', + 'format': '%(message)s' + }, + { + 'type': 'file', + 'name': 'reframe.log', + 'level': 'debug', + 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 + 'append': False + } + ], + 'handlers_perflog': [ + { + 'type': 'filelog', + 'prefix': '%(check_system)s/%(check_partition)s', + 'level': 'info', + 'format': ( + '%(check_job_completion_time)s|reframe %(version)s|' + '%(check_info)s|jobid=%(check_jobid)s|' + '%(check_perf_var)s=%(check_perf_value)s|' + 'ref=%(check_perf_ref)s ' + '(l=%(check_perf_lower_thres)s, ' + 'u=%(check_perf_upper_thres)s)|' + '%(check_perf_unit)s' + ), + 'append': True + } + ] + } + ], +} diff --git a/tests/reframe/config/settings_magic_castle.py b/tests/reframe/config/settings_magic_castle.py new file mode 100644 index 0000000000..1de4a9d00e --- /dev/null +++ 
b/tests/reframe/config/settings_magic_castle.py @@ -0,0 +1,94 @@ +site_configuration = { + 'systems': [ + { + 'name': 'example_system', + 'descr': 'This is just an example system', + 'modules_system': 'lmod', + 'hostnames': ['login', 'node'], + 'partitions': [ + { + 'name': 'cpu', + 'scheduler': 'slurm', + 'launcher': 'mpirun', + 'access': ['-p cpubase_bycore_b1 --exclusive --mem=94515M'], + 'environs': ['builtin'], + 'max_jobs': 4, + 'processor': { + 'num_cpus': 36, + 'num_sockets': 1, + 'num_cpus_per_socket': 36, + }, + 'descr': 'normal CPU partition' + }, +# { +# 'name': 'gpu', +# 'descr': 'GPU partition', +# 'scheduler': 'slurm', +# 'access': ['-p gpu --gpus-per-node 4 --exclusive'], +# 'environs': ['builtin'], +# 'max_jobs': 10, +# 'launcher': 'srun', +# 'processor': { +# 'num_cpus': 72, +# }, +# 'devices': [ +# { +# 'type': 'gpu', +# 'num_devices': 4, +# }, +# ], +# }, + ] + }, + ], + 'environments': [ + { + 'name': 'builtin', + 'cc': 'cc', + 'cxx': '', + 'ftn': '', + }, + ], + 'logging': [ + { + 'level': 'debug', + 'handlers': [ + { + 'type': 'stream', + 'name': 'stdout', + 'level': 'info', + 'format': '%(message)s' + }, + { + 'type': 'file', + 'name': 'reframe.log', + 'level': 'debug', + 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 + 'append': False + } + ], + 'handlers_perflog': [ + { + 'type': 'filelog', + 'prefix': '%(check_system)s/%(check_partition)s', + 'level': 'info', + 'format': ( + '%(check_job_completion_time)s|reframe %(version)s|' + '%(check_info)s|jobid=%(check_jobid)s|' + '%(check_perf_var)s=%(check_perf_value)s|' + 'ref=%(check_perf_ref)s ' + '(l=%(check_perf_lower_thres)s, ' + 'u=%(check_perf_upper_thres)s)|' + '%(check_perf_unit)s' + ), + 'append': True + } + ] + } + ], + 'general': [ + { + 'remote_detect': True, + } + ], +} diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py new file mode 100644 index 0000000000..257596efe5 --- 
/dev/null +++ b/tests/reframe/eessi-checks/applications/tensorflow2.py @@ -0,0 +1,104 @@ +import os +import reframe as rfm +from reframe.utility import find_modules + +from testlib.applications.tensorflow2 import TensorFlow2 +import eessi_utils.hooks as hooks +import eessi_utils.utils as utils + +@rfm.required_version('>=3.6.2') +@rfm.simple_test +class TensorFlow2_EESSI(TensorFlow2): + '''EESSI TensorFlow 2 check, for pure TensorFlow. This test does not support running on multiple nodes''' + + modules = required # Make sure that our apply_module_info hook sets a value + module_info = parameter(find_modules('TensorFlow', environ_mapping={r'.*': 'builtin'})) + # This test is singlenode and should be run in CI + tags = {'singlenode', 'CI'} + + @run_after('init') + def apply_module_info(self): + hooks.apply_module_info(test = self, module_info = self.module_info) + + # Skip testing GPU-based modules on CPU-based nodes + @run_after('setup') + def skip_gpu_test_on_cpu_nodes(self): + hooks.skip_gpu_test_on_cpu_nodes(self) + + # Skip testing with device == gpu on CPU based nodes + @run_after('setup') + def skip_device_gpu_on_cpu_nodes(self): + self.skip_if( + (self.device == 'gpu' and not utils.is_gpu_present(self)), + "Skipping test variant where tf.device is GPU, since this partition contains non-GPU nodes" + ) + + # This test uses only OpenMP for parallelism, so simply run on all cores + @run_after('setup') + def set_num_tasks(self): + self.num_tasks = 1 + self.num_tasks_per_node = 1 + self.num_cpus_per_task = self.current_partition.processor.num_cpus + self.omp_num_threads = self.num_cpus_per_task + + @run_before('run') + def bind_to_none(self): + hooks.bind_to_none(self) + +@rfm.required_version('>=3.6.2') +@rfm.simple_test +class TensorFlow2_Horovod_EESSI(TensorFlow2): + '''EESSI TensorFlow 2 check, with multiprocessing support through Horovod. + This test will run TensorFlow2 using all modules with 'TensorFlow' in the module environment it can find. 
+ On GPU nodes, it will only run tests if the module names also contain 'cuda'. + On CPU nodes, it will only run tests if a module name does NOT contain 'cuda'. + Whether a node is CPU/GPU is determined based on if a device named 'gpu' is specified in the ReFrame settings file for the current partition. + Number of tasks, tasks per node and cpus per task are set based on the number of GPUs and number of CPUs specified in the ReFrame config file for the current partition. + When using multiple CPU nodes, the number of OMP_NUM_THREADS is set to the core count minus 1, to leave one dedicated thread for Horovod. + ''' + + modules = required # Make sure that our apply_module_info hook sets a value + scale = parameter([ + ('singlenode', 1), + ('n_small', 2), + ('n_medium', 8), + ('n_large', 16) + ]) + module_info = parameter(find_modules('Horovod', environ_mapping={r'.*': 'builtin'})) + + @run_after('init') + def apply_module_info(self): + hooks.apply_module_info(test = self, module_info = self.module_info) + + @run_after('init') + def set_test_scale(self): + scale_variant, self.num_nodes = self.scale + self.tags.add(scale_variant) + + # Skip testing GPU-based modules on CPU-based nodes + @run_after('setup') + def skip_gpu_test_on_cpu_nodes(self): + hooks.skip_gpu_test_on_cpu_nodes(self) + + # Skip testing with device == gpu on CPU based nodes + @run_after('setup') + def skip_device_gpu_on_cpu_nodes(self): + self.skip_if( + (self.device == 'gpu' and not utils.is_gpu_present(self)), + "Skipping test variant where tf.device is GPU, since this partition contains non-GPU nodes" + ) + + # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically based on current partition's num_cpus and gpus + @run_after('setup') + def set_num_tasks(self): + hooks.auto_assign_num_tasks_hybrid(test = self, num_nodes = self.num_nodes) + + @run_after('setup') + def set_omp_num_threads(self): + # For CPU runs on more than 4 cores, leave one thread idle for Horovod + if self.device 
== 'cpu' and self.num_cpus_per_task > 4: + self.omp_num_threads = self.num_cpus_per_task - 1 + else: + self.omp_num_threads = self.num_cpus_per_task + print("Set omp_num_threads to: %s" % self.omp_num_threads) + diff --git a/tests/reframe/eessi_utils/hooks.py b/tests/reframe/eessi_utils/hooks.py new file mode 100644 index 0000000000..84b10ac997 --- /dev/null +++ b/tests/reframe/eessi_utils/hooks.py @@ -0,0 +1,61 @@ +import reframe as rfm +import eessi_utils.utils as utils +from typing import Tuple + +def apply_module_info(test: rfm.RegressionTest, module_info: Tuple[str, str, str]): + '''Apply module info that was obtained with a find_modules. + To work with this hook, module_info should store the return of parameter(find_modules(...))''' + sys, env, mod = module_info + test.valid_systems = [sys] + test.modules = [mod] + test.valid_prog_environs = [env] + +def skip_cpu_test_on_gpu_nodes(test: rfm.RegressionTest): + '''Skip test if GPUs are present, but no CUDA is required''' + skip = ( utils.is_gpu_present(test) and not utils.is_cuda_required(test) ) + if skip: + print("GPU is present on this partition, skipping CPU-based test") + test.skip_if(True) + +def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): + '''Skip test if CUDA is required, but no GPU is present''' + skip = ( utils.is_cuda_required(test) and not utils.is_gpu_present(test) ) + if skip: + print("Test requires CUDA, but no GPU is present in this partition. Skipping test...") + test.skip_if(True) + +def auto_assign_num_tasks_MPI(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: + '''Automatically sets num_tasks, tasks_per_node and cpus_per_task based on the current partition's num_cpus, number of GPUs and test.num_nodes. For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU cores/GPUs. For CPU tests, one task per CPU is set, and num_cpus_per_task is set to 1. Total task count is determined based on the number of nodes to be used in the test. 
Behaviour of this function is (usually) sensible for pure MPI tests.''' + if utils.is_cuda_required(test): + test.num_tasks_per_node = utils.get_num_gpus(test) + test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) + else: + test.num_tasks_per_node = test.current_partition.processor.num_cpus + test.num_cpus_per_task = 1 + test.num_tasks = num_nodes * test.num_tasks_per_node + +def auto_assign_num_tasks_hybrid(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: + '''Automatically sets num_tasks, tasks_per_node and cpus_per_task based on the current partition's num_cpus, num_sockets, number of GPUs and test.num_nodes. For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU cores/GPUs. For CPU tests, one task per CPU socket is set, and num_cpus_per_task is set to #CPU cores / #sockets. Total task count is determined based on the number of nodes to be used in the test. Behaviour of this function is (usually) sensible for hybrid OpenMP-MPI tests. For sockets with very large core counts (i.e. where OpenMP cannot exploit sufficient parallelism), this approach may be inefficient and more than 1 task per socket may be desirable.''' + if utils.is_cuda_required(test): + test.num_tasks_per_node = utils.get_num_gpus(test) + test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) + else: + # For AMD ZEN2 nodes, hybrid programs might run faster when launching a task per numa domain instead of per socket. + # Not sure if numa domain detection is supported (yet) in ReFrame... 
If so, we can improve this segment (and change the --bind-to socket to --bind-to numa) + test.num_tasks_per_node = test.current_partition.processor.num_sockets + test.num_cpus_per_task = test.current_partition.processor.num_cpus_per_socket + test.num_tasks = num_nodes * test.num_tasks_per_node + + # Bind to none if running hybrid with only a single task per node + if test.current_partition.launcher_type.registered_name == 'mpirun': + if test.num_tasks_per_node == 1: + test.job.launcher.options.append(' --bind-to none') + else: + test.job.launcher.options.append(' --bind-to socket') + +def bind_to_none(test:rfm.RegressionTest) -> rfm.RegressionTest: + '''Set --bind-to none in case the launcher is mpirun''' + if test.current_partition.launcher_type.registered_name == 'mpirun': + if test.num_tasks_per_node > 1: + print("Warning: test is running with more than one task per node, but you are binding to 'none'. This is probably a mistake in the test implementation.") + test.job.launcher.options.append(' --bind-to none') diff --git a/tests/reframe/eessi_utils/utils.py b/tests/reframe/eessi_utils/utils.py new file mode 100644 index 0000000000..8b5cd3194b --- /dev/null +++ b/tests/reframe/eessi_utils/utils.py @@ -0,0 +1,35 @@ +import re + +import reframe as rfm + + +gpu_dev_name = 'gpu' + +def _get_gpu_list(test: rfm.RegressionTest): + return [ dev.num_devices for dev in test.current_partition.devices if dev.device_type == gpu_dev_name ] + +def get_num_gpus(test: rfm.RegressionTest) -> int: + '''Returns the number of GPUs for the current partition''' + gpu_list = _get_gpu_list(test) + # If multiple devices are called 'GPU' in the current partition, + # we don't know for which to return the device count... + if(len(gpu_list) != 1): + raise ValueError(f"Multiple different devices exist with the name " + f"'{gpu_dev_name}' for partition '{test.current_partition.name}'. " + f"Cannot determine number of GPUs available for the test. 
" + f"Please check the definition of partition '{test.current_partition.name}' " + f"in your ReFrame config file.") + + return gpu_list[0] + +def is_gpu_present(test: rfm.RegressionTest) -> bool: + '''Checks if GPUs are present in the current partition''' + return ( len(_get_gpu_list(test)) >= 1 ) + +def is_cuda_required(test: rfm.RegressionTest) -> bool: + '''Checks if CUDA seems to be required by current module''' + requires_cuda = False + for module in test.modules: + if re.search("(?i)cuda", module): + requires_cuda = True + return requires_cuda diff --git a/tests/reframe/testlib/applications/tensorflow2/__init__.py b/tests/reframe/testlib/applications/tensorflow2/__init__.py new file mode 100644 index 0000000000..6a140f71d8 --- /dev/null +++ b/tests/reframe/testlib/applications/tensorflow2/__init__.py @@ -0,0 +1,82 @@ +import os +import reframe as rfm +import reframe.utility.sanity as sn + +class TensorFlow2(rfm.RunOnlyRegressionTest, pin_prefix=True): + + num_tasks = required + num_tasks_per_node = required + num_cpus_per_task = required + + # Can be 'gpu' or 'cpu' + device = parameter(['gpu', 'cpu']) + batch_size = variable(int, value = 32) # Smaller batch sizes may be used if running out of memory + + # For multinode runs, Horovod is used. 
Horovod may perform better on CPU if one thread is left idle + # See https://github.com/horovod/horovod/issues/2804 + omp_num_threads = variable(int) + + descr = 'TensorFlow 2 synthetic benchmark' + executable = 'python' + script = 'tensorflow2_synthetic_benchmark.py' + model = 'ResNet50' + + maintainers = ['casparvl'] + + @run_before('performance') + def set_reference(self): + self.reference = { + '*': { + 'throughput': (None, None, None, 'img/sec'), + f'throughput_per_{self.device}': (None, None, None, 'img/sec') + } + } + + @run_before('run') + def set_executable_opts(self): + '''Set the executable opts, with correct batch_size''' + self.executable_opts = [ + f'{self.script}', + f'--model {self.model}', + f'--batch-size {self.batch_size}', + '--num-iters 2', + '--num-batches-per-iter 2', + '--num-warmup-batches 1', + '--inter-op-threads 1', + '--intra-op-threads %s' % self.omp_num_threads + ] + if self.device == 'cpu': + self.executable_opts.append('--no-cuda') + # Use horovod for parallelism + if self.num_tasks > 1: + self.executable_opts.append('--use-horovod') + + @sn.sanity_function + def get_throughput(self): + throughput_sn = sn.extractsingle( + rf'Total img\/sec on [0-9]+ {self.device.upper()}\(s\): ' + rf'(?P\S+) \S+', + self.stdout, 'throughput', float) + return throughput_sn + + @sn.sanity_function + def get_throughput_per_dev(self): + throughput_dev_sn = sn.extractsingle( + rf'Img\/sec per {self.device.upper()}: ' + rf'(?P\S+) \S+', + self.stdout, 'throughput', float) + return throughput_dev_sn + + @run_before('sanity') + def set_sanity_patterns(self): + self.sanity_patterns = sn.all([ + sn.assert_found('Benchmark completed', self.stdout), + ]) + + @run_before('performance') + def set_perf_patterns(self): + self.perf_patterns = { + 'throughput': self.get_throughput(), + f'throughput_per_{self.device}': self.get_throughput_per_dev() + } + diff --git a/tests/reframe/testlib/applications/tensorflow2/src/tensorflow2_synthetic_benchmark.py 
b/tests/reframe/testlib/applications/tensorflow2/src/tensorflow2_synthetic_benchmark.py new file mode 100644 index 0000000000..7b0543012f --- /dev/null +++ b/tests/reframe/testlib/applications/tensorflow2/src/tensorflow2_synthetic_benchmark.py @@ -0,0 +1,163 @@ +# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import, division, print_function + +import argparse +import os +import numpy as np +import timeit + +import tensorflow as tf +from tensorflow.keras import applications +from tensorflow.keras import mixed_precision + +def log(s, nl=True): + if args.use_horovod: + if hvd.rank() != 0: + return + print(s, end='\n' if nl else '') + +# Benchmark settings +parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') +parser.add_argument('--mixed-prec', action='store_true', default=False, + help='Use mixed precision for training') + +parser.add_argument('--model', type=str, default='ResNet50', + help='model to benchmark') +parser.add_argument('--batch-size', type=int, default=32, + help='input batch size') + +parser.add_argument('--num-warmup-batches', type=int, default=2, + help='number of warm-up 
batches that don\'t count towards benchmark') +parser.add_argument('--num-batches-per-iter', type=int, default=10, + help='number of batches per benchmark iteration') +parser.add_argument('--num-iters', type=int, default=10, + help='number of benchmark iterations') + +parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') +parser.add_argument('--use-horovod', action='store_true', default=False) + +parser.add_argument('--inter-op-threads', type=int, default=None) +parser.add_argument('--intra-op-threads', type=int, default=None) + +args = parser.parse_args() +args.cuda = not args.no_cuda + +# Horovod: initialize Horovod. +if args.use_horovod: + import horovod.tensorflow as hvd + hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +if args.cuda: + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus and args.use_horovod: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') +else: + os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + +# Set number of inter and intra-op threads for TensorFlow to use +if args.inter_op_threads is not None: + tf.config.threading.set_inter_op_parallelism_threads(args.inter_op_threads) +if args.intra_op_threads is not None: + tf.config.threading.set_intra_op_parallelism_threads(args.intra_op_threads) + +log("Running with inter-op-threads: %s" % tf.config.threading.get_inter_op_parallelism_threads()) +log("Running with intra-op-threads: %s" % tf.config.threading.get_intra_op_parallelism_threads()) + +if args.mixed_prec: + log('Running with mixed_float16 as global policy for the precision') + mixed_precision.set_global_policy('mixed_float16') + +# Set up standard model. 
+model = getattr(applications, args.model)(weights=None) +opt = tf.optimizers.SGD(0.01) + +data = tf.random.uniform([args.batch_size, 224, 224, 3]) +target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) + + +@tf.function +def benchmark_step(first_batch): + # Horovod: (optional) compression algorithm. + if args.use_horovod: + compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none + + # Horovod: use DistributedGradientTape + with tf.GradientTape() as tape: + probs = model(data, training=True) + loss = tf.losses.sparse_categorical_crossentropy(target, probs) + + # Horovod: add Horovod Distributed GradientTape. + if args.use_horovod: + tape = hvd.DistributedGradientTape(tape, compression=compression) + + gradients = tape.gradient(loss, model.trainable_variables) + opt.apply_gradients(zip(gradients, model.trainable_variables)) + + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + # + # Note: broadcast should be done after the first gradient step to ensure optimizer + # initialization. 
+ if args.use_horovod and first_batch: + hvd.broadcast_variables(model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + +log('Model: %s' % args.model) +log('Batch size: %d' % args.batch_size) +device = 'GPU' if args.cuda else 'CPU' +if args.use_horovod: + log('Number of %ss: %d' % (device, hvd.size())) +else: + log('Number of %ss: %s' % (device, 1)) + + +with tf.device(device): + # Warm-up + log('Running warmup...') + benchmark_step(first_batch=True) + + timeit.timeit(lambda: benchmark_step(first_batch=False), + number=args.num_warmup_batches) + + # Benchmark + log('Running benchmark...') + img_secs = [] + for x in range(args.num_iters): + time = timeit.timeit(lambda: benchmark_step(first_batch=False), + number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) + img_secs.append(img_sec) + + # Results + img_sec_mean = np.mean(img_secs) + img_sec_conf = 1.96 * np.std(img_secs) + log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) + if args.use_horovod: + ndevices = hvd.size() + else: + ndevices = 1 + log('Total img/sec on %d %s(s): %.1f +-%.1f' % + (ndevices, device, ndevices * img_sec_mean, ndevices * img_sec_conf)) + log('Benchmark completed')