Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
db3758e
Initial try of mpi hello world test
casparvl Nov 9, 2020
fc7e8a0
Merge branch 'master' of github.com:EESSI/software-layer
casparvl Nov 9, 2020
d7aa7e2
Made the config do something on both our systems...
Nov 26, 2020
0d71815
Changed config to submit to short queue for faster testing. Added san…
Nov 26, 2020
3d79f66
Make the test system-independent
Nov 26, 2020
8636c4c
Make it system independent
Nov 26, 2020
c6a62a3
Use a flexible num_tasks_per_node by defining it in config/system_pro…
Dec 3, 2020
eb94473
remove pychaches
Dec 3, 2020
0b446e7
Removed logfile
Dec 3, 2020
bee7398
Some cleanup
Dec 3, 2020
545d9ed
Use flexible task count
Dec 3, 2020
11011a7
Trying to develop a GROMACS test that runs with the EESSI container. …
Dec 17, 2020
bd2c8ec
Added a test that will just ls into the container, to see if we get t…
Jan 25, 2021
30cc2bd
Updated gromacs test. It seems to run in parallel, and returns the co…
Jan 25, 2021
1c335c6
made it work with our ancient tmod...
Jan 26, 2021
c81f95f
Could only get the MPI in the container to work with pmix, since PMI2…
Jan 26, 2021
49857b6
Added sanity check for gromacs
Jan 26, 2021
ebabe02
Wrote down some todo, so that I don't forget...
Jan 26, 2021
94cedcb
Gromacs test works now, but requires setup of an alien cache to run m…
Feb 2, 2021
5ffcd5c
Put script for shared alien cache under version control...
Feb 2, 2021
fd613f7
made test parameterized. Todo: create separate container and native t…
Feb 2, 2021
45f5977
Added tags
Feb 2, 2021
6d1f892
Made specific class for container test. Todo: do the same for CVMFS n…
Feb 3, 2021
2f574ee
deleted non-needed files for GROMACS
Feb 4, 2021
fabf21e
Somehow I failed to get the right version of the GROMACS test in this…
Feb 4, 2021
02309e8
For now, tag this as a CPU only test. For GPU, probably only the modu…
Feb 4, 2021
cf7610b
Tag must be a set
Feb 4, 2021
90f5826
Changed tag to singlenode, so we can distinguish between single node …
casparvl Feb 5, 2021
fb5016b
Updated gromacs for ReFrame 3.5.0 to run a task count based on the nu…
casparvl Mar 26, 2021
718cd89
[ECT] [cart,lisa] [ReFrame-3.5.0.eb] [production] Testing if I can bu…
Mar 26, 2021
455d639
Updated gromacs check to use parameter instead of parameterized_test
May 4, 2021
7ee97e1
Added Gromacs PRACE testcase A in library test format, including 'sys…
Jun 9, 2021
5c8c1e2
Set correct number of tasks for GPU runs, to one per GPU. Set correct…
Jun 9, 2021
bebeaf1
Moved setting OMP_NUM_THREADS to library test. Disable running CPU ba…
Jun 9, 2021
e3c72d8
Set valid systems based on what is returned by find_modules. Otherwis…
Jun 9, 2021
3f01a3d
Added more clear description of how the test decides to run CPU/GPU t…
Jun 9, 2021
520007d
Added more clear description of how num_tasks, num_tasks_per_node and…
Jun 9, 2021
a6d945a
Merge branch 'EESSI:main' into gromacs_libtest
casparvl Jun 10, 2021
b9af81e
Removed dummy file
Jun 10, 2021
76eeda9
Merge branch 'gromacs_libtest' of github.com:casparvl/software-layer …
Jun 10, 2021
11f91a7
Add an example settings.py that works with the gromacs.py test in thi…
Jun 10, 2021
9c4a6a7
Changed various things in response to @jjotero's review. 1. Created s…
Jun 14, 2021
261a106
Clarified comment on commented ReFrame version requirement
Jun 14, 2021
28878cf
Clarified error message
Jun 14, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 86 additions & 0 deletions tests/reframe/config/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Example ReFrame site configuration for the EESSI test suite.
# Defines one example system with a CPU and a GPU Slurm partition,
# a minimal 'builtin' programming environment, and logging handlers.
site_configuration = {
    'systems': [
        {
            'name': 'example_system',
            'descr': 'This is just an example system',
            'modules_system': 'tmod',
            'hostnames': ['login', 'int'],
            'partitions': [
                {
                    'name': 'cpu',
                    'scheduler': 'slurm',
                    'launcher': 'srun',
                    'access': ['-p cpu'],
                    'environs': ['builtin'],
                    # NOTE(review): recent ReFrame versions can auto-detect
                    # processor information, so this manual spec could be
                    # dropped once that is relied upon (GPU devices still
                    # need to be declared by hand).
                    'processor': {
                        'num_cpus': 24,
                    },
                    'descr': 'normal CPU partition'
                },
                {
                    'name': 'gpu',
                    'descr': 'GPU partition',
                    'scheduler': 'slurm',
                    'access': ['-p gpu'],
                    'environs': ['builtin'],
                    'max_jobs': 100,
                    'launcher': 'srun',
                    'processor': {
                        'num_cpus': 16,
                    },
                    # Device named 'gpu' marks this as a GPU partition for
                    # the eessi_utils helpers.
                    'devices': [
                        {
                            'type': 'gpu',
                            'num_devices': 2,
                        },
                    ],
                },
            ]
        },
    ],
    'environments': [
        {
            'name': 'builtin',
            'cc': 'cc',
            'cxx': '',
            'ftn': '',
        },
    ],
    'logging': [
        {
            'level': 'debug',
            'handlers': [
                {
                    'type': 'stream',
                    'name': 'stdout',
                    'level': 'info',
                    'format': '%(message)s'
                },
                {
                    'type': 'file',
                    'name': 'reframe.log',
                    'level': 'debug',
                    'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s',  # noqa: E501
                    'append': False
                }
            ],
            'handlers_perflog': [
                {
                    'type': 'filelog',
                    'prefix': '%(check_system)s/%(check_partition)s',
                    'level': 'info',
                    'format': (
                        '%(check_job_completion_time)s|reframe %(version)s|'
                        '%(check_info)s|jobid=%(check_jobid)s|'
                        '%(check_perf_var)s=%(check_perf_value)s|'
                        'ref=%(check_perf_ref)s '
                        '(l=%(check_perf_lower_thres)s, '
                        'u=%(check_perf_upper_thres)s)|'
                        '%(check_perf_unit)s'
                    ),
                    'append': True
                }
            ]
        }
    ],
}
53 changes: 53 additions & 0 deletions tests/reframe/eessi-checks/applications/gromacs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import re
import reframe as rfm
from reframe.utility import find_modules

from testlib.applications.gromacs import Gromacs
import eessi_utils.hooks as hooks

@rfm.required_version('>=3.6.2')
@rfm.simple_test
class Gromacs_EESSI(Gromacs):
    '''EESSI GROMACS check.

    Runs GROMACS with every module matching 'GROMACS' found in the module
    environment.  On GPU nodes it only runs modules whose name also contains
    'cuda'; on CPU nodes it only runs modules whose name does NOT contain
    'cuda'.  Whether a node is CPU or GPU is determined by whether a device
    named 'gpu' is specified in the ReFrame settings file for the current
    partition.  num_tasks, num_tasks_per_node and num_cpus_per_task are set
    from the number of GPUs and CPUs specified in the ReFrame config file
    for the current partition.
    '''

    modules = required  # Make sure that our apply_module_info hook sets a value
    # (tag, nsteps, number of nodes) per test scale
    scale = parameter([
        ('singlenode', 10000, 1),
        ('small', 40000, 4),
        ('large', 100000, 10)])
    module_info = parameter(find_modules('GROMACS', environ_mapping={r'.*': 'builtin'}))

    @run_after('init')
    def apply_module_info(self):
        '''Apply the (system, environ, module) tuple from find_modules.'''
        self.s, self.e, self.m = self.module_info
        self.valid_systems = [self.s]
        self.modules = [self.m]
        self.valid_prog_environs = [self.e]

    @run_after('init')
    def set_test_scale(self):
        '''Unpack the scale parameter and tag the test with its scale.'''
        scale_variant, self.nsteps, self.num_nodes = self.scale
        self.tags.add(scale_variant)

    # Skip testing GPU-based modules on CPU-based nodes
    @run_after('setup')
    def skip_gpu_test_on_cpu_nodes(self):
        hooks.skip_gpu_test_on_cpu_nodes(self)

    # Skip testing CPU-based modules on GPU-based nodes
    # (though these would run fine, one is usually not interested in them)
    @run_after('setup')
    def skip_cpu_test_on_gpu_nodes(self):
        hooks.skip_cpu_test_on_gpu_nodes(self)

    # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically
    # based on the current partition's num_cpus and GPU count
    @run_after('setup')
    def set_num_tasks(self):
        hooks.auto_assign_num_tasks_MPI(test=self, num_nodes=self.num_nodes)

26 changes: 26 additions & 0 deletions tests/reframe/eessi_utils/hooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import reframe as rfm
import eessi_utils.utils as utils

def skip_cpu_test_on_gpu_nodes(test: rfm.RegressionTest):
    '''Skip ``test`` if the current partition has GPUs but none of the
    test's modules require CUDA (CPU-only runs on GPU nodes are usually
    not of interest).'''
    skip = utils.is_gpu_present(test) and not utils.is_cuda_required(test)
    # Pass the reason through skip_if so it ends up in the ReFrame report
    # instead of being printed to stdout.
    test.skip_if(skip, 'GPU is present on this partition, skipping CPU-based test')

def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest):
    '''Skip ``test`` if its modules require CUDA but the current partition
    has no GPU.'''
    skip = utils.is_cuda_required(test) and not utils.is_gpu_present(test)
    # Pass the reason through skip_if so it ends up in the ReFrame report
    # instead of being printed to stdout.
    test.skip_if(skip, 'Test requires CUDA, but no GPU is present in this partition. Skipping test...')

def auto_assign_num_tasks_MPI(test: rfm.RegressionTest, num_nodes: int) -> None:
    '''Set ``num_tasks``, ``num_tasks_per_node`` and ``num_cpus_per_task``
    on ``test`` in place (the test object is mutated; nothing is returned).

    For GPU tests: one task per GPU, with num_cpus_per_task set to the
    partition's CPU/GPU ratio.  For CPU tests: one task per CPU core and a
    single CPU per task.  The total task count is num_nodes times the
    per-node task count.  This behaviour is (usually) sensible for pure
    MPI tests.
    '''
    if utils.is_cuda_required(test):
        test.num_tasks_per_node = utils.get_num_gpus(test)
        # Integer CPU-per-task ratio; any remainder cores are left idle
        test.num_cpus_per_task = int(
            test.current_partition.processor.num_cpus / test.num_tasks_per_node)
    else:
        test.num_tasks_per_node = test.current_partition.processor.num_cpus
        test.num_cpus_per_task = 1
    test.num_tasks = num_nodes * test.num_tasks_per_node
35 changes: 35 additions & 0 deletions tests/reframe/eessi_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import re

import reframe as rfm


# Device name used in the ReFrame config to mark GPU devices
gpu_dev_name = 'gpu'

def _get_gpu_list(test: rfm.RegressionTest):
    '''Return the device counts of every device in the current partition
    whose type matches ``gpu_dev_name``.'''
    partition_devices = test.current_partition.devices
    return [
        device.num_devices
        for device in partition_devices
        if device.device_type == gpu_dev_name
    ]

def get_num_gpus(test: rfm.RegressionTest) -> int:
    '''Return the number of GPUs available in the current partition.

    Raises ValueError if the partition defines no device named
    ``gpu_dev_name``, or defines more than one such device (in which case
    the GPU count would be ambiguous).
    '''
    gpu_list = _get_gpu_list(test)
    partname = test.current_partition.name
    if not gpu_list:
        # The original code raised 'Multiple different devices exist' even
        # when zero devices were defined; report that case accurately.
        raise ValueError(f"No device named '{gpu_dev_name}' is defined for "
                         f"partition '{partname}'. "
                         f"Cannot determine number of GPUs available for the test. "
                         f"Please check the definition of partition '{partname}' "
                         f"in your ReFrame config file.")
    if len(gpu_list) > 1:
        raise ValueError(f"Multiple different devices exist with the name "
                         f"'{gpu_dev_name}' for partition '{partname}'. "
                         f"Cannot determine number of GPUs available for the test. "
                         f"Please check the definition of partition '{partname}' "
                         f"in your ReFrame config file.")

    return gpu_list[0]

def is_gpu_present(test: rfm.RegressionTest) -> bool:
    '''Check whether the current partition defines at least one GPU device.'''
    # A non-empty device list means GPUs are present.
    return bool(_get_gpu_list(test))

def is_cuda_required(test: 'rfm.RegressionTest') -> bool:
    '''Check whether CUDA seems to be required by the current module,
    judged by a case-insensitive search for "cuda" in any module name.'''
    # any() short-circuits on the first CUDA-flavoured module; the original
    # loop always scanned the entire module list.
    return any(re.search('(?i)cuda', module) for module in test.modules)
66 changes: 66 additions & 0 deletions tests/reframe/testlib/applications/gromacs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os
import reframe as rfm
import reframe.utility.sanity as sn

# Cannot currently set required version on a library test
# @rfm.required_version('>=3.6.2')
class Gromacs(rfm.RunOnlyRegressionTest, pin_prefix=True):
    '''GROMACS benchmark based on the PRACE Benchmark Suite, GROMACS case A.

    Derived tests must specify the variables ``num_tasks``,
    ``num_tasks_per_node``, ``num_cpus_per_task``, ``nsteps`` and ``modules``.
    Note that a sufficiently large ``nsteps`` needs to be defined in order
    for GROMACS to pass the load balancing phase.  As a rough estimate:
    10000 steps would generally be ok for 24 tasks, a 100000 steps for
    240 tasks, etc.
    '''

    num_tasks = required
    num_tasks_per_node = required
    num_cpus_per_task = required
    nsteps = variable(int)  # number of MD steps to run

    descr = 'GROMACS Prace Benchmark Suite case A'
    use_multithreading = False
    executable = 'gmx_mpi'
    output_file = 'md.log'
    energy_reference = -1509290.0
    reference = {
        '*': {
            'perf': (None, None, None, 'ns/day')
        }
    }
    maintainers = ['casparvl']

    @run_before('run')
    def set_executable_opts(self):
        '''Set the executable opts, with the correct nsteps.'''
        self.executable_opts = ['mdrun', '-s ion_channel.tpr', '-maxh 0.50',
                                '-resethway', '-noconfout',
                                f'-nsteps {self.nsteps} ']

    @run_before('run')
    def set_omp_num_threads(self):
        '''Pin the OpenMP thread count to the CPUs allotted per task.'''
        self.variables = {
            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
        }

    @run_before('performance')
    def set_perf_patterns(self):
        '''Report the performance (ns/day) printed by GROMACS.'''
        self.perf_patterns = {
            'perf': sn.extractsingle(r'Performance:\s+(?P<perf>\S+)',
                                     self.output_file, 'perf', float)
        }

    @sn.sanity_function
    def get_energy(self):
        '''Extract the final total energy from the energy table in the log.'''
        return sn.extractsingle(r'\s+Coul\. recip\.\s+Potential\s+Kinetic En\.\s+Total Energy\s+Conserved En.\n'
                                r'(\s+\S+){3}\s+(?P<energy>\S+)(\s+\S+){1}\n',
                                self.output_file, 'energy', float, item=-1)

    @run_before('sanity')
    def set_sanity_patterns(self):
        '''Check that mdrun finished and the final energy is within 0.1% of
        the reference.'''
        self.sanity_patterns = sn.all([
            # Fixed typo in the original message: 'succesfully'
            sn.assert_found('Finished mdrun', self.output_file,
                            msg='Run seems to not have finished successfully'),
            sn.assert_reference(self.get_energy(), self.energy_reference,
                                -0.001, 0.001,
                                msg='Final energy reference not within expected limits')
        ])

Binary file not shown.