From 41d4ee7c1b7c245be37d38e60df5777d904e4029 Mon Sep 17 00:00:00 2001 From: hategan Date: Thu, 9 Jan 2025 15:19:53 -0800 Subject: [PATCH 1/5] On Frontier, the node count is mandatory. We should probably discuss having one node as a default and specify that in the spec. In the mean time, add one node as default for slurm. --- src/psij/executors/batch/slurm/slurm.mustache | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/psij/executors/batch/slurm/slurm.mustache b/src/psij/executors/batch/slurm/slurm.mustache index b044ba27..962bc2ab 100644 --- a/src/psij/executors/batch/slurm/slurm.mustache +++ b/src/psij/executors/batch/slurm/slurm.mustache @@ -38,6 +38,9 @@ #SBATCH --cpus-per-task={{.}} {{/cpu_cores_per_process}} {{/job.spec.resources}} +{{^job.spec.resources}} +#SBATCH --nodes=1 +{{/job.spec.resources}} {{#formatted_job_duration}} #SBATCH --time={{.}} From 8fe07286c272fad4765025f96a76a94cfea691e8 Mon Sep 17 00:00:00 2001 From: hategan Date: Thu, 9 Jan 2025 18:45:46 -0800 Subject: [PATCH 2/5] Set default resources on submit and use calculated resource numbers in templates. --- src/psij/executors/batch/cobalt/cobalt.mustache | 8 ++++---- src/psij/executors/batch/lsf/lsf.mustache | 8 ++++---- src/psij/executors/batch/slurm/slurm.mustache | 15 ++++++--------- src/psij/job_executor.py | 6 ++++++ tests/plugins1/_batch_test/test/test.mustache | 4 ++-- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/psij/executors/batch/cobalt/cobalt.mustache b/src/psij/executors/batch/cobalt/cobalt.mustache index 8281ed3e..90d9b73c 100644 --- a/src/psij/executors/batch/cobalt/cobalt.mustache +++ b/src/psij/executors/batch/cobalt/cobalt.mustache @@ -6,12 +6,12 @@ #COBALT --cwd={{.}} {{/job.spec.directory}} {{#job.spec.resources}} - {{#node_count}} + {{#computed_node_count}} #COBALT --nodecount={{.}} - {{/node_count}} - {{#process_count}} + {{/computed_node_count}} + {{#computed_process_count}} #COBALT --proccount={{.}} - {{/process_count}} + {{/computed_process_count}} {{/job.spec.resources}} {{#formatted_job_duration}} #COBALT --time={{duration}} diff --git a/src/psij/executors/batch/lsf/lsf.mustache b/src/psij/executors/batch/lsf/lsf.mustache index a616ad37..aaa5f319 100644 --- a/src/psij/executors/batch/lsf/lsf.mustache +++ b/src/psij/executors/batch/lsf/lsf.mustache @@ -20,13 +20,13 @@ {{#job.spec.resources}} - {{#node_count}} + {{#computed_node_count}} #BSUB -nnodes {{.}} - {{/node_count}} + {{/computed_node_count}} - {{#process_count}} + {{#computed_process_count}} #BSUB -n {{.}} - {{/process_count}} + {{/computed_process_count}} {{#gpu_cores_per_process}} #BSUB -gpu num={{.}}/task diff --git a/src/psij/executors/batch/slurm/slurm.mustache b/src/psij/executors/batch/slurm/slurm.mustache index 962bc2ab..b9862d1b 100644 --- a/src/psij/executors/batch/slurm/slurm.mustache +++ b/src/psij/executors/batch/slurm/slurm.mustache @@ -14,17 +14,17 @@ #SBATCH --exclusive {{/exclusive_node_use}} - {{#node_count}} + {{#computed_node_count}} #SBATCH --nodes={{.}} - {{/node_count}} + {{/computed_node_count}} - {{#process_count}} + {{#computed_process_count}} #SBATCH --ntasks={{.}} - {{/process_count}} + {{/computed_process_count}} - {{#processes_per_node}} + {{#computed_ppn}} #SBATCH --ntasks-per-node={{.}} - {{/processes_per_node}} + {{/computed_ppn}} {{#gpu_cores_per_process}} #SBATCH --gpus-per-task={{.}} @@ -38,9 +38,6 @@ #SBATCH --cpus-per-task={{.}} {{/cpu_cores_per_process}} {{/job.spec.resources}} -{{^job.spec.resources}} -#SBATCH --nodes=1 -{{/job.spec.resources}} {{#formatted_job_duration}} #SBATCH --time={{.}} diff --git a/src/psij/job_executor.py b/src/psij/job_executor.py index 9f686b73..286fc315 100644 --- a/src/psij/job_executor.py +++ b/src/psij/job_executor.py @@ -14,11 +14,15 @@ from psij.job_executor_config import JobExecutorConfig from psij.job_launcher import Launcher from psij.job_spec import JobSpec +from psij.resource_spec import ResourceSpecV1 logger = logging.getLogger(__name__) +_DEFAULT_RESOURCES = ResourceSpecV1() + + class JobExecutor(ABC): """An abstract base class for all JobExecutor implementations.""" @@ -92,6 +96,8 @@ def _check_job(self, job: Job) -> JobSpec: spec = job.spec if not spec: raise InvalidJobException('Missing specification') + if not spec.resources: + spec.resources = _DEFAULT_RESOURCES if __debug__: if spec.environment is not None: diff --git a/tests/plugins1/_batch_test/test/test.mustache b/tests/plugins1/_batch_test/test/test.mustache index bfd09a45..0759e0af 100644 --- a/tests/plugins1/_batch_test/test/test.mustache +++ b/tests/plugins1/_batch_test/test/test.mustache @@ -9,9 +9,9 @@ cd "{{.}}" export PSIJ_TEST_BATCH_EXEC_COUNT=1 {{#job.spec.resources}} - {{#process_count}} + {{#computed_process_count}} export PSIJ_TEST_BATCH_EXEC_COUNT={{.}} - {{/process_count}} + {{/computed_process_count}} {{/job.spec.resources}} {{#job.spec.attributes}} From acaa1627c3e85d2bffee32d9affaa2add11fd6e6 Mon Sep 17 00:00:00 2001 From: hategan Date: Fri, 10 Jan 2025 12:43:35 -0800 Subject: [PATCH 3/5] "computed_ppn" is not a thing. --- src/psij/executors/batch/slurm/slurm.mustache | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/psij/executors/batch/slurm/slurm.mustache b/src/psij/executors/batch/slurm/slurm.mustache index b9862d1b..68e86271 100644 --- a/src/psij/executors/batch/slurm/slurm.mustache +++ b/src/psij/executors/batch/slurm/slurm.mustache @@ -22,9 +22,9 @@ #SBATCH --ntasks={{.}} {{/computed_process_count}} - {{#computed_ppn}} + {{#computed_processes_per_node}} #SBATCH --ntasks-per-node={{.}} - {{/computed_ppn}} + {{/computed_processes_per_node}} {{#gpu_cores_per_process}} #SBATCH --gpus-per-task={{.}} From 9e2354d44f505acade95416f39d4bfa4a989a4ca Mon Sep 17 00:00:00 2001 From: hategan Date: Fri, 10 Jan 2025 14:16:07 -0800 Subject: [PATCH 4/5] Set `exclusive_node_use` default to `False` (for some reason it was `True`) and add a test (for slurm only at this time) to ensure that the corresponding parameter is not generated in the submit script. --- src/psij/resource_spec.py | 2 +- tests/test_executor.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/psij/resource_spec.py b/src/psij/resource_spec.py index 68aa680f..7b565fe4 100644 --- a/src/psij/resource_spec.py +++ b/src/psij/resource_spec.py @@ -54,7 +54,7 @@ def __init__(self, node_count: Optional[int] = None, processes_per_node: Optional[int] = None, cpu_cores_per_process: Optional[int] = None, gpu_cores_per_process: Optional[int] = None, - exclusive_node_use: bool = True) -> None: + exclusive_node_use: bool = False) -> None: """ Some of the properties of this class are constrained. Specifically, `process_count = node_count * processes_per_node`. Specifying all constrained properties diff --git a/tests/test_executor.py b/tests/test_executor.py index 0cc8b510..b940b40a 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -224,3 +224,16 @@ def test_submit_script_generation(exec_name: str) -> None: lambda k, v: setattr(attrs, k, v)) _check_str_attrs(ex, job, [prefix + '.cust_attr1', prefix + '.cust_attr2'], lambda k, v: c_attrs.__setitem__(k, v)) + + +def test_resource_generation1() -> None: + res = ResourceSpecV1() + spec = JobSpec('/bin/date', resources=res) + job = Job(spec=spec) + ex = JobExecutor.get_instance('slurm') + with TemporaryFile(mode='w+') as f: + ex.generate_submit_script(job, ex._create_script_context(job), f) + f.seek(0) + contents = f.read() + if contents.find('--exclusive') != -1: + pytest.fail('Spurious exclusive flag') From b0d3253c7a2d65e62db49602b914a74915bf822c Mon Sep 17 00:00:00 2001 From: hategan Date: Fri, 10 Jan 2025 14:18:12 -0800 Subject: [PATCH 5/5] Typechecking --- tests/test_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_executor.py b/tests/test_executor.py index b940b40a..af2bbd69 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -231,6 +231,7 @@ def test_resource_generation1() -> None: spec = JobSpec('/bin/date', resources=res) job = Job(spec=spec) ex = JobExecutor.get_instance('slurm') + assert isinstance(ex, BatchSchedulerExecutor) with TemporaryFile(mode='w+') as f: ex.generate_submit_script(job, ex._create_script_context(job), f) f.seek(0)