From 0c16809114b1ab9180532b982fd42305b51169bf Mon Sep 17 00:00:00 2001 From: Ernst Bablick Date: Fri, 27 Feb 2026 10:55:11 +0100 Subject: [PATCH] Added OCS/GCS scheduler support --- .gitignore | 1 + DESCRIPTION | 2 +- R/clustermq-package.r | 2 +- R/qsys_sge.r | 73 +++++++++++++++++ R/zzz.r | 9 +-- README.md | 2 + inst/GCS.tmpl | 49 +++++++++++ inst/OCS.tmpl | 37 +++++++++ vignettes/faq.Rmd | 4 +- vignettes/technicaldocs.Rmd | 2 + vignettes/userguide.Rmd | 157 +++++++++++++++++++++++++++++++++++- 11 files changed, 328 insertions(+), 10 deletions(-) create mode 100644 inst/GCS.tmpl create mode 100644 inst/OCS.tmpl diff --git a/.gitignore b/.gitignore index 7d526ae7..cc0fb1df 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ src/Makevars windows /doc/ /Meta/ +.idea/ diff --git a/DESCRIPTION b/DESCRIPTION index 97255f69..d5753d86 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: clustermq -Title: Evaluate Function Calls on HPC Schedulers (LSF, SGE, SLURM, PBS/Torque) +Title: Evaluate Function Calls on HPC Schedulers (LSF, SGE, GCS, OCS, SLURM, PBS/Torque) Version: 0.9.9 Authors@R: c( person('Michael', 'Schubert', email='mschu.dev@gmail.com', diff --git a/R/clustermq-package.r b/R/clustermq-package.r index 6040bdd1..6b9fe709 100644 --- a/R/clustermq-package.r +++ b/R/clustermq-package.r @@ -1,4 +1,4 @@ -#' Evaluate Function Calls on HPC Schedulers (LSF, SGE, SLURM) +#' Evaluate Function Calls on HPC Schedulers (LSF, SGE, GCS, OCS, SLURM) #' #' Provides the \code{Q} function to send arbitrary function calls to #' workers on HPC schedulers without relying on network-mounted storage. 
diff --git a/R/qsys_sge.r b/R/qsys_sge.r index 894b5cdb..91775f1c 100644 --- a/R/qsys_sge.r +++ b/R/qsys_sge.r @@ -57,6 +57,79 @@ SGE = R6::R6Class("SGE", cloneable = FALSE ) +#' Class for Open Cluster Scheduler (OCS) +OCS = R6::R6Class("OCS", + inherit = QSys, + + public = list( + initialize = function(addr, n_jobs, master, ..., template=getOption("clustermq.template", class(self)[1]), + log_worker=FALSE, log_file=NULL, verbose=TRUE) { + super$initialize(addr=addr, master=master, template=template) + + # fill the template with options and required fields + opts = private$fill_options(n_jobs=n_jobs, ...) + filled = fill_template(private$template, opts, required=c("master", "n_jobs")) + + private$job_name = opts$job_name + if (verbose) + message("Submitting ", n_jobs, " worker jobs to ", class(self)[1], " as ", sQuote(private$job_name), " ...") + + # submit the job with qsub (stdin-based) and capture the output + # on success the output will contain the job id, on failure the error message + private$qsub_stdout = system2("qsub", input=filled, stdout=TRUE) + + # check the return code and stop on error + status = attr(private$qsub_stdout, "status") + if (!is.null(status) && status != 0) + private$template_error(class(self)[1], status, filled) + + # try to read the job ID from stdout. On error stop + private$job_id <- regmatches(private$qsub_stdout, regexpr("^[0-9]+", private$qsub_stdout)) + if (length(private$job_id) == 0) + private$template_error(class(self)[1], private$qsub_stdout, filled) + + # if we got here, we have a job ID and can proceed + if (verbose) + message("Submitted job has ID ", private$job_id) + + private$master$add_pending_workers(n_jobs) + private$is_cleaned_up = FALSE + }, + + cleanup = function(success, timeout) { + # first call finalize to send qdel ... + private$finalize() + + # ... 
then set the cleaned up flag to avoid sending qdel again + private$is_cleaned_up = success + } + ), + + private = list( + qsub_stdout = NULL, + job_name = NULL, + job_id = NULL, + + finalize = function(quiet = TRUE) { + if (!private$is_cleaned_up) { + system(paste("qdel", private$job_id), ignore.stdout=quiet, ignore.stderr=quiet, wait=FALSE) + } + private$is_cleaned_up = TRUE + } + ), + + cloneable = FALSE +) + +#' Class for Gridware Cluster Scheduler (GCS) +GCS = R6::R6Class("GCS", + inherit = OCS, + cloneable = FALSE + + # no changes needed, but we want to have a separate class for GCS to allow for GCS-specific + # templates and enterprise edition options +) + PBS = R6::R6Class("PBS", inherit = SGE, diff --git a/R/zzz.r b/R/zzz.r index f6cbe7b9..8b5f85c1 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -10,9 +10,9 @@ qsys_default = toupper(getOption('clustermq.scheduler')) if (length(qsys_default) == 0) { - qname = c("SLURM", "LSF", "SGE", "LOCAL") - exec = Sys.which(c("sbatch", "bsub", "qsub")) - select = c(which(nchar(exec) > 0), 4)[1] + qname = c("SLURM", "LSF", "SGE", "GCS", "OCS", "LOCAL") + exec = Sys.which(c("sbatch", "bsub", "qsub", "qsub", "qsub")) + select = c(which(nchar(exec) > 0), 6)[1] qsys_default = qname[select] } @@ -26,8 +26,7 @@ #' @keywords internal .onAttach = function(libname, pkgname) { if (is.null(getOption("clustermq.scheduler"))) { - packageStartupMessage("* Option 'clustermq.scheduler' not set, ", - "defaulting to ", sQuote(qsys_default)) + packageStartupMessage("* Option 'clustermq.scheduler' not set, ", "defaulting to ", sQuote(qsys_default)) packageStartupMessage("--- see: https://mschubert.github.io/clustermq/articles/userguide.html#configuration") } if (!libzmq_has_draft()) { diff --git a/README.md b/README.md index 6c7d7db9..52569566 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,8 @@ schedulers](https://mschubert.github.io/clustermq/articles/userguide.html#config * 
[SLURM](https://mschubert.github.io/clustermq/articles/userguide.html#slurm) - *should work without setup* * [LSF](https://mschubert.github.io/clustermq/articles/userguide.html#lsf) - *should work without setup* * [SGE](https://mschubert.github.io/clustermq/articles/userguide.html#sge) - *may require configuration* +* [GCS](https://mschubert.github.io/clustermq/articles/userguide.html#gcs) - *works without setup* +* [OCS](https://mschubert.github.io/clustermq/articles/userguide.html#ocs) - *works without setup* * [PBS](https://mschubert.github.io/clustermq/articles/userguide.html#pbs)/[Torque](https://mschubert.github.io/clustermq/articles/userguide.html#torque) - *needs* `options(clustermq.scheduler="PBS"/"Torque")` * via [SSH](https://mschubert.github.io/clustermq/articles/userguide.html#ssh-connector) - *needs* `options(clustermq.scheduler="ssh", clustermq.ssh.host=)` diff --git a/inst/GCS.tmpl b/inst/GCS.tmpl new file mode 100644 index 00000000..8b418bbc --- /dev/null +++ b/inst/GCS.tmpl @@ -0,0 +1,49 @@ +# Submit client should only show the job ID of the new job on success +#$ -terse + +# Name of the job visible in GCS +#$ -N {{ job_name }} + +# Join error and output file. +#$ -j y + +# Location of the output file +#$ -o {{ log_file | /dev/null }} + +# Start R job in the working directory +#$ -cwd + +# Export the full environment to the R job (e.g. if *LD_LIBRARY_PATH* is required). +# Depending on security settings might require a cluster manager to set +# ENABLE_SUBMIT_LIB_PATH=1 as *qmaster_param* +#$ -V + +# Spawns workload as tasks of an array job into the scheduler (one job with multiple tasks) +#$ -t 1-{{ n_jobs }} + +# Each array task will allocate one slot in the cluster, if not otherwise specified. +#$ -pe mytestpe {{ cores | 1 }} + +# Per slot the job will get one power core (C) assuming R code is single-threaded, if not otherwise specified. 
+#$ -bunit C +#$ -bamount {{ threads | 1 }} + +# Cores on a host are packed (cores on a die or chiplet sharing same NUMA node and caches if possible) +#$ -bstrategy packed +#$ -btype host + +# The scheduler will do the binding via *HWLOC*. +# Change to *env* if scheduler should make binding decision but not do the binding itself. +#$ -binstance set + +# Allows to set resource requests like memory (1 GB [in bytes]) +# to set runtime limits (1 hour [in seconds]) +# or to influence scheduler resource selection (job will be executed in all.q queue) +#$ -l mem_free={{ memory | 1073741824 }},h_rt={{ walltime | 3600 }},q=all.q + +# Tag the job so that it can be identified later on (e.g. in a JSV script before +# submission so the job can get adapted or for filtering later on) +#$ -ac application=clustermq + +ulimit -v $(( 1024 * {{ memory | 4096 }} )) +CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' diff --git a/inst/OCS.tmpl b/inst/OCS.tmpl new file mode 100644 index 00000000..881d2619 --- /dev/null +++ b/inst/OCS.tmpl @@ -0,0 +1,37 @@ +# Submit client should only show the job ID of the new job on success +#$ -terse + +# Name of the job visible in OCS +#$ -N {{ job_name }} + +# Join error and output file +#$ -j y + +# Location of the output file +#$ -o {{ log_file | /dev/null }} + +# Start R job in the working directory +#$ -cwd + +# Export the full environment to the R job (e.g if *LD_LIBRARY_PATH* is required) +# depending on security settings might require a cluster manager to set +# ENABLE_SUBMIT_LIB_PATH=1 as *qmaster_param* +#$ -V + +# Spawns workload as tasks of an array job into the scheduler (one job with multiple tasks) +#$ -t 1-{{ n_jobs }} + +# Each array task will allocate one slot in the cluster, if not other specified. +#$ -pe mytestpe {{ cores | 1 }} + +# Per slot the job will get one power core (C) assuming R code is single-threaded, if not other specified. 
+#$ -bunit C +#$ -bamount {{ threads | 1 }} + +# Allows to set resource requests like memory (default 1 GB in bytes) + # to set runtime limits (default 1 hour in seconds) + # or to influence scheduler resource selection (job will be executed in all.q) +#$ -l mem_free={{ memory | 1073741824 }},h_rt={{ walltime | 3600 }},q=all.q + +ulimit -v $(( 1024 * {{ memory | 4096 }} )) +CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' diff --git a/vignettes/faq.Rmd b/vignettes/faq.Rmd index e1758fff..059b0ccb 100644 --- a/vignettes/faq.Rmd +++ b/vignettes/faq.Rmd @@ -71,7 +71,7 @@ Running 1 calculations (5 objs/19.4 Kb common; 1 calls/chunk) ... You will see this every time your jobs are queued but not yet started. Depending on how busy your HPC is, this may take a long time. You can check the -queueing status of your jobs in the terminal with _e.g._ `qstat` (SGE), `bjobs` +queueing status of your jobs in the terminal with _e.g._ `qstat` (SGE, GCS or OCS), `bjobs` (LSF), or `sinfo` (SLURM). If your jobs are already finished, this likely means that the `clustermq` @@ -201,7 +201,7 @@ Alternatively, you can create a script that uses SSH to execute the scheduler on the login node. For this, you will need an SSH client in the container, [keys set up for password-less login](https://www.digitalocean.com/community/tutorials/how-to-configure-ssh-key-based-authentication-on-a-linux-server), and create a script to call the scheduler on the login node via ssh (e.g. -`~/bin/qsub` for SGE/PBS/Torque, `bsub` for LSF and `sbatch` for Slurm): +`~/bin/qsub` for SGE/GCS/OCS/PBS/Torque, `bsub` for LSF and `sbatch` for Slurm): ```{sh eval=FALSE} #!/bin/bash diff --git a/vignettes/technicaldocs.Rmd b/vignettes/technicaldocs.Rmd index 029c033d..93d7eb9d 100644 --- a/vignettes/technicaldocs.Rmd +++ b/vignettes/technicaldocs.Rmd @@ -46,6 +46,8 @@ functions for submitting and cleaning up jobs: |- Multicore |- LSF + SGE + |- GCS + |- OCS |- PBS |- Torque |- etc. 
diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index f8d4a461..9f626299 100644 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -82,6 +82,8 @@ To set up a scheduler explicitly, see the following links: * [SLURM](#slurm) - *should work without setup* * [LSF](#lsf) - *should work without setup* * [SGE](#sge) - *may require configuration* +* [GCS](#gcs) - *works without setup* +* [OCS](#ocs) - *works without setup* * [PBS](#pbs)/[Torque](#torque) - *needs* `options(clustermq.scheduler="PBS"/"Torque")` * you can suggest another scheduler by [opening an issue](https://github.com/mschubert/clustermq/issues) @@ -284,7 +286,7 @@ time after you restart R. * `clustermq.scheduler` - One of the supported [`clustermq` schedulers](#configuration); options are `"LOCAL"`, - `"multiprocess"`, `"multicore"`, `"lsf"`, `"sge"`, `"slurm"`, `"pbs"`, + `"multiprocess"`, `"multicore"`, `"lsf"`, `"sge"`, `"gcs"`, `"ocs"`, `"slurm"`, `"pbs"`, `"Torque"`, or `"ssh"` (default is the HPC scheduler found in `$PATH`, otherwise `"LOCAL"`) * `clustermq.host` - The name of the node or device for constructing the @@ -481,6 +483,159 @@ In this file, `#BSUB-*` defines command-line arguments to the `bsub` program. Once this is done, the package will use your settings and no longer warn you of the missing options. + +### GCS + +Set the following options in your _R_ session that will submit jobs: + +```{r eval=FALSE} +options( + clustermq.scheduler = "gcs", + clustermq.template = "/path/to/file/below" # if using your own template +) +``` + +To supply your own template, save the contents below with any desired changes +to a file and have `clustermq.template` point to it. + +```{sh eval=FALSE} +# Name of the job visible in GCS +#$ -N {{ job_name }} + +# Join error and output file. 
+#$ -j y + +# Location of the output file +#$ -o {{ log_file | /dev/null }} + +# Start R job in the working directory +#$ -cwd + +# Export the full environment to the R job (e.g if *LD_LIBRARY_PATH* is required). +# Depending on security settings might require a cluster manager to set +# ENABLE_SUBMIT_LIB_PATH=1 as *qmaster_param* +#$ -V + +# Spawns workload as tasks of an array job into the scheduler (one job with multiple tasks) +#$ -t 1-{{ n_jobs }} + +# Each array task will allocate one slot in the cluster, if not other specified. +#$ -pe mytestpe {{ cores | 1 }} + +# Per slot the job will get one power core (C) assuming R code is single-threaded, if not other specified. +#$ -bunit C +#$ -bamount {{ threads | 1 }} + +# Cores on a host are packed (cores on a die or chiplet sharing same NUMA node and caches if possible) +#$ -bstrategy packed +#$ -btype host + +# The scheduler will do the binding via *HWLOC*. +# Change to *env* if scheduler should make binding decision but not do the binding itself. +#$ -binstance set + +# Allows to set resource requests like memory (1 GB [in bytes]) +# to set runtime limits (1 hour [in seconds]) +# or to influence scheduler resource selection (job will be executed in all.q queue) +#$ -l mem_free={{ memory | 1073741824 }},h_rt={{ walltime | 3600 }},q=all.q + +# Tag the job so that it can be identified later on (e.g. in a JSV script before +# submission so the job can get adapted or for filtering later on) +#$ -ac application=clustermq,hostname={{ master }} + +ulimit -v $(( 1024 * {{ memory | 4096 }} )) +CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' +``` + +In this file, `#$-*` defines command-line arguments to the `qsub` program. + +* Used objects (mytestpe, all.q) in the template example are default objects + available after the default installation of GCS. Adapt the template to your + local setup if these objects are not available or if you want to use other + ones. 
+* The mytestpe is limited to 5 slots by default. Increase as appropriate for + your cluster and needs. +* For other options, see the [qsub man page](https://github.com/hpc-gridware/clusterscheduler/blob/master/doc/markdown/man/man1/submit.include.md) +* Find more detailed documentation on the options in the GCS manuals part of + your product installation ($SGE_ROOT/doc/pdf) +* Do not change the identifiers in curly braces (`{{ ... }}`), as they are + used to fill in the right variables. + +Once this is done, the package will use your settings and no longer warn you of +the missing options. + + +### OCS + +Set the following options in your _R_ session that will submit jobs: + +```{r eval=FALSE} +options( + clustermq.scheduler = "ocs", + clustermq.template = "/path/to/file/below" # if using your own template +) +``` + +To supply your own template, save the contents below with any desired changes +to a file and have `clustermq.template` point to it. + +```{sh eval=FALSE} +# Name of the job visible in OCS +#$ -N {{ job_name }} + +# Join error and output file. +#$ -j y + +# Location of the output file +#$ -o {{ log_file | /dev/null }} + +# Start R job in the working directory +#$ -cwd + +# Export the full environment to the R job (e.g. if *LD_LIBRARY_PATH* is required). +# Depending on security settings might require a cluster manager to set +# ENABLE_SUBMIT_LIB_PATH=1 as *qmaster_param* +#$ -V + +# Spawns workload as tasks of an array job into the scheduler (one job with multiple tasks) +#$ -t 1-{{ n_jobs }} + +# Each array task will allocate one slot in the cluster, if not otherwise specified. +#$ -pe mytestpe {{ cores | 1 }} + +# Per slot the job will get one power core (C) assuming R code is single-threaded, if not otherwise specified. 
+#$ -bunit C +#$ -bamount {{ threads | 1 }} + +# Allows to set resource requests like memory (1 GB [in bytes]) +# to set runtime limits (1 hour [in seconds]) +# or to influence scheduler resource selection (job will be executed in all.q queue) +#$ -l mem_free={{ memory | 1073741824 }},h_rt={{ walltime | 3600 }},q=all.q + +# Tag the job so that it can be identified later on (e.g. in a JSV script before +# submission so the job can get adapted or for filtering later on) +#$ -ac application=clustermq,hostname={{ master }} + +ulimit -v $(( 1024 * {{ memory | 4096 }} )) +CMQ_AUTH={{ auth }} R --no-save --no-restore -e 'clustermq:::worker("{{ master }}")' +``` + +In this file, `#$-*` defines command-line arguments to the `qsub` program. + +* Used objects (mytestpe, all.q) in the template example are default objects + available after the default installation of OCS. Adapt the template to your + local setup if these objects are not available or if you want to use other + ones. +* The mytestpe is limited to 5 slots by default. Increase as appropriate for + your cluster and needs. +* For other options, see the [qsub man page](https://github.com/hpc-gridware/clusterscheduler/blob/master/doc/markdown/man/man1/submit.include.md) +* Do not change the identifiers in curly braces (`{{ ... }}`), as they are + used to fill in the right variables. + +Once this is done, the package will use your settings and no longer warn you of +the missing options. + + ### SGE Set the following options in your _R_ session that will submit jobs: