PecanProject · DongchenZ · Dec 1, 2025 · Dec 1, 2025 · Dec 2, 2025 · Dec 2, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,8 @@ For more information about this file see also [Keep a Changelog](http://keepacha
 ## Unreleased
 
 ### Added
-
+- Add function `qsub_sda()` for submitting SDA batch jobs by splitting a large number of sites into multiple small groups of sites (#3634).
+- Add function `generate_joint_ensemble_design()` to the current SDA workflows to maintain joint input sampling (#3634).
 - Add function `clip_and_save_raster_file()` for subsetting rasters to match a polygon of interest (#3537).
 - Add CH4 and N2O to standard_vars in PEcAn.utils
 - New function `sat_vapor_pressure()` added for computing saturation vapor pressure from temperature using various methods.

diff --git a/book_source/03_topical_pages/03_pecan_xml.Rmd b/book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -766,6 +766,13 @@ The following tags can be used for state data assimilation. More detailed inform
   <Localization.FUN>Local.support</Localization.FUN>
   <scalef>1</scalef>
   <chains>5</chains>
+  <batch.settings>
+    <general.job>
+      <cores>28</cores>
+      <folder.num>40</folder.num>
+    </general.job>
+    <qsub.cmd>qsub -l h_rt=24:00:00 -l mem_per_core=4G -l buyin -pe omp @CORES@ -V -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash</qsub.cmd>
+  </batch.settings>
   <state.variables>
    <variable>
     <variable.name>AbvGrndWood</variable.name>
@@ -860,6 +867,8 @@ The following tags can be used for state data assimilation. More detailed inform
 * **scalef** : [optional] The scale parameter used for the localization operation; the smaller the value, the more isolated the sites are.
 * **chains** : [optional] The number of chains needed to be estimated during the MCMC sampling process.
 * **_NOTE:_** If TRUE, you must also assign a vector of trait names to pick.trait.params within the sda.enkf function.
+* **batch.settings** : [optional] Configuration for batch job submission. It is used only when you submit SDA jobs through the `qsub_sda` function, which is intended for runs with a large number of sites (>500). The `general.job` section contains the number of CPUs per job (`cores`) and the number of jobs into which the entire SDA experiment is split (`folder.num`; e.g., 8,000 sites with 40 folders results in 200 sites per job).
+The `qsub.cmd` tag contains the command string used to configure extra qsub arguments.
 * **state.variable** : [required] State variable that is to be assimilated (in PEcAn standard format, with pre-specified variable name, unit, and range). Four variables can be assimilated so far: aboveground biomass (AbvGrndWood), LAI, SoilMoistFrac, and soil carbon (TotSoilCarb).
 * **Obs_Prep** : [required] This section is handled by the SDA_Obs_Assembler function; it is required if you want to use that function.
 * **spin.up** : [required] start.date and end.date for model spin up.

diff --git a/modules/assim.sequential/R/sda.enkf_MultiSite.R b/modules/assim.sequential/R/sda.enkf_MultiSite.R
@@ -329,6 +329,30 @@ sda.enkf.multisite <- function(settings,
     #reformatting params
     new.params <- sda_matchparam(settings, ensemble.samples, site.ids, nens)
     # if it's not a restart run, we will generate the joint input design.
+    # the following code checks for any mismatch between the
+    # samplingspace and the site inputs.
+    # get the input names that are registered for sampling.
+    names.sampler <- names(settings$ensemble$samplingspace)
+    # remove parameters field from the list.
+    names.sampler <- names.sampler[-which(names.sampler == "parameters")]
+    mis.match.table <- settings %>% purrr::map(function(s){
+      # get the input names for the current site.
+      names.site.input <- names(s$run$inputs)
+      # check if there is any mismatch.
+      inds <- which(!names.sampler %in% names.site.input)
+      # if this site has missing inputs.
+      if (length(inds) > 0) {
+        return(data.frame(site_id = s$run$site$id, missed.input = names.sampler[inds]))
+      }
+    }) %>% dplyr::bind_rows()
+    # if we see any site that has missing inputs.
+    if (nrow(mis.match.table) > 0) {
+      PEcAn.logger::logger.info("There are sites that have missing inputs than the sampling space.")
+      return(mis.match.table)
+    }
+
+    # find a site that has all registered inputs except for the parameter field.
+    if (all(names.sampler %in% names.site.input)) {}
-    # find a site that has all registered inputs except for the parameter field.
-    if (all(names.sampler %in% names.site.input)) {}
-    # find a site that has all registered inputs except for the parameter field.
-    if (all(names.sampler %in% names.site.input)) {}
     # get the joint input design.
     input_design <- PEcAn.uncertainty::generate_joint_ensemble_design(settings = settings[[1]], 
                                                                       ensemble_samples = ensemble.samples, 

diff --git a/modules/assim.sequential/inst/anchor/NA_downscale_script.R b/modules/assim.sequential/inst/anchor/NA_downscale_script.R
@@ -244,7 +244,7 @@ for (i in seq_along(date)) {
   for (j in seq_along(variables)) {
     # setup folder.
     variable <- variables[j]
-    folder.path <- file.path(file.path(outdir, "downscale_maps_analysis_lc_ts_noGEDI_rf"), paste0(variables[j], "_", date[i]))
+    folder.path <- file.path(file.path(outdir, "downscale_maps_analysis_lc_ts_noGEDI_debias_rf"), paste0(variables[j], "_", date[i]))
     dir.create(folder.path)
     saveRDS(list(settings = settings, 
                  analysis.yr = analysis.yr, 
@@ -254,8 +254,8 @@ for (i in seq_along(date)) {
                  folder.path = folder.path, 
                  base.map.dir = base.map.dir,
                  method = method,
-                 cores = cores, 
-                 outdir = file.path(outdir, "downscale_maps_analysis_lc_ts_noGEDI_rf")),
+                 cores = cores-1, 
+                 outdir = file.path(outdir, "downscale_maps_analysis_lc_ts_noGEDI_debias_rf")),
          file = file.path(folder.path, "dat.rds"))
     # prepare for qsub.
     jobsh <- c("#!/bin/bash -l", 
@@ -268,7 +268,7 @@ for (i in seq_along(date)) {
     jobsh <- gsub("@FOLDER_PATH@", folder.path, jobsh)
     writeLines(jobsh, con = file.path(folder.path, "job.sh"))
     # qsub command.
-    qsub <- "qsub -l h_rt=24:00:00 -l mem_per_core=4G -l buyin -pe omp @CORES@ -V -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash"
+    qsub <- "qsub -l h_rt=10:00:00 -l mem_per_core=8G -l buyin -pe omp @CORES@ -V -N @NAME@ -o @STDOUT@ -e @STDERR@ -S /bin/bash"
     qsub <- gsub("@CORES@", cores, qsub)
     qsub <- gsub("@NAME@", paste0("ds_", i, "_", j), qsub)
     qsub <- gsub("@STDOUT@", file.path(folder.path, "stdout.log"), qsub)