From fc03b2d69b5750579f8abbedf57c15809af0f28a Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 8 Oct 2025 10:22:23 +0000 Subject: [PATCH 01/15] replace point-based sampling with raster-based WCS approach --- modules/data.land/R/extract_soil_nc.R | 420 ++++++++++++-------------- 1 file changed, 187 insertions(+), 233 deletions(-) diff --git a/modules/data.land/R/extract_soil_nc.R b/modules/data.land/R/extract_soil_nc.R index a44c001cf23..e6217514684 100644 --- a/modules/data.land/R/extract_soil_nc.R +++ b/modules/data.land/R/extract_soil_nc.R @@ -13,12 +13,6 @@ #' #' @return It returns the address for the generated soil netcdf file #' -#' @section Current Limitations: -#' - MUKEY frequency weighting treats occurrence counts as proportional to area coverage -#' - This approximation may introduce geometric bias for irregular polygon data -#' - Buffer radius is set to grid_spacing/2 to reduce overlapping queries, but may still miss coverage -#' - True area-weighted aggregation using polygon geometries is planned (see issue #3609) -#' #' @importFrom rlang .data #' @examples #' \dontrun{ @@ -31,168 +25,147 @@ #' @export #' extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spacing=100, depths=c(0.15,0.30,0.60)){ - # I keep all the ensembles here - all.soil.ens <-list() - - # Grid-based spatial sampling around the center point (via WFS queries) - # This creates a grid_size x grid_size sampling grid centered on lat/lon - proj_crs <- sf::st_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs") - wgs84_crs <- sf::st_crs(4326) + all.soil.ens <- list() - # Convert single center lat/lon to projected coordinates - point_sf <- sf::st_sfc(sf::st_point(c(lon, lat)), crs = wgs84_crs) - point_proj <- sf::st_transform(point_sf, proj_crs) - coords_proj <- sf::st_coordinates(point_proj) + # create spatial bounding box + half_extent_m <- (grid_size - 1) / 2 * grid_spacing + lat_offset <- half_extent_m / 111000 + lon_offset <- half_extent_m / (111000 * cos(lat * pi / 180)) - # Define grid extent - half_extent <- (grid_size - 1) / 2 * grid_spacing - xmin <- coords_proj[1] - half_extent - xmax <- coords_proj[1] + half_extent - ymin <- coords_proj[2] - half_extent - ymax <- coords_proj[2] + half_extent - - # Create raster template - raster_template <- terra::rast( - xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, - resolution = grid_spacing, crs = proj_crs$wkt + bbox <- sf::st_bbox( + c(xmin = lon - lon_offset, + xmax = lon + lon_offset, + ymin = lat - lat_offset, + ymax = lat + lat_offset), + crs = sf::st_crs(4326) ) - grid_coords <- terra::crds(raster_template) + aoi <- sf::st_as_sfc(bbox) - # Transform grid coordinates back to WGS84 for gSSURGO queries - grid_sf <- sf::st_as_sf(data.frame(x = grid_coords[, 1], y = grid_coords[, 2]), - coords = c("x", "y"), crs = proj_crs) - grid_wgs84 <- sf::st_transform(grid_sf, wgs84_crs) - grid_coords_wgs84 <- sf::st_coordinates(grid_wgs84) + PEcAn.logger::logger.info("Querying gSSURGO Web Coverage Service for map unit keys") + mu_raster <- soilDB::mukey.wcs(aoi = aoi, db = 'gSSURGO', res = 30) - # Query gSSURGO for each grid point to capture spatial variability - buffer_radius <- grid_spacing / 2 - PEcAn.logger::logger.warn( - "Buffer radius set to grid_spacing/2 to avoid overlap", - "results may be biased due to lack of area weighting and incomplete spatial coverage." - ) - mukeys_all <- c() - for (i in seq_len(nrow(grid_coords_wgs84))) { - # Extract coordinates for this grid point (not user input) - this_lon <- grid_coords_wgs84[i, 1] - this_lat <- grid_coords_wgs84[i, 2] - - # I ask the gSSURGO to find all the mukeys (loosely can be thought of soil type) within grid_spacing distance of each grid point location. - # Basically I think of this as me going around and taking soil samples at each grid point. - #https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm - mu.Path <- paste0( - "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs?", - "SERVICE=WFS", - "&VERSION=1.1.0", - "&REQUEST=GetFeature&TYPENAME=MapunitPoly", - "&FILTER=", - "", - "", - "Geometry", - "", - "", this_lon, ",", this_lat, "", - "", - "", buffer_radius, "", - "", - "", - "&OUTPUTFORMAT=XMLMukeyList" - ) + # Extract unique mukeys and their pixel counts for area weighting + mukey_values <- terra::values(mu_raster) + mukey_values <- mukey_values[!is.na(mukey_values)] + mukey_counts <- table(mukey_values) + mukeys_all <- as.character(names(mukey_counts)) + + if (length(mukeys_all) == 0) { + PEcAn.logger::logger.severe("No mapunit keys were found for this site.") + } + + # Get soil properties using soilDB + depths_cm <- depths * 100 + all_soil_data <- list() + + for (i in seq_along(depths_cm)) { + if (i == 1) { + top_depth <- 0 + bottom_depth <- depths_cm[1] + } else { + top_depth <- depths_cm[i-1] + bottom_depth <- depths_cm[i] + } - # XML handling with temp file - temp_file <- tempfile(fileext = ".xml") - xmll <- curl::curl_download( - mu.Path, - destfile = temp_file, - handle = curl::new_handle(ssl_verifypeer = FALSE, ssl_verifyhost = FALSE) - ) + # get soil properties per mukey + soil_props <- tryCatch({ + soilDB::get_SDA_property( + property = c("sandtotal_r", "silttotal_r", "claytotal_r", "om_r", "dbthirdbar_r"), + method = "Weighted Average", + mukeys = as.integer(mukeys_all), + top_depth = top_depth, + bottom_depth = bottom_depth, + include_minors = TRUE + ) + }, error = function(e) { + PEcAn.logger::logger.error(paste("Failed to get SDA properties:", e$message)) + return(NULL) + }) - # mukey extraction with error recovery - mukey_str <- tryCatch({ - xml_doc <- XML::xmlParse(temp_file) - mapunit_nodes <- XML::getNodeSet(xml_doc, "//MapUnitKeyList") - - if (length(mapunit_nodes) > 0) { - mukey_data <- XML::xmlValue(mapunit_nodes[[1]]) - if (!is.null(mukey_data) && nchar(trimws(mukey_data)) > 0) { - mukey_data - } else { - PEcAn.logger::logger.debug(paste("Empty MapUnitKeyList for coordinates", - this_lat, ",", this_lon)) - NULL - } - } else { - PEcAn.logger::logger.debug(paste("No MapUnitKeyList found for coordinates", - this_lat, ",", this_lon, "skipping grid point")) - NULL - } + # Use fetchSDA instead of get_SDA_property to obtain complete rock fragment data + # get_SDA_property only provides frag3to10_r and fraggt10_r + # but fetchSDA returns fragvol_r which represents TOTAL rock fragment volume including + # all size classes: 2-75mm (pebbles), 75-250mm (cobbles), 250-600mm (stones), and >600mm (boulders). + # plus component weighting needed for aggregation + sda_data <- tryCatch({ + soilDB::fetchSDA( + WHERE = paste0("mukey IN (", paste(mukeys_all, collapse = ","), ")"), + duplicates = TRUE, + childs = TRUE, + nullFragsAreZero = TRUE, + rmHzErrors = TRUE + ) }, error = function(e) { - PEcAn.logger::logger.warn(paste("Failed to parse gSSURGO response for coordinates", - this_lat, ",", this_lon, ":", e$message)) - NULL + PEcAn.logger::logger.warn(paste("Failed to fetch SDA data:", e$message)) + return(NULL) }) - if (file.exists(temp_file)) unlink(temp_file) - if (is.null(mukey_str)) next - mukeys <- strsplit(mukey_str, ",")[[1]] - if (length(mukeys) == 0) next + if (!is.null(sda_data)) { + # extract horizon and site data + hz_data <- aqp::horizons(sda_data) + site_data <- aqp::site(sda_data) + + fragment_data <- hz_data %>% + dplyr::left_join(site_data[, c("cokey", "comppct_r", "mukey")], by = "cokey") %>% + dplyr::filter(hzdept_r < bottom_depth & hzdepb_r > top_depth) %>% + dplyr::mutate( + hz_top_adj = pmax(hzdept_r, top_depth), + hz_bot_adj = pmin(hzdepb_r, bottom_depth), + hz_thickness = hz_bot_adj - hz_top_adj + ) %>% + dplyr::group_by(mukey) %>% + dplyr::summarise( + fragvol_r = stats::weighted.mean( + fragvol_r, + comppct_r * hz_thickness, + na.rm = TRUE + ), + .groups = "drop" + ) + + # Merge soil properties with fragment data + depth_data <- soil_props %>% + dplyr::left_join(fragment_data, by = "mukey") %>% + dplyr::mutate( + depth_layer = depths[i], + hzdept_r = top_depth, + hzdepb_r = bottom_depth + ) + } else { + # Keep other soil data, mark fragments as explicitly missing + # complete.cases() will filter these out later + PEcAn.logger::logger.info( + paste("Fragment data unavailable for depth", top_depth, "-", bottom_depth, + "cm. These records will be excluded from final analysis.") + ) + depth_data <- soil_props %>% + dplyr::mutate( + fragvol_r = NA_real_, + depth_layer = depths[i], + hzdept_r = top_depth, + hzdepb_r = bottom_depth + ) + } - mukeys_all <- c(mukeys_all, mukeys) - } - - # mukey occurrences across all grid points - mukey_counts <- table(mukeys_all) - # Get unique mukeys from all grid points - mukeys_all <- unique(mukeys_all) - if (length(mukeys_all) == 0) { - PEcAn.logger::logger.severe("No mapunit keys were found for this site.") - return(NULL) + all_soil_data[[i]] <- depth_data + # Loop continues to next depth layer regardless } - # calling the query function sending the mapunit keys - soilprop <- gSSURGO.Query( - mukeys_all, - c("chorizon.sandtotal_r", - "chorizon.silttotal_r", - "chorizon.claytotal_r", - "chorizon.hzdept_r", - "chorizon.hzdepb_r", - "chorizon.om_r", - "chorizon.dbthirdbar_r", # bulk density at 1/3 bar (field capacity);which is the standard field capacity bulk density measurement - "chfrags.fragvol_r", - "component.comppct_r")) - - # Two-step aggregation: - # (1) Sum fragments within horizons, (2) Component area-weighting by mapunit - soilprop.weighted <- soilprop %>% - dplyr::group_by(.data$cokey, .data$hzdept_r, .data$hzdepb_r) %>% - # Each horizon may have multiple rows from different fragment size classes - # Sum fragments across size classes and remove duplicate horizon data - dplyr::mutate(fragvol_r = min(sum(.data$fragvol_r, na.rm = TRUE), 100)) %>% - dplyr::distinct() %>% # Remove duplicate rows created by multiple fragment size classes - dplyr::ungroup() %>% - # Component area-weighted aggregation by mapunit and horizon depth - dplyr::group_by(.data$mukey, .data$hzdept_r, .data$hzdepb_r) %>% - dplyr::summarise( - sandtotal_r = stats::weighted.mean(.data$sandtotal_r, .data$comppct_r, na.rm = TRUE), - silttotal_r = stats::weighted.mean(.data$silttotal_r, .data$comppct_r, na.rm = TRUE), - claytotal_r = stats::weighted.mean(.data$claytotal_r, .data$comppct_r, na.rm = TRUE), - om_r = stats::weighted.mean(.data$om_r, .data$comppct_r, na.rm = TRUE), - dbthirdbar_r = stats::weighted.mean(.data$dbthirdbar_r, .data$comppct_r, na.rm = TRUE), - fragvol_r = stats::weighted.mean(.data$fragvol_r, .data$comppct_r, na.rm = TRUE), - .groups = "drop" - ) + # Transform to match original code format + soilprop <- do.call(rbind, all_soil_data) - soilprop.new <- soilprop.weighted %>% - dplyr::arrange(.data$hzdept_r) %>% + soilprop.new <- soilprop %>% dplyr::select( - fraction_of_sand_in_soil = "sandtotal_r", # % - fraction_of_silt_in_soil = "silttotal_r", # % - fraction_of_clay_in_soil = "claytotal_r", # % - soil_depth = "hzdept_r", # cm - soil_depth_bottom = "hzdepb_r", # cm - organic_matter_pct = "om_r", # % - bulk_density = "dbthirdbar_r", # g/cm3 - coarse_fragment_pct = "fragvol_r", # % - mukey = "mukey") %>% + fraction_of_sand_in_soil = "sandtotal_r", + fraction_of_silt_in_soil = "silttotal_r", + fraction_of_clay_in_soil = "claytotal_r", + soil_depth = "hzdept_r", + soil_depth_bottom = "hzdepb_r", + organic_matter_pct = "om_r", + bulk_density = "dbthirdbar_r", + coarse_fragment_pct = "fragvol_r", + mukey = "mukey" + ) %>% dplyr::mutate( dplyr::across(c(dplyr::starts_with("fraction_of"), "coarse_fragment_pct"), ~ . / 100), @@ -205,13 +178,14 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa ) ) %>% dplyr::filter(stats::complete.cases(.)) + if(nrow(soilprop.new) == 0) { PEcAn.logger::logger.error("No valid soil properties after filtering") return(NULL) } - if(!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - #converting it to list + if(!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) + soil.data.gssurgo <- list( fraction_of_sand_in_soil = soilprop.new$fraction_of_sand_in_soil, fraction_of_silt_in_soil = soilprop.new$fraction_of_silt_in_soil, @@ -219,21 +193,19 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa soil_depth = soilprop.new$soil_depth, soil_organic_carbon_stock = soilprop.new$soil_organic_carbon_stock ) - #This ensures that I have at least one soil ensemble in case the modeling part failed - all.soil.ens <-c(all.soil.ens, list(soil.data.gssurgo)) + all.soil.ens <- c(all.soil.ens, list(soil.data.gssurgo)) - # What I do here is that I put soil data into depth classes and then model each class speparatly - #- see if we need to generate soil ensemble and add that to the list of all + # Generate modeled ensembles tryCatch({ - # find the soil depth levels based on the depth argument - # if soil profile is deeper than what is specified in the argument then I go as deep as the soil profile. - if (max(soilprop.new$soil_depth) > max(depths)) { - depths <- sort(c(depths, max(soilprop.new$soil_depth))) + # Adjust depth levels if needed + if (max(soilprop.new$soil_depth_bottom) > max(depths_cm)) { + depths_cm <- sort(c(depths_cm, max(soilprop.new$soil_depth))) } - depth.levs<-findInterval(soilprop.new$soil_depth, depths) - depth.levs[depth.levs==0] <-1 - depth.levs[depth.levs>length(depths)] <-length(depths) + + depth.levs <- findInterval(soilprop.new$soil_depth_bottom, depths_cm) + depth.levs[depth.levs == 0] <- 1 + depth.levs[depth.levs > length(depths_cm)] <- length(depths_cm) # Remove any NA depth levels valid_indices <- !is.na(depth.levs) @@ -242,76 +214,65 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa depth.levs <- depth.levs[valid_indices] } - soilprop.new.grouped<-soilprop.new %>% - dplyr::mutate(DepthL=depths[depth.levs]) + soilprop.new.grouped <- soilprop.new %>% + dplyr::mutate(DepthL = depths_cm[depth.levs]) - # let's fit dirichlet for each depth level separately - simulated.soil.props<-soilprop.new.grouped %>% - split(list(soilprop.new.grouped$DepthL, soilprop.new.grouped$mukey)) %>% - purrr::map_df(function(DepthL.Data){ + # Dirichlet modeling per mukey + simulated.soil.props <- soilprop.new.grouped %>% + split(.$mukey) %>% + purrr::map_df(function(mukey_group) { tryCatch({ - # I model the soil properties for this depth - dir.model <-DepthL.Data[,c(1:3)] %>% - as.matrix() %>% - sirt::dirichlet.mle(.) - # Monte Carlo sampling based on my dirichlet model - alpha <- dir.model$alpha - alpha <- matrix(alpha, nrow= size, ncol=length(alpha), byrow=TRUE ) + texture_data <- mukey_group[,c("fraction_of_sand_in_soil", + "fraction_of_silt_in_soil", + "fraction_of_clay_in_soil")] %>% + as.matrix() + + if(nrow(texture_data) == 0) return(NULL) + + dir.model <- sirt::dirichlet.mle(texture_data) + alpha <- matrix(dir.model$alpha, nrow = size, ncol = length(dir.model$alpha), byrow = TRUE) simulated.soil <- sirt::dirichlet.simul(alpha) - # Validate SOC data before processing - if (any(is.na(DepthL.Data$soil_organic_carbon_stock))) { - PEcAn.logger::logger.warn("Found NA values in soil_organic_carbon_stock data. Removing incomplete records.") - DepthL.Data <- DepthL.Data[!is.na(DepthL.Data$soil_organic_carbon_stock), ] - } - if (nrow(DepthL.Data) == 0) { - PEcAn.logger::logger.warn("No valid SOC data after removing NAs") - return(NULL) - } - # Simulate SOC uncertainty using Gamma distribution - soc_mean <- mean(DepthL.Data$soil_organic_carbon_stock, na.rm = TRUE) - soc_sd <- stats::sd(DepthL.Data$soil_organic_carbon_stock, na.rm = TRUE) - # Handle edge cases for SOC simulation - if (nrow(DepthL.Data) == 1) { - simulated_soc <- rep(NA_real_, size) - } else if (is.na(soc_sd) || soc_sd == 0) { + # SOC modeling + soc_mean <- mukey_group$soil_organic_carbon_stock + soc_sd <- stats::sd(soc_mean, na.rm = TRUE) + n_depths <- length(soc_mean) + + if (n_depths == 1 || is.na(soc_sd) || soc_sd == 0) { simulated_soc <- rep(NA_real_, size) } else { - shape <- (soc_mean^2) / (soc_sd^2) - rate <- soc_mean / (soc_sd^2) - simulated_soc <- stats::rgamma(size, shape=shape, rate=rate) + shape <- (mean(soc_mean, na.rm=TRUE)^2) / (soc_sd^2) + rate <- mean(soc_mean, na.rm=TRUE) / (soc_sd^2) + simulated_soc <- stats::rgamma(size, shape = shape, rate = rate) } - simulated.soil<-simulated.soil %>% - as.data.frame %>% - dplyr::mutate(DepthL=rep(DepthL.Data$DepthL[1], size), - mukey=rep(DepthL.Data$mukey[1], size), - soil_organic_carbon_stock = simulated_soc) %>% - `colnames<-`(c("fraction_of_sand_in_soil", - "fraction_of_silt_in_soil", - "fraction_of_clay_in_soil", - "soil_depth", - "mukey", - "soil_organic_carbon_stock")) - simulated.soil + result_df <- data.frame( + fraction_of_sand_in_soil = simulated.soil[,1], + fraction_of_silt_in_soil = simulated.soil[,2], + fraction_of_clay_in_soil = simulated.soil[,3], + soil_depth = mukey_group$soil_depth, + mukey = unique(mukey_group$mukey), + soil_organic_carbon_stock = simulated_soc + ) + + return(result_df) }, error = function(e) { PEcAn.logger::logger.warn(conditionMessage(e)) return(NULL) }) - }) + }) - # estimating the proportion of areas for those mukeys which are modeled - - # defining mukey_area + # calculate mukey area mukey_area <- data.frame( mukey = names(mukey_counts), Area = as.numeric(mukey_counts) / sum(mukey_counts) ) %>% dplyr::filter(.data$mukey %in% unique(simulated.soil.props$mukey)) %>% dplyr::mutate(Area = .data$Area / sum(.data$Area, na.rm = TRUE)) - #--- Mixing the depths - soil.profiles<-simulated.soil.props %>% + + # generate weighted profiles + soil.profiles <- simulated.soil.props %>% split(.$mukey) %>% purrr::map(function(soiltype.sim){ sizein <- mukey_area$Area[mukey_area$mukey == unique(soiltype.sim$mukey)] * size @@ -324,8 +285,9 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa }) }) %>% purrr::flatten() - #- add them to the list of all the ensembles ready to be converted to .nc file - all.soil.ens<-soil.profiles %>% + + # convert profiles to ensemble arrays + all.soil.ens <- soil.profiles %>% purrr::map(function(SEns){ SEns <- SEns[, names(SEns) != "mukey"] names(SEns) %>% @@ -334,24 +296,19 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa }) %>% stats::setNames(names(SEns)) }) %>% - c(all.soil.ens,.) + c(all.soil.ens, .) }, error = function(e) { PEcAn.logger::logger.warn(conditionMessage(e)) }) - - #-- generating the .nc files for all the collected ensembles + # generate NetCDF files out.ense <- (1:length(all.soil.ens)) %>% purrr::map(function(i) { - tryCatch({ - #browser() - # calc new filename prefix <- paste0("gSSURGO_soil_", i) new.file <- file.path(outdir, paste0(prefix, ".nc")) - #sending it to the func where some new params will be added and then it will be written down as nc file. suppressWarnings({ PEcAn.data.land::soil2netcdf(all.soil.ens[[i]], new.file) }) @@ -362,19 +319,16 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa return(NULL) }) }) - # removing the nulls or the ones that throw exception in the above trycatch - out.ense<- out.ense %>% + + # remove nulls + out.ense <- out.ense %>% purrr::discard(is.null) - out.ense<-out.ense %>% + out.ense <- out.ense %>% stats::setNames(rep("path", length(out.ense))) return(out.ense) -} - - - - +} #' Extract soil data from the gridpoint closest to a location From 771d05d6e6288fe841c79de6bc60d56dbc840c07 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 8 Oct 2025 10:22:47 +0000 Subject: [PATCH 02/15] update .Rd --- modules/data.land/man/extract_soil_gssurgo.Rd | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/data.land/man/extract_soil_gssurgo.Rd b/modules/data.land/man/extract_soil_gssurgo.Rd index 4f696305016..3ab1ef9ab86 100644 --- a/modules/data.land/man/extract_soil_gssurgo.Rd +++ b/modules/data.land/man/extract_soil_gssurgo.Rd @@ -40,14 +40,6 @@ This function takes a single lat/lon point and creates a spatial grid around it for sampling soil variability. The grid_size parameter determines how many grid points (grid_size x grid_size) are created around the center point. } -\section{Current Limitations}{ - -- MUKEY frequency weighting treats occurrence counts as proportional to area coverage -- This approximation may introduce geometric bias for irregular polygon data -- Buffer radius is set to grid_spacing/2 to reduce overlapping queries, but may still miss coverage -- True area-weighted aggregation using polygon geometries is planned (see issue #3609) -} - \examples{ \dontrun{ outdir <- "~/paleon/envTest" From 7f09a36b93d2e92b3f0a935cb94ef076f6e15bf4 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 8 Oct 2025 10:32:21 +0000 Subject: [PATCH 03/15] update NEWS.md --- modules/data.land/NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/data.land/NEWS.md b/modules/data.land/NEWS.md index 425008658af..ac46f2dc845 100644 --- a/modules/data.land/NEWS.md +++ b/modules/data.land/NEWS.md @@ -4,7 +4,8 @@ * New utility script `IC_SOILGRID_Utilities.R` for processing SoilGrids data to generate soil carbon initial condition (IC) files. This includes (#3508): - **`soilgrids_ic_process`**: A function to extract, process, and generate ensemble members from SoilGrids250m data. - **`preprocess_soilgrids_data`**: A helper function to handle missing values and ensure data integrity during preprocessing. - - **`generate_soilgrids_ensemble`**: A function to create ensemble members for a site based on processed soil carbon data. + - **`generate_soilgrids_ensemble`**: A function to create ensemble members for a site based on processed soil carbon data. +* `extract_soil_gssurgo()` -- replaced point-based WFS queries with raster-based WCS approach using `soilDB::mukey.wcs()` for accurate area-weighted sampling. Integrated `soilDB::get_SDA_property()` for depth-integrated soil property retrieval and `soilDB::fetchSDA()` for comprehensive rock fragment data across all size classes. This eliminates spatial coverage gaps and reduces network requests while maintaining backward compatibility. # PEcAn.data.land 1.8.2 - Removed unused parameter `machine` from put_veg_module() From beceada2d14b4fe3c378571317cdb4728243f673 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 8 Oct 2025 10:32:59 +0000 Subject: [PATCH 04/15] update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae553aadb26..040c2024f5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ section for the next release. - Fixed a bugs and BADM now process both single-site and multi-site settings, detecting the input structure and processing each site independently to generate the correct number of ensemble members per site. - Fixed "external pointer is not valid" error and addressed key bugs in `soilgrids_soilC_extract()` function (#3506) - Fixed a bug within the `model2netcdf.SIPNET` function where we assumed the constant calculations of `pecan_start_doy` across years (the calculations should vary depending on the last date from the last loop and the start date of the current loop), which will lead to incorrect calculations of the start `sub_dates` and `sub_dates_cf` if we are jumping between years (e.g., from 2012-12-31 to 2013-01-01). The `sipnet2datetime` function is no longer used anywhere and therefore has been removed. +- `extract_soil_gssurgo()` -- replaced point-based WFS queries with raster-based WCS approach using `soilDB::mukey.wcs()` for accurate area-weighted sampling. Integrated `soilDB::get_SDA_property()` for depth-integrated soil property retrieval and `soilDB::fetchSDA()` for comprehensive rock fragment data across all size classes. This eliminates spatial coverage gaps and reduces network requests while maintaining backward compatibility. ### Changed From 0103bbe9d12cbc9d8d88ee34100ba3ffaa605e12 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 16:57:06 +0000 Subject: [PATCH 05/15] replaced grid_size/grid_spacing parameters with radius,added support for custom aoi and switched to single fetchSDA call --- modules/data.land/R/extract_soil_nc.R | 257 +++++++++++--------------- 1 file changed, 110 insertions(+), 147 deletions(-) diff --git a/modules/data.land/R/extract_soil_nc.R b/modules/data.land/R/extract_soil_nc.R index e6217514684..fbfc00b9627 100644 --- a/modules/data.land/R/extract_soil_nc.R +++ b/modules/data.land/R/extract_soil_nc.R @@ -1,15 +1,15 @@ #' Extract soil data from gssurgo -#' @details This function takes a single lat/lon point and creates a spatial grid -#' around it for sampling soil variability. The grid_size parameter determines -#' how many grid points (grid_size x grid_size) are created around the center point. +#' @details This function extracts soil property data from the USDA gSSURGO database +#' for a specified area of interest. It can work with either a lat/lon point +#' (creating a circular buffer) or a custom polygon AOI. #' -#' @param outdir Output directory for writing down the netcdf file -#' @param lat Latitude of center point (single numeric value) -#' @param lon Longitude of center point (single numeric value) -#' @param size Ensemble size -#' @param grid_size Size of the spatial sampling grid around the center point (default: 3) -#' @param grid_spacing Spacing between grid cells in meters (default: 100) -#' @param depths Standard set of soil depths in m to create the ensemble of soil profiles with. +#' @param outdir Output directory for writing NetCDF files +#' @param lat Latitude of center point (optional if aoi provided) +#' @param lon Longitude of center point (optional if aoi provided) +#' @param aoi Custom area of interest as sf or terra polygon (optional) +#' @param size Ensemble size (number of ensemble members to generate) +#' @param radius Buffer radius in meters around lat/lon point (default: 500) +#' @param depths Standard soil depth intervals in meters (default: c(0.15, 0.30, 0.60)) #' #' @return It returns the address for the generated soil netcdf file #' @@ -24,27 +24,26 @@ #' @author Hamze Dokoohaki, Akash #' @export #' -extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spacing=100, depths=c(0.15,0.30,0.60)){ +extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, size = 1, radius = 500, depths = c(0.15, 0.30, 0.60)){ all.soil.ens <- list() - # create spatial bounding box - half_extent_m <- (grid_size - 1) / 2 * grid_spacing - lat_offset <- half_extent_m / 111000 - lon_offset <- half_extent_m / (111000 * cos(lat * pi / 180)) - - bbox <- sf::st_bbox( - c(xmin = lon - lon_offset, - xmax = lon + lon_offset, - ymin = lat - lat_offset, - ymax = lat + lat_offset), - crs = sf::st_crs(4326) - ) - aoi <- sf::st_as_sfc(bbox) + # Validate inputs + if (is.null(aoi) && (is.null(lat) || is.null(lon))) { + PEcAn.logger::logger.severe("Must provide either 'aoi' OR both 'lat' and 'lon'") + } + # Create AOI from point + radius if needed + if (is.null(aoi)) { + aoi <- data.frame(lon = lon, lat = lat) %>% + terra::vect(crs = "epsg:4326") %>% + terra::project("epsg:5070") %>% + terra::buffer(width = radius) %>% + terra::project("epsg:4326") + } + PEcAn.logger::logger.info("Querying gSSURGO Web Coverage Service for map unit keys") mu_raster <- soilDB::mukey.wcs(aoi = aoi, db = 'gSSURGO', res = 30) - # Extract unique mukeys and their pixel counts for area weighting mukey_values <- terra::values(mu_raster) mukey_values <- mukey_values[!is.na(mukey_values)] mukey_counts <- table(mukey_values) @@ -54,7 +53,29 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa PEcAn.logger::logger.severe("No mapunit keys were found for this site.") } - # Get soil properties using soilDB + # Fetch complete soil data + sda_data <- tryCatch({ + soilDB::fetchSDA( + WHERE = paste0("mukey IN (", paste(mukeys_all, collapse = ","), ")"), + duplicates = TRUE, + childs = TRUE, + nullFragsAreZero = TRUE, + rmHzErrors = TRUE + ) + }, error = function(e) { + PEcAn.logger::logger.error(paste("Failed to fetch SDA data:", e$message)) + return(NULL) + }) + + if (is.null(sda_data)) { + PEcAn.logger::logger.error("Could not retrieve soil data from SDA") + return(NULL) + } + + hz_data <- aqp::horizons(sda_data) + site_data <- aqp::site(sda_data) + + # Component-level aggregation by depth depths_cm <- depths * 100 all_soil_data <- list() @@ -67,108 +88,57 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa bottom_depth <- depths_cm[i] } - # get soil properties per mukey - soil_props <- tryCatch({ - soilDB::get_SDA_property( - property = c("sandtotal_r", "silttotal_r", "claytotal_r", "om_r", "dbthirdbar_r"), - method = "Weighted Average", - mukeys = as.integer(mukeys_all), - top_depth = top_depth, - bottom_depth = bottom_depth, - include_minors = TRUE - ) - }, error = function(e) { - PEcAn.logger::logger.error(paste("Failed to get SDA properties:", e$message)) - return(NULL) - }) + depth_hz <- hz_data %>% + dplyr::filter(hzdept_r < bottom_depth & hzdepb_r > top_depth) - # Use fetchSDA instead of get_SDA_property to obtain complete rock fragment data - # get_SDA_property only provides frag3to10_r and fraggt10_r - # but fetchSDA returns fragvol_r which represents TOTAL rock fragment volume including - # all size classes: 2-75mm (pebbles), 75-250mm (cobbles), 250-600mm (stones), and >600mm (boulders). - # plus component weighting needed for aggregation - sda_data <- tryCatch({ - soilDB::fetchSDA( - WHERE = paste0("mukey IN (", paste(mukeys_all, collapse = ","), ")"), - duplicates = TRUE, - childs = TRUE, - nullFragsAreZero = TRUE, - rmHzErrors = TRUE - ) - }, error = function(e) { - PEcAn.logger::logger.warn(paste("Failed to fetch SDA data:", e$message)) - return(NULL) - }) + if (nrow(depth_hz) == 0) next - if (!is.null(sda_data)) { - # extract horizon and site data - hz_data <- aqp::horizons(sda_data) - site_data <- aqp::site(sda_data) - - fragment_data <- hz_data %>% - dplyr::left_join(site_data[, c("cokey", "comppct_r", "mukey")], by = "cokey") %>% - dplyr::filter(hzdept_r < bottom_depth & hzdepb_r > top_depth) %>% - dplyr::mutate( - hz_top_adj = pmax(hzdept_r, top_depth), - hz_bot_adj = pmin(hzdepb_r, bottom_depth), - hz_thickness = hz_bot_adj - hz_top_adj - ) %>% - dplyr::group_by(mukey) %>% - dplyr::summarise( - fragvol_r = stats::weighted.mean( - fragvol_r, - comppct_r * hz_thickness, - na.rm = TRUE - ), - .groups = "drop" - ) - - # Merge soil properties with fragment data - depth_data <- soil_props %>% - dplyr::left_join(fragment_data, by = "mukey") %>% - dplyr::mutate( - depth_layer = depths[i], - hzdept_r = top_depth, - hzdepb_r = bottom_depth - ) - } else { - # Keep other soil data, mark fragments as explicitly missing - # complete.cases() will filter these out later - PEcAn.logger::logger.info( - paste("Fragment data unavailable for depth", top_depth, "-", bottom_depth, - "cm. These records will be excluded from final analysis.") + # Aggregate by COMPONENT (preserves within-mapunit variability) + component_data <- depth_hz %>% + dplyr::left_join(site_data[, c("cokey", "comppct_r", "mukey")], by = "cokey") %>% + dplyr::mutate( + hz_top_adj = pmax(hzdept_r, top_depth), + hz_bot_adj = pmin(hzdepb_r, bottom_depth), + hz_thickness = hz_bot_adj - hz_top_adj + ) %>% + dplyr::group_by(mukey, cokey, comppct_r) %>% + dplyr::summarise( + sandtotal_r = stats::weighted.mean(sandtotal_r, hz_thickness, na.rm = TRUE), + silttotal_r = stats::weighted.mean(silttotal_r, hz_thickness, na.rm = TRUE), + claytotal_r = stats::weighted.mean(claytotal_r, hz_thickness, na.rm = TRUE), + om_r = stats::weighted.mean(om_r, hz_thickness, na.rm = TRUE), + dbthirdbar_r = stats::weighted.mean(dbthirdbar_r, hz_thickness, na.rm = TRUE), + fragvol_r = stats::weighted.mean(fragvol_r, hz_thickness, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::mutate( + hzdept_r = top_depth, + hzdepb_r = bottom_depth ) - depth_data <- soil_props %>% - dplyr::mutate( - fragvol_r = NA_real_, - depth_layer = depths[i], - hzdept_r = top_depth, - hzdepb_r = bottom_depth - ) - } - all_soil_data[[i]] <- depth_data - # Loop continues to next depth layer regardless + all_soil_data[[i]] <- component_data } - # Transform to match original code format soilprop <- do.call(rbind, all_soil_data) soilprop.new <- soilprop %>% dplyr::select( fraction_of_sand_in_soil = "sandtotal_r", - fraction_of_silt_in_soil = "silttotal_r", + fraction_of_silt_in_soil = "silttotal_r", fraction_of_clay_in_soil = "claytotal_r", soil_depth = "hzdept_r", soil_depth_bottom = "hzdepb_r", organic_matter_pct = "om_r", bulk_density = "dbthirdbar_r", coarse_fragment_pct = "fragvol_r", - mukey = "mukey" + mukey = "mukey", + cokey = "cokey", + comppct_r = "comppct_r" ) %>% dplyr::mutate( dplyr::across(c(dplyr::starts_with("fraction_of"), "coarse_fragment_pct"), ~ . / 100), + coarse_fragment_pct = ifelse(is.na(coarse_fragment_pct), 0, coarse_fragment_pct), horizon_thickness_cm = .data$soil_depth_bottom - .data$soil_depth, soil_organic_carbon_stock = PEcAn.data.land::soc2ocs( soc_percent = PEcAn.data.land::om2soc(.data$organic_matter_pct), @@ -182,10 +152,10 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa if(nrow(soilprop.new) == 0) { PEcAn.logger::logger.error("No valid soil properties after filtering") return(NULL) - } + } if(!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - + soil.data.gssurgo <- list( fraction_of_sand_in_soil = soilprop.new$fraction_of_sand_in_soil, fraction_of_silt_in_soil = soilprop.new$fraction_of_silt_in_soil, @@ -193,56 +163,54 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa soil_depth = soilprop.new$soil_depth, soil_organic_carbon_stock = soilprop.new$soil_organic_carbon_stock ) - all.soil.ens <- c(all.soil.ens, list(soil.data.gssurgo)) # Generate modeled ensembles tryCatch({ - # Adjust depth levels if needed - if (max(soilprop.new$soil_depth_bottom) > max(depths_cm)) { - depths_cm <- sort(c(depths_cm, max(soilprop.new$soil_depth))) - } - depth.levs <- findInterval(soilprop.new$soil_depth_bottom, depths_cm) depth.levs[depth.levs == 0] <- 1 depth.levs[depth.levs > length(depths_cm)] <- length(depths_cm) - # Remove any NA depth levels valid_indices <- !is.na(depth.levs) if(sum(!valid_indices) > 0) { soilprop.new <- soilprop.new[valid_indices, ] depth.levs <- depth.levs[valid_indices] } - soilprop.new.grouped <- soilprop.new %>% + soilprop.new.grouped <- soilprop.new %>% dplyr::mutate(DepthL = depths_cm[depth.levs]) - # Dirichlet modeling per mukey + # Dirichlet modeling per mukey AND depth (component-level) simulated.soil.props <- soilprop.new.grouped %>% - split(.$mukey) %>% - purrr::map_df(function(mukey_group) { + split(list(soilprop.new.grouped$DepthL, soilprop.new.grouped$mukey)) %>% + purrr::map_df(function(DepthL.Data){ tryCatch({ - texture_data <- mukey_group[,c("fraction_of_sand_in_soil", + texture_data <- DepthL.Data[,c("fraction_of_sand_in_soil", "fraction_of_silt_in_soil", - "fraction_of_clay_in_soil")] %>% + "fraction_of_clay_in_soil")] %>% as.matrix() if(nrow(texture_data) == 0) return(NULL) + # Fit Dirichlet on component textures dir.model <- sirt::dirichlet.mle(texture_data) alpha <- matrix(dir.model$alpha, nrow = size, ncol = length(dir.model$alpha), byrow = TRUE) simulated.soil <- sirt::dirichlet.simul(alpha) - # SOC modeling - soc_mean <- mukey_group$soil_organic_carbon_stock - soc_sd <- stats::sd(soc_mean, na.rm = TRUE) - n_depths <- length(soc_mean) + # Component-weighted SOC + soc_values <- DepthL.Data$soil_organic_carbon_stock + weights <- DepthL.Data$comppct_r / sum(DepthL.Data$comppct_r) + + soc_mean <- stats::weighted.mean(soc_values, weights) + soc_sd <- sqrt(stats::weighted.mean((soc_values - soc_mean)^2, weights)) - if (n_depths == 1 || is.na(soc_sd) || soc_sd == 0) { - simulated_soc <- rep(NA_real_, size) + if (is.na(soc_sd) || soc_sd == 0) { + # No variability - use mean value (preserves data for single observations) + simulated_soc <- rep(soc_mean, size) } else { - shape <- (mean(soc_mean, na.rm=TRUE)^2) / (soc_sd^2) - rate <- mean(soc_mean, na.rm=TRUE) / (soc_sd^2) + # Has variability - sample from gamma distribution + shape <- (soc_mean^2) / (soc_sd^2) + rate <- soc_mean / (soc_sd^2) simulated_soc <- stats::rgamma(size, shape = shape, rate = rate) } @@ -250,8 +218,8 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa fraction_of_sand_in_soil = simulated.soil[,1], fraction_of_silt_in_soil = simulated.soil[,2], fraction_of_clay_in_soil = simulated.soil[,3], - soil_depth = mukey_group$soil_depth, - mukey = unique(mukey_group$mukey), + soil_depth = DepthL.Data$soil_depth[1], + mukey = unique(DepthL.Data$mukey), soil_organic_carbon_stock = simulated_soc ) @@ -263,7 +231,7 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa }) }) - # calculate mukey area + # Calculate area weights mukey_area <- data.frame( mukey = names(mukey_counts), Area = as.numeric(mukey_counts) / sum(mukey_counts) @@ -271,22 +239,22 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa dplyr::filter(.data$mukey %in% unique(simulated.soil.props$mukey)) %>% dplyr::mutate(Area = .data$Area / sum(.data$Area, na.rm = TRUE)) - # generate weighted profiles - soil.profiles <- simulated.soil.props %>% - split(.$mukey) %>% + # Generate weighted profiles + soil.profiles <- simulated.soil.props %>% + split(.$mukey) %>% purrr::map(function(soiltype.sim){ sizein <- mukey_area$Area[mukey_area$mukey == unique(soiltype.sim$mukey)] * size 1:ceiling(sizein) %>% purrr::map(function(x){ - soiltype.sim %>% + soiltype.sim %>% split(.$soil_depth) %>% purrr::map_dfr(~.x[x,]) }) }) %>% purrr::flatten() - # convert profiles to ensemble arrays + # Convert to ensemble arrays all.soil.ens <- soil.profiles %>% purrr::map(function(SEns){ SEns <- SEns[, names(SEns) != "mukey"] @@ -303,7 +271,7 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa PEcAn.logger::logger.warn(conditionMessage(e)) }) - # generate NetCDF files + # Generate NetCDF files out.ense <- (1:length(all.soil.ens)) %>% purrr::map(function(i) { tryCatch({ @@ -318,14 +286,9 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa PEcAn.logger::logger.warn(conditionMessage(e)) return(NULL) }) - }) - - # remove nulls - out.ense <- out.ense %>% - purrr::discard(is.null) - - out.ense <- out.ense %>% - stats::setNames(rep("path", length(out.ense))) + }) %>% + purrr::discard(is.null) %>% + stats::setNames(rep("path", length(.))) return(out.ense) } From a49755d9d33ffba383281ac0029086b5775ca2ec Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 16:59:04 +0000 Subject: [PATCH 06/15] replaced grid_size/grid_spacing param with radius --- modules/data.land/R/soil_process.R | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/data.land/R/soil_process.R b/modules/data.land/R/soil_process.R index 515a2aadcc4..3e34709aca5 100644 --- a/modules/data.land/R/soil_process.R +++ b/modules/data.land/R/soil_process.R @@ -58,19 +58,13 @@ soil_process <- function(settings, input, dbfiles, overwrite = FALSE,run.local=T names(newfile) <- rep("path", length(newfile)) if(length(newfile)==0){ - radius <- ifelse(is.null(settings$run$input$soil$radius), 100, + radius <- ifelse(is.null(settings$run$input$soil$radius), 500, as.numeric(settings$run$input$soil$radius)) - grid_size <- max(3, ifelse(is.null(settings$run$input$soil$grid_size), 3, - as.numeric(settings$run$input$soil$grid_size))) - - grid_extent <- radius * sqrt(pi) - grid_spacing <- grid_extent / (grid_size - 1) newfile <- extract_soil_gssurgo( outfolder, lat = latlon$lat, lon = latlon$lon, - grid_size = grid_size, - grid_spacing = grid_spacing + radius = radius ) # register files in DB From 3d2db8024e6218be260bcaf30ab3ab000edfc316 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 16:59:34 +0000 Subject: [PATCH 07/15] update .Rd --- modules/data.land/man/extract_soil_gssurgo.Rd | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/modules/data.land/man/extract_soil_gssurgo.Rd b/modules/data.land/man/extract_soil_gssurgo.Rd index 3ab1ef9ab86..d0416be2dca 100644 --- a/modules/data.land/man/extract_soil_gssurgo.Rd +++ b/modules/data.land/man/extract_soil_gssurgo.Rd @@ -6,28 +6,28 @@ \usage{ extract_soil_gssurgo( outdir, - lat, - lon, + lat = NULL, + lon = NULL, + aoi = NULL, size = 1, - grid_size = 3, - grid_spacing = 100, + radius = 500, depths = c(0.15, 0.3, 0.6) ) } \arguments{ -\item{outdir}{Output directory for writing down the netcdf file} +\item{outdir}{Output directory for writing NetCDF files} -\item{lat}{Latitude of center point (single numeric value)} +\item{lat}{Latitude of center point (optional if aoi provided)} -\item{lon}{Longitude of center point (single numeric value)} +\item{lon}{Longitude of center point (optional if aoi provided)} -\item{size}{Ensemble size} +\item{aoi}{Custom area of interest as sf or terra polygon (optional)} -\item{grid_size}{Size of the spatial sampling grid around the center point (default: 3)} +\item{size}{Ensemble size (number of ensemble members to generate)} -\item{grid_spacing}{Spacing between grid cells in meters (default: 100)} +\item{radius}{Buffer radius in meters around lat/lon point (default: 500)} -\item{depths}{Standard set of soil depths in m to create the ensemble of soil profiles with.} +\item{depths}{Standard soil depth intervals in meters (default: c(0.15, 0.30, 0.60))} } \value{ It returns the address for the generated soil netcdf file @@ -36,9 +36,9 @@ It returns the address for the generated soil netcdf file Extract soil data from gssurgo } \details{ -This function takes a single lat/lon point and creates a spatial grid -around it for sampling soil variability. The grid_size parameter determines -how many grid points (grid_size x grid_size) are created around the center point. +This function extracts soil property data from the USDA gSSURGO database +for a specified area of interest. It can work with either a lat/lon point +(creating a circular buffer) or a custom polygon AOI. } \examples{ \dontrun{ From 7ca57f6dd3e6882ea8c6e1962ecf9a4d82e4d7bf Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 18:18:23 +0000 Subject: [PATCH 08/15] update doceker depends --- docker/depends/pecan_package_dependencies.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index 04cef8720aa..f3e1edfe339 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -5,6 +5,7 @@ "abind",">= 1.4.5","models/ed","Imports",FALSE "abind",">= 1.4.5","modules/data.atmosphere","Imports",FALSE "amerifluxr","*","modules/data.atmosphere","Imports",FALSE +"aqp","*","modules/data.land","Imports",FALSE "arrow","*","modules/data.atmosphere","Imports",FALSE "assertthat","*","models/ed","Imports",FALSE "assertthat","*","modules/data.atmosphere","Imports",FALSE @@ -567,6 +568,7 @@ "sf","*","modules/data.land","Imports",FALSE "sf","*","modules/data.remote","Suggests",FALSE "SimilarityMeasures","*","modules/benchmark","Imports",FALSE +"soilDB","*","modules/data.land","Imports",FALSE "sirt","*","modules/data.land","Imports",FALSE "sp","*","base/visualization","Suggests",FALSE "sp","*","modules/assim.sequential","Suggests",FALSE From 0dc560b47860b95591828f01a12a0d9d8519ebf3 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 18:19:22 +0000 Subject: [PATCH 09/15] add aqp and soilDB to DESCRIPTON --- modules/data.land/DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION index 52794ca0ad7..3117e0b6937 100644 --- a/modules/data.land/DESCRIPTION +++ b/modules/data.land/DESCRIPTION @@ -25,6 +25,7 @@ URL: https://pecanproject.github.io BugReports: https://github.com/PecanProject/pecan/issues Depends: R (>= 4.1.0) Imports: + aqp, coda, curl, doSNOW, @@ -53,6 +54,7 @@ Imports: rlang, sf, sirt, + soilDB, sp, stringr, terra, From 62fb9c0e574748557b7f0e0666e3082f1752198b Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 8 Nov 2025 18:34:42 +0000 Subject: [PATCH 10/15] Revert "update doceker depends" This reverts commit 7ca57f6dd3e6882ea8c6e1962ecf9a4d82e4d7bf. --- docker/depends/pecan_package_dependencies.csv | 2 -- 1 file changed, 2 deletions(-) diff --git a/docker/depends/pecan_package_dependencies.csv b/docker/depends/pecan_package_dependencies.csv index f3e1edfe339..04cef8720aa 100644 --- a/docker/depends/pecan_package_dependencies.csv +++ b/docker/depends/pecan_package_dependencies.csv @@ -5,7 +5,6 @@ "abind",">= 1.4.5","models/ed","Imports",FALSE "abind",">= 1.4.5","modules/data.atmosphere","Imports",FALSE "amerifluxr","*","modules/data.atmosphere","Imports",FALSE -"aqp","*","modules/data.land","Imports",FALSE "arrow","*","modules/data.atmosphere","Imports",FALSE "assertthat","*","models/ed","Imports",FALSE "assertthat","*","modules/data.atmosphere","Imports",FALSE @@ -568,7 +567,6 @@ "sf","*","modules/data.land","Imports",FALSE "sf","*","modules/data.remote","Suggests",FALSE "SimilarityMeasures","*","modules/benchmark","Imports",FALSE -"soilDB","*","modules/data.land","Imports",FALSE "sirt","*","modules/data.land","Imports",FALSE "sp","*","base/visualization","Suggests",FALSE "sp","*","modules/assim.sequential","Suggests",FALSE From 6221bec5627bb69397c8045190eaece9f8d4178d Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 29 Nov 2025 07:37:54 +0000 Subject: [PATCH 11/15] use direct terra::buffer() to avoid projection distortion and normalize sand/silt/clay fractions after weighted mean aggregation --- modules/data.land/R/extract_soil_nc.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/modules/data.land/R/extract_soil_nc.R b/modules/data.land/R/extract_soil_nc.R index fbfc00b9627..3932c92bedd 100644 --- a/modules/data.land/R/extract_soil_nc.R +++ b/modules/data.land/R/extract_soil_nc.R @@ -36,9 +36,8 @@ extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, siz if (is.null(aoi)) { aoi <- data.frame(lon = lon, lat = lat) %>% terra::vect(crs = "epsg:4326") %>% - terra::project("epsg:5070") %>% - terra::buffer(width = radius) %>% - terra::project("epsg:4326") + terra::buffer(width = radius) + } PEcAn.logger::logger.info("Querying gSSURGO Web Coverage Service for map unit keys") @@ -111,6 +110,13 @@ extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, siz fragvol_r = stats::weighted.mean(fragvol_r, hz_thickness, na.rm = TRUE), .groups = "drop" ) %>% + dplyr::mutate( + tex_sum = sandtotal_r + silttotal_r + claytotal_r, + sandtotal_r = sandtotal_r / tex_sum * 100, + silttotal_r = silttotal_r / tex_sum * 100, + claytotal_r = claytotal_r / tex_sum * 100 + ) %>% + dplyr::select(-tex_sum) %>% dplyr::mutate( hzdept_r = top_depth, hzdepb_r = bottom_depth From 0f181f6a663214a49012606782c4a1cb8042ab7c Mon Sep 17 00:00:00 2001 From: divne7022 Date: Sat, 29 Nov 2025 07:40:26 +0000 Subject: [PATCH 12/15] refactored tests to reflect new updates --- .../tests/testthat/test-extract_soil_nc.R | 116 ++++++++++++++++-- 1 file changed, 106 insertions(+), 10 deletions(-) diff --git a/modules/data.land/tests/testthat/test-extract_soil_nc.R b/modules/data.land/tests/testthat/test-extract_soil_nc.R index 9574ddd952c..522676c927f 100644 --- a/modules/data.land/tests/testthat/test-extract_soil_nc.R +++ b/modules/data.land/tests/testthat/test-extract_soil_nc.R @@ -12,11 +12,10 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina lat = lat, lon = lon, size = 2, - grid_size = 3, - grid_spacing = 100, + radius = 500, depths = c(0.15, 0.30) ) - + expect_false(is.null(res)) expect_type(res, "list") @@ -30,7 +29,7 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina # Validate NetCDF content if (requireNamespace("ncdf4", quietly = TRUE)) { expected_vars <- c("fraction_of_sand_in_soil", "fraction_of_silt_in_soil", - "fraction_of_clay_in_soil", "soil_organic_carbon_stock") + "fraction_of_clay_in_soil", "soil_organic_carbon_stock") # Skip first ensemble member (first ensemble member always uses the reported values without sampling) # and use subsequent members are simulated ensemble member with uncertainty @@ -41,6 +40,7 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina for (var in expected_vars) { expect_true(var %in% names(nc$var)) } + # Validate data quality sand <- ncdf4::ncvar_get(nc, "fraction_of_sand_in_soil") silt <- ncdf4::ncvar_get(nc, "fraction_of_silt_in_soil") @@ -71,13 +71,12 @@ test_that("extract_soil_gssurgo performance is reasonable", { lat = 40.1164, lon = -88.2434, size = 1, - grid_size = 3, - grid_spacing = 100, + radius = 500, depths = c(0.15) ) end_time <- Sys.time() exec_time <- as.numeric(difftime(end_time, start_time, units = "secs")) - expect_lt(exec_time, 40) + expect_lt(exec_time, 60) }) test_that("extract_soil_gssurgo handles ensemble generation", { @@ -90,16 +89,113 @@ test_that("extract_soil_gssurgo handles ensemble generation", { lat = 40.1164, lon = -88.2434, size = 3, - grid_size = 3, - grid_spacing = 100, + radius = 500, depths = c(0.15, 0.30) ) expect_false(is.null(res)) expect_type(res, "list") - expect_equal(length(res), 4) + expect_equal(length(res), 5) + file_paths <- unlist(res) + expect_true(all(file.exists(file_paths))) +}) + +test_that("extract_soil_gssurgo works with custom AOI polygon", { + skip_on_cran() + skip_on_ci() + skip_if_not_installed("terra") + skip_if_not_installed("sf") + + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Create small polygon AOI + aoi_coords <- matrix(c( + -88.25, 40.11, + -88.24, 40.11, + -88.24, 40.12, + -88.25, 40.12, + -88.25, 40.11 + ), ncol = 2, byrow = TRUE) + + aoi <- terra::vect(aoi_coords, type = "polygons", crs = "epsg:4326") + + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + aoi = aoi, + size = 2, + depths = c(0.15, 0.30) + ) + + expect_false(is.null(res)) + expect_type(res, "list") + expect_gt(length(res), 0) file_paths <- unlist(res) expect_true(all(file.exists(file_paths))) +}) + +test_that("extract_soil_gssurgo handles different buffer radii", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Small radius + res_small <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 200, + depths = c(0.15) + ) + + # Larger radius (should potentially capture more mukeys) + res_large <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 1000, + depths = c(0.15) + ) + + expect_false(is.null(res_small)) + expect_false(is.null(res_large)) + expect_type(res_small, "list") + expect_type(res_large, "list") +}) + +test_that("extract_soil_gssurgo component-level variability is preserved", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 5, # Multiple ensemble members to check variability + radius = 500, + depths = c(0.15, 0.30) + ) + + expect_false(is.null(res)) + expect_gt(length(res), 2) + + if (requireNamespace("ncdf4", quietly = TRUE) && length(res) >= 3) { + # Compare two different ensemble members (skip first - it's unsampled) + nc1 <- ncdf4::nc_open(unlist(res)[2]) + nc2 <- ncdf4::nc_open(unlist(res)[3]) + on.exit({ + ncdf4::nc_close(nc1) + ncdf4::nc_close(nc2) + }, add = TRUE) + + sand1 <- ncdf4::ncvar_get(nc1, "fraction_of_sand_in_soil") + sand2 <- ncdf4::ncvar_get(nc2, "fraction_of_sand_in_soil") + + # Ensemble members should show variability (not identical) + expect_false(all(sand1 == sand2)) + } }) \ No newline at end of file From 5fa917792b729e50739fbf29d213b6fab1f452cd Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 3 Dec 2025 08:12:48 +0000 Subject: [PATCH 13/15] require depths to start at 0 and convert depths to n+1 breakpoints for interval processing --- modules/data.land/R/extract_soil_nc.R | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/modules/data.land/R/extract_soil_nc.R b/modules/data.land/R/extract_soil_nc.R index 3932c92bedd..f928d60e2b3 100644 --- a/modules/data.land/R/extract_soil_nc.R +++ b/modules/data.land/R/extract_soil_nc.R @@ -9,7 +9,7 @@ #' @param aoi Custom area of interest as sf or terra polygon (optional) #' @param size Ensemble size (number of ensemble members to generate) #' @param radius Buffer radius in meters around lat/lon point (default: 500) -#' @param depths Standard soil depth intervals in meters (default: c(0.15, 0.30, 0.60)) +#' @param depths Soil depth breakpoints in meters, must start with 0 (default: c(0, 0.15, 0.30, 0.60)) #' #' @return It returns the address for the generated soil netcdf file #' @@ -24,7 +24,7 @@ #' @author Hamze Dokoohaki, Akash #' @export #' -extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, size = 1, radius = 500, depths = c(0.15, 0.30, 0.60)){ +extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, size = 1, radius = 500, depths = c(0, 0.15, 0.30, 0.60)){ all.soil.ens <- list() # Validate inputs @@ -40,6 +40,14 @@ extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, siz } + # Validate depths parameter (must start with 0, like hist() breaks) + if (depths[1] != 0) { + PEcAn.logger::logger.severe( + "First depth must be 0. Use depths = c(0, 0.15, 0.30, ...) like hist() breaks. ", + "This creates n layers from n+1 breakpoints." + ) + } + PEcAn.logger::logger.info("Querying gSSURGO Web Coverage Service for map unit keys") mu_raster <- soilDB::mukey.wcs(aoi = aoi, db = 'gSSURGO', res = 30) @@ -78,14 +86,10 @@ extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, siz depths_cm <- depths * 100 all_soil_data <- list() - for (i in seq_along(depths_cm)) { - if (i == 1) { - top_depth <- 0 - bottom_depth <- depths_cm[1] - } else { - top_depth <- depths_cm[i-1] - bottom_depth <- depths_cm[i] - } + # loop through depth intervals (n+1 breaks --> n intervals, like hist()) + for (i in seq_len(length(depths_cm) - 1)) { + top_depth <- depths_cm[i] + bottom_depth <- depths_cm[i + 1] depth_hz <- hz_data %>% dplyr::filter(hzdept_r < bottom_depth & hzdepb_r > top_depth) From 55b40925171dd87b87d135d13d210def19aeece6 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 3 Dec 2025 08:13:19 +0000 Subject: [PATCH 14/15] update tests --- .../tests/testthat/test-extract_soil_nc.R | 48 ++++++++++++++++--- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/modules/data.land/tests/testthat/test-extract_soil_nc.R b/modules/data.land/tests/testthat/test-extract_soil_nc.R index 522676c927f..ee3c2a9c99e 100644 --- a/modules/data.land/tests/testthat/test-extract_soil_nc.R +++ b/modules/data.land/tests/testthat/test-extract_soil_nc.R @@ -13,7 +13,7 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina lon = lon, size = 2, radius = 500, - depths = c(0.15, 0.30) + depths = c(0, 0.15, 0.30) ) expect_false(is.null(res)) @@ -72,7 +72,7 @@ test_that("extract_soil_gssurgo performance is reasonable", { lon = -88.2434, size = 1, radius = 500, - depths = c(0.15) + depths = c(0, 0.15) ) end_time <- Sys.time() exec_time <- as.numeric(difftime(end_time, start_time, units = "secs")) @@ -90,7 +90,7 @@ test_that("extract_soil_gssurgo handles ensemble generation", { lon = -88.2434, size = 3, radius = 500, - depths = c(0.15, 0.30) + depths = c(0, 0.15, 0.30) ) expect_false(is.null(res)) @@ -124,7 +124,7 @@ test_that("extract_soil_gssurgo works with custom AOI polygon", { outdir = tmp_outdir, aoi = aoi, size = 2, - depths = c(0.15, 0.30) + depths = c(0, 0.15, 0.30) ) expect_false(is.null(res)) @@ -147,7 +147,7 @@ test_that("extract_soil_gssurgo handles different buffer radii", { lon = -88.2434, size = 1, radius = 200, - depths = c(0.15) + depths = c(0, 0.15) ) # Larger radius (should potentially capture more mukeys) @@ -157,7 +157,7 @@ test_that("extract_soil_gssurgo handles different buffer radii", { lon = -88.2434, size = 1, radius = 1000, - depths = c(0.15) + depths = c(0, 0.15) ) expect_false(is.null(res_small)) @@ -177,7 +177,7 @@ test_that("extract_soil_gssurgo component-level variability is preserved", { lon = -88.2434, size = 5, # Multiple ensemble members to check variability radius = 500, - depths = c(0.15, 0.30) + depths = c(0, 0.15, 0.30) ) expect_false(is.null(res)) @@ -198,4 +198,38 @@ test_that("extract_soil_gssurgo component-level variability is preserved", { # Ensemble members should show variability (not identical) expect_false(all(sand1 == sand2)) } +}) + +test_that("extract_soil_gssurgo requires depths to start with 0", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Disable debugging during error testing + withr::local_options(error = NULL) + + # Should error when depths doesn't start with 0 + expect_error( + extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 500, + depths = c(0.15, 0.30) # Missing 0 at start + ), + regexp = "First depth must be 0" + ) + + # Should work when depths starts with 0 + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 500, + depths = c(0, 0.15, 0.30) # Correct format + ) + + expect_false(is.null(res)) }) \ No newline at end of file From c2261b090f1c0696af0d42a46ec077013242f0c1 Mon Sep 17 00:00:00 2001 From: divne7022 Date: Wed, 3 Dec 2025 08:13:45 +0000 Subject: [PATCH 15/15] updade .Rd --- modules/data.land/man/extract_soil_gssurgo.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/data.land/man/extract_soil_gssurgo.Rd b/modules/data.land/man/extract_soil_gssurgo.Rd index d0416be2dca..4958acaef0b 100644 --- a/modules/data.land/man/extract_soil_gssurgo.Rd +++ b/modules/data.land/man/extract_soil_gssurgo.Rd @@ -11,7 +11,7 @@ extract_soil_gssurgo( aoi = NULL, size = 1, radius = 500, - depths = c(0.15, 0.3, 0.6) + depths = c(0, 0.15, 0.3, 0.6) ) } \arguments{ @@ -27,7 +27,7 @@ extract_soil_gssurgo( \item{radius}{Buffer radius in meters around lat/lon point (default: 500)} -\item{depths}{Standard soil depth intervals in meters (default: c(0.15, 0.30, 0.60))} +\item{depths}{Soil depth breakpoints in meters, must start with 0 (default: c(0, 0.15, 0.30, 0.60))} } \value{ It returns the address for the generated soil netcdf file