diff --git a/CHANGELOG.md b/CHANGELOG.md index b05e4780c8..0449f445ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,11 @@ For more information about this file see also [Keep a Changelog](http://keepacha - Fixed a bugs and BADM now process both single-site and multi-site settings, detecting the input structure and processing each site independently to generate the correct number of ensemble members per site. - Fixed "external pointer is not valid" error and addressed key bugs in `soilgrids_soilC_extract()` function (#3506) - Fixed a bug within the `model2netcdf.SIPNET` function where we assumed the constant calculations of `pecan_start_doy` across years (the calculations should vary depending on the last date from the last loop and the start date of the current loop), which will lead to incorrect calculations of the start `sub_dates` and `sub_dates_cf` if we are jumping between years (e.g., from 2012-12-31 to 2013-01-01). The `sipnet2datetime` function is no longer used anywhere and therefore has been removed. +- `extract_soil_gssurgo()` (#3643) + - Replaced point-based WFS queries with raster-based WCS approach using `soilDB::mukey.wcs()`. + - Replaced `grid_size`/`grid_spacing` parameters with `radius` (meters) for simpler buffer-based AOI creation. + - Switched to single `soilDB::fetchSDA()` call for component-level soil data retrieval, enabling better ensemble uncertainty quantification through within-map-unit variability. Added support for custom AOI polygons. + - This eliminates spatial coverage gaps and reduces network requests while maintaining backward compatibility. ### Changed diff --git a/modules/data.land/DESCRIPTION b/modules/data.land/DESCRIPTION index 69155d6054..d0c94bea9f 100644 --- a/modules/data.land/DESCRIPTION +++ b/modules/data.land/DESCRIPTION @@ -25,6 +25,7 @@ URL: https://pecanproject.github.io BugReports: https://github.com/PecanProject/pecan/issues Depends: R (>= 4.1.0) Imports: + aqp, coda, curl, doSNOW, @@ -53,6 +54,7 @@ Imports: rlang, sf, sirt, + soilDB, sp, stringr, terra, diff --git a/modules/data.land/NEWS.md b/modules/data.land/NEWS.md index b3d6c68de4..d67b33c871 100644 --- a/modules/data.land/NEWS.md +++ b/modules/data.land/NEWS.md @@ -1,3 +1,12 @@ +# PEcAn.data.land 1.9.0.9000 + +- `extract_soil_gssurgo()` now accepts a polygon area of interest or a circular buffer radius, + replacing the rectangular area previously specified through the `grid_size`/`grid_spacing` parameters. + It also now performs gSSURGO queries through the `soilDB` package, which should be faster and more + reliable than the previous hand-rolled XML queries (#3643). + + + # PEcAn.data.land 1.9.0 ## Added diff --git a/modules/data.land/R/extract_soil_nc.R b/modules/data.land/R/extract_soil_nc.R index a44c001cf2..f928d60e2b 100644 --- a/modules/data.land/R/extract_soil_nc.R +++ b/modules/data.land/R/extract_soil_nc.R @@ -1,24 +1,18 @@ #' Extract soil data from gssurgo -#' @details This function takes a single lat/lon point and creates a spatial grid -#' around it for sampling soil variability. The grid_size parameter determines -#' how many grid points (grid_size x grid_size) are created around the center point. +#' @details This function extracts soil property data from the USDA gSSURGO database +#' for a specified area of interest. It can work with either a lat/lon point +#' (creating a circular buffer) or a custom polygon AOI. #' -#' @param outdir Output directory for writing down the netcdf file -#' @param lat Latitude of center point (single numeric value) -#' @param lon Longitude of center point (single numeric value) -#' @param size Ensemble size -#' @param grid_size Size of the spatial sampling grid around the center point (default: 3) -#' @param grid_spacing Spacing between grid cells in meters (default: 100) -#' @param depths Standard set of soil depths in m to create the ensemble of soil profiles with. +#' @param outdir Output directory for writing NetCDF files +#' @param lat Latitude of center point (optional if aoi provided) +#' @param lon Longitude of center point (optional if aoi provided) +#' @param aoi Custom area of interest as sf or terra polygon (optional) +#' @param size Ensemble size (number of ensemble members to generate) +#' @param radius Buffer radius in meters around lat/lon point (default: 500) +#' @param depths Soil depth breakpoints in meters, must start with 0 (default: c(0, 0.15, 0.30, 0.60)) #' #' @return It returns the address for the generated soil netcdf file #' -#' @section Current Limitations: -#' - MUKEY frequency weighting treats occurrence counts as proportional to area coverage -#' - This approximation may introduce geometric bias for irregular polygon data -#' - Buffer radius is set to grid_spacing/2 to reduce overlapping queries, but may still miss coverage -#' - True area-weighted aggregation using polygon geometries is planned (see issue #3609) -#' #' @importFrom rlang .data #' @examples #' \dontrun{ @@ -30,172 +24,131 @@ #' @author Hamze Dokoohaki, Akash #' @export #' -extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spacing=100, depths=c(0.15,0.30,0.60)){ - # I keep all the ensembles here - all.soil.ens <-list() - - # Grid-based spatial sampling around the center point (via WFS queries) - # This creates a grid_size x grid_size sampling grid centered on lat/lon - proj_crs <- sf::st_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs") - wgs84_crs <- sf::st_crs(4326) +extract_soil_gssurgo <- function(outdir, lat = NULL, lon = NULL, aoi = NULL, size = 1, radius = 500, depths = c(0, 0.15, 0.30, 0.60)){ + all.soil.ens <- list() - # Convert single center lat/lon to projected coordinates - point_sf <- sf::st_sfc(sf::st_point(c(lon, lat)), crs = wgs84_crs) - point_proj <- sf::st_transform(point_sf, proj_crs) - coords_proj <- sf::st_coordinates(point_proj) + # Validate inputs + if (is.null(aoi) && (is.null(lat) || is.null(lon))) { + PEcAn.logger::logger.severe("Must provide either 'aoi' OR both 'lat' and 'lon'") + } - # Define grid extent - half_extent <- (grid_size - 1) / 2 * grid_spacing - xmin <- coords_proj[1] - half_extent - xmax <- coords_proj[1] + half_extent - ymin <- coords_proj[2] - half_extent - ymax <- coords_proj[2] + half_extent + # Create AOI from point + radius if needed + if (is.null(aoi)) { + aoi <- data.frame(lon = lon, lat = lat) %>% + terra::vect(crs = "epsg:4326") %>% + terra::buffer(width = radius) + + } + + # Validate depths parameter (must start with 0, like hist() breaks) + if (depths[1] != 0) { + PEcAn.logger::logger.severe( + "First depth must be 0. Use depths = c(0, 0.15, 0.30, ...) like hist() breaks. ", + "This creates n layers from n+1 breakpoints." + ) + } + + PEcAn.logger::logger.info("Querying gSSURGO Web Coverage Service for map unit keys") + mu_raster <- soilDB::mukey.wcs(aoi = aoi, db = 'gSSURGO', res = 30) - # Create raster template - raster_template <- terra::rast( - xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, - resolution = grid_spacing, crs = proj_crs$wkt - ) - grid_coords <- terra::crds(raster_template) + mukey_values <- terra::values(mu_raster) + mukey_values <- mukey_values[!is.na(mukey_values)] + mukey_counts <- table(mukey_values) + mukeys_all <- as.character(names(mukey_counts)) - # Transform grid coordinates back to WGS84 for gSSURGO queries - grid_sf <- sf::st_as_sf(data.frame(x = grid_coords[, 1], y = grid_coords[, 2]), - coords = c("x", "y"), crs = proj_crs) - grid_wgs84 <- sf::st_transform(grid_sf, wgs84_crs) - grid_coords_wgs84 <- sf::st_coordinates(grid_wgs84) + if (length(mukeys_all) == 0) { + PEcAn.logger::logger.severe("No mapunit keys were found for this site.") + } - # Query gSSURGO for each grid point to capture spatial variability - buffer_radius <- grid_spacing / 2 - PEcAn.logger::logger.warn( - "Buffer radius set to grid_spacing/2 to avoid overlap", - "results may be biased due to lack of area weighting and incomplete spatial coverage." - ) - mukeys_all <- c() - for (i in seq_len(nrow(grid_coords_wgs84))) { - # Extract coordinates for this grid point (not user input) - this_lon <- grid_coords_wgs84[i, 1] - this_lat <- grid_coords_wgs84[i, 2] - - # I ask the gSSURGO to find all the mukeys (loosely can be thought of soil type) within grid_spacing distance of each grid point location. - # Basically I think of this as me going around and taking soil samples at each grid point. - #https://sdmdataaccess.nrcs.usda.gov/SpatialFilterHelp.htm - mu.Path <- paste0( - "https://sdmdataaccess.nrcs.usda.gov/Spatial/SDMWGS84Geographic.wfs?", - "SERVICE=WFS", - "&VERSION=1.1.0", - "&REQUEST=GetFeature&TYPENAME=MapunitPoly", - "&FILTER=", - "", - "", - "Geometry", - "", - "", this_lon, ",", this_lat, "", - "", - "", buffer_radius, "", - "", - "", - "&OUTPUTFORMAT=XMLMukeyList" + # Fetch complete soil data + sda_data <- tryCatch({ + soilDB::fetchSDA( + WHERE = paste0("mukey IN (", paste(mukeys_all, collapse = ","), ")"), + duplicates = TRUE, + childs = TRUE, + nullFragsAreZero = TRUE, + rmHzErrors = TRUE ) + }, error = function(e) { + PEcAn.logger::logger.error(paste("Failed to fetch SDA data:", e$message)) + return(NULL) + }) + + if (is.null(sda_data)) { + PEcAn.logger::logger.error("Could not retrieve soil data from SDA") + return(NULL) + } + + hz_data <- aqp::horizons(sda_data) + site_data <- aqp::site(sda_data) + + # Component-level aggregation by depth + depths_cm <- depths * 100 + all_soil_data <- list() + + # loop through depth intervals (n+1 breaks --> n intervals, like hist()) + for (i in seq_len(length(depths_cm) - 1)) { + top_depth <- depths_cm[i] + bottom_depth <- depths_cm[i + 1] - # XML handling with temp file - temp_file <- tempfile(fileext = ".xml") - xmll <- curl::curl_download( - mu.Path, - destfile = temp_file, - handle = curl::new_handle(ssl_verifypeer = FALSE, ssl_verifyhost = FALSE) - ) + depth_hz <- hz_data %>% + dplyr::filter(hzdept_r < bottom_depth & hzdepb_r > top_depth) - # mukey extraction with error recovery - mukey_str <- tryCatch({ - xml_doc <- XML::xmlParse(temp_file) - mapunit_nodes <- XML::getNodeSet(xml_doc, "//MapUnitKeyList") - - if (length(mapunit_nodes) > 0) { - mukey_data <- XML::xmlValue(mapunit_nodes[[1]]) - if (!is.null(mukey_data) && nchar(trimws(mukey_data)) > 0) { - mukey_data - } else { - PEcAn.logger::logger.debug(paste("Empty MapUnitKeyList for coordinates", - this_lat, ",", this_lon)) - NULL - } - } else { - PEcAn.logger::logger.debug(paste("No MapUnitKeyList found for coordinates", - this_lat, ",", this_lon, "skipping grid point")) - NULL - } - }, error = function(e) { - PEcAn.logger::logger.warn(paste("Failed to parse gSSURGO response for coordinates", - this_lat, ",", this_lon, ":", e$message)) - NULL - }) - if (file.exists(temp_file)) unlink(temp_file) - if (is.null(mukey_str)) next + if (nrow(depth_hz) == 0) next - mukeys <- strsplit(mukey_str, ",")[[1]] - if (length(mukeys) == 0) next + # Aggregate by COMPONENT (preserves within-mapunit variability) + component_data <- depth_hz %>% + dplyr::left_join(site_data[, c("cokey", "comppct_r", "mukey")], by = "cokey") %>% + dplyr::mutate( + hz_top_adj = pmax(hzdept_r, top_depth), + hz_bot_adj = pmin(hzdepb_r, bottom_depth), + hz_thickness = hz_bot_adj - hz_top_adj + ) %>% + dplyr::group_by(mukey, cokey, comppct_r) %>% + dplyr::summarise( + sandtotal_r = stats::weighted.mean(sandtotal_r, hz_thickness, na.rm = TRUE), + silttotal_r = stats::weighted.mean(silttotal_r, hz_thickness, na.rm = TRUE), + claytotal_r = stats::weighted.mean(claytotal_r, hz_thickness, na.rm = TRUE), + om_r = stats::weighted.mean(om_r, hz_thickness, na.rm = TRUE), + dbthirdbar_r = stats::weighted.mean(dbthirdbar_r, hz_thickness, na.rm = TRUE), + fragvol_r = stats::weighted.mean(fragvol_r, hz_thickness, na.rm = TRUE), + .groups = "drop" + ) %>% + dplyr::mutate( + tex_sum = sandtotal_r + silttotal_r + claytotal_r, + sandtotal_r = sandtotal_r / tex_sum * 100, + silttotal_r = silttotal_r / tex_sum * 100, + claytotal_r = claytotal_r / tex_sum * 100 + ) %>% + dplyr::select(-tex_sum) %>% + dplyr::mutate( + hzdept_r = top_depth, + hzdepb_r = bottom_depth + ) - mukeys_all <- c(mukeys_all, mukeys) - } - - # mukey occurrences across all grid points - mukey_counts <- table(mukeys_all) - # Get unique mukeys from all grid points - mukeys_all <- unique(mukeys_all) - if (length(mukeys_all) == 0) { - PEcAn.logger::logger.severe("No mapunit keys were found for this site.") - return(NULL) + all_soil_data[[i]] <- component_data } - # calling the query function sending the mapunit keys - soilprop <- gSSURGO.Query( - mukeys_all, - c("chorizon.sandtotal_r", - "chorizon.silttotal_r", - "chorizon.claytotal_r", - "chorizon.hzdept_r", - "chorizon.hzdepb_r", - "chorizon.om_r", - "chorizon.dbthirdbar_r", # bulk density at 1/3 bar (field capacity);which is the standard field capacity bulk density measurement - "chfrags.fragvol_r", - "component.comppct_r")) - - # Two-step aggregation: - # (1) Sum fragments within horizons, (2) Component area-weighting by mapunit - soilprop.weighted <- soilprop %>% - dplyr::group_by(.data$cokey, .data$hzdept_r, .data$hzdepb_r) %>% - # Each horizon may have multiple rows from different fragment size classes - # Sum fragments across size classes and remove duplicate horizon data - dplyr::mutate(fragvol_r = min(sum(.data$fragvol_r, na.rm = TRUE), 100)) %>% - dplyr::distinct() %>% # Remove duplicate rows created by multiple fragment size classes - dplyr::ungroup() %>% - # Component area-weighted aggregation by mapunit and horizon depth - dplyr::group_by(.data$mukey, .data$hzdept_r, .data$hzdepb_r) %>% - dplyr::summarise( - sandtotal_r = stats::weighted.mean(.data$sandtotal_r, .data$comppct_r, na.rm = TRUE), - silttotal_r = stats::weighted.mean(.data$silttotal_r, .data$comppct_r, na.rm = TRUE), - claytotal_r = stats::weighted.mean(.data$claytotal_r, .data$comppct_r, na.rm = TRUE), - om_r = stats::weighted.mean(.data$om_r, .data$comppct_r, na.rm = TRUE), - dbthirdbar_r = stats::weighted.mean(.data$dbthirdbar_r, .data$comppct_r, na.rm = TRUE), - fragvol_r = stats::weighted.mean(.data$fragvol_r, .data$comppct_r, na.rm = TRUE), - .groups = "drop" - ) + soilprop <- do.call(rbind, all_soil_data) - soilprop.new <- soilprop.weighted %>% - dplyr::arrange(.data$hzdept_r) %>% + soilprop.new <- soilprop %>% dplyr::select( - fraction_of_sand_in_soil = "sandtotal_r", # % - fraction_of_silt_in_soil = "silttotal_r", # % - fraction_of_clay_in_soil = "claytotal_r", # % - soil_depth = "hzdept_r", # cm - soil_depth_bottom = "hzdepb_r", # cm - organic_matter_pct = "om_r", # % - bulk_density = "dbthirdbar_r", # g/cm3 - coarse_fragment_pct = "fragvol_r", # % - mukey = "mukey") %>% + fraction_of_sand_in_soil = "sandtotal_r", + fraction_of_silt_in_soil = "silttotal_r", + fraction_of_clay_in_soil = "claytotal_r", + soil_depth = "hzdept_r", + soil_depth_bottom = "hzdepb_r", + organic_matter_pct = "om_r", + bulk_density = "dbthirdbar_r", + coarse_fragment_pct = "fragvol_r", + mukey = "mukey", + cokey = "cokey", + comppct_r = "comppct_r" + ) %>% dplyr::mutate( dplyr::across(c(dplyr::starts_with("fraction_of"), "coarse_fragment_pct"), ~ . / 100), + coarse_fragment_pct = ifelse(is.na(coarse_fragment_pct), 0, coarse_fragment_pct), horizon_thickness_cm = .data$soil_depth_bottom - .data$soil_depth, soil_organic_carbon_stock = PEcAn.data.land::soc2ocs( soc_percent = PEcAn.data.land::om2soc(.data$organic_matter_pct), @@ -205,13 +158,14 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa ) ) %>% dplyr::filter(stats::complete.cases(.)) + if(nrow(soilprop.new) == 0) { PEcAn.logger::logger.error("No valid soil properties after filtering") return(NULL) - } + } + if(!dir.exists(outdir)) dir.create(outdir, recursive = TRUE) - #converting it to list soil.data.gssurgo <- list( fraction_of_sand_in_soil = soilprop.new$fraction_of_sand_in_soil, fraction_of_silt_in_soil = soilprop.new$fraction_of_silt_in_soil, @@ -219,113 +173,99 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa soil_depth = soilprop.new$soil_depth, soil_organic_carbon_stock = soilprop.new$soil_organic_carbon_stock ) - #This ensures that I have at least one soil ensemble in case the modeling part failed - all.soil.ens <-c(all.soil.ens, list(soil.data.gssurgo)) - + all.soil.ens <- c(all.soil.ens, list(soil.data.gssurgo)) - # What I do here is that I put soil data into depth classes and then model each class speparatly - #- see if we need to generate soil ensemble and add that to the list of all + # Generate modeled ensembles tryCatch({ - # find the soil depth levels based on the depth argument - # if soil profile is deeper than what is specified in the argument then I go as deep as the soil profile. - if (max(soilprop.new$soil_depth) > max(depths)) { - depths <- sort(c(depths, max(soilprop.new$soil_depth))) - } - depth.levs<-findInterval(soilprop.new$soil_depth, depths) - depth.levs[depth.levs==0] <-1 - depth.levs[depth.levs>length(depths)] <-length(depths) + depth.levs <- findInterval(soilprop.new$soil_depth_bottom, depths_cm) + depth.levs[depth.levs == 0] <- 1 + depth.levs[depth.levs > length(depths_cm)] <- length(depths_cm) - # Remove any NA depth levels valid_indices <- !is.na(depth.levs) if(sum(!valid_indices) > 0) { soilprop.new <- soilprop.new[valid_indices, ] depth.levs <- depth.levs[valid_indices] } - soilprop.new.grouped<-soilprop.new %>% - dplyr::mutate(DepthL=depths[depth.levs]) + soilprop.new.grouped <- soilprop.new %>% + dplyr::mutate(DepthL = depths_cm[depth.levs]) - # let's fit dirichlet for each depth level separately - simulated.soil.props<-soilprop.new.grouped %>% + # Dirichlet modeling per mukey AND depth (component-level) + simulated.soil.props <- soilprop.new.grouped %>% split(list(soilprop.new.grouped$DepthL, soilprop.new.grouped$mukey)) %>% purrr::map_df(function(DepthL.Data){ tryCatch({ - # I model the soil properties for this depth - dir.model <-DepthL.Data[,c(1:3)] %>% - as.matrix() %>% - sirt::dirichlet.mle(.) - # Monte Carlo sampling based on my dirichlet model - alpha <- dir.model$alpha - alpha <- matrix(alpha, nrow= size, ncol=length(alpha), byrow=TRUE ) + texture_data <- DepthL.Data[,c("fraction_of_sand_in_soil", + "fraction_of_silt_in_soil", + "fraction_of_clay_in_soil")] %>% + as.matrix() + + if(nrow(texture_data) == 0) return(NULL) + + # Fit Dirichlet on component textures + dir.model <- sirt::dirichlet.mle(texture_data) + alpha <- matrix(dir.model$alpha, nrow = size, ncol = length(dir.model$alpha), byrow = TRUE) simulated.soil <- sirt::dirichlet.simul(alpha) - # Validate SOC data before processing - if (any(is.na(DepthL.Data$soil_organic_carbon_stock))) { - PEcAn.logger::logger.warn("Found NA values in soil_organic_carbon_stock data. Removing incomplete records.") - DepthL.Data <- DepthL.Data[!is.na(DepthL.Data$soil_organic_carbon_stock), ] - } - if (nrow(DepthL.Data) == 0) { - PEcAn.logger::logger.warn("No valid SOC data after removing NAs") - return(NULL) - } - # Simulate SOC uncertainty using Gamma distribution - soc_mean <- mean(DepthL.Data$soil_organic_carbon_stock, na.rm = TRUE) - soc_sd <- stats::sd(DepthL.Data$soil_organic_carbon_stock, na.rm = TRUE) - # Handle edge cases for SOC simulation - if (nrow(DepthL.Data) == 1) { - simulated_soc <- rep(NA_real_, size) - } else if (is.na(soc_sd) || soc_sd == 0) { - simulated_soc <- rep(NA_real_, size) + # Component-weighted SOC + soc_values <- DepthL.Data$soil_organic_carbon_stock + weights <- DepthL.Data$comppct_r / sum(DepthL.Data$comppct_r) + + soc_mean <- stats::weighted.mean(soc_values, weights) + soc_sd <- sqrt(stats::weighted.mean((soc_values - soc_mean)^2, weights)) + + if (is.na(soc_sd) || soc_sd == 0) { + # No variability - use mean value (preserves data for single observations) + simulated_soc <- rep(soc_mean, size) } else { + # Has variability - sample from gamma distribution shape <- (soc_mean^2) / (soc_sd^2) rate <- soc_mean / (soc_sd^2) - simulated_soc <- stats::rgamma(size, shape=shape, rate=rate) + simulated_soc <- stats::rgamma(size, shape = shape, rate = rate) } - simulated.soil<-simulated.soil %>% - as.data.frame %>% - dplyr::mutate(DepthL=rep(DepthL.Data$DepthL[1], size), - mukey=rep(DepthL.Data$mukey[1], size), - soil_organic_carbon_stock = simulated_soc) %>% - `colnames<-`(c("fraction_of_sand_in_soil", - "fraction_of_silt_in_soil", - "fraction_of_clay_in_soil", - "soil_depth", - "mukey", - "soil_organic_carbon_stock")) - simulated.soil + result_df <- data.frame( + fraction_of_sand_in_soil = simulated.soil[,1], + fraction_of_silt_in_soil = simulated.soil[,2], + fraction_of_clay_in_soil = simulated.soil[,3], + soil_depth = DepthL.Data$soil_depth[1], + mukey = unique(DepthL.Data$mukey), + soil_organic_carbon_stock = simulated_soc + ) + + return(result_df) }, error = function(e) { PEcAn.logger::logger.warn(conditionMessage(e)) return(NULL) }) - }) + }) - # estimating the proportion of areas for those mukeys which are modeled - - # defining mukey_area + # Calculate area weights mukey_area <- data.frame( mukey = names(mukey_counts), Area = as.numeric(mukey_counts) / sum(mukey_counts) ) %>% dplyr::filter(.data$mukey %in% unique(simulated.soil.props$mukey)) %>% dplyr::mutate(Area = .data$Area / sum(.data$Area, na.rm = TRUE)) - #--- Mixing the depths - soil.profiles<-simulated.soil.props %>% - split(.$mukey) %>% + + # Generate weighted profiles + soil.profiles <- simulated.soil.props %>% + split(.$mukey) %>% purrr::map(function(soiltype.sim){ sizein <- mukey_area$Area[mukey_area$mukey == unique(soiltype.sim$mukey)] * size 1:ceiling(sizein) %>% purrr::map(function(x){ - soiltype.sim %>% + soiltype.sim %>% split(.$soil_depth) %>% purrr::map_dfr(~.x[x,]) }) }) %>% purrr::flatten() - #- add them to the list of all the ensembles ready to be converted to .nc file - all.soil.ens<-soil.profiles %>% + + # Convert to ensemble arrays + all.soil.ens <- soil.profiles %>% purrr::map(function(SEns){ SEns <- SEns[, names(SEns) != "mukey"] names(SEns) %>% @@ -334,24 +274,19 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa }) %>% stats::setNames(names(SEns)) }) %>% - c(all.soil.ens,.) + c(all.soil.ens, .) }, error = function(e) { PEcAn.logger::logger.warn(conditionMessage(e)) }) - - #-- generating the .nc files for all the collected ensembles + # Generate NetCDF files out.ense <- (1:length(all.soil.ens)) %>% purrr::map(function(i) { - tryCatch({ - #browser() - # calc new filename prefix <- paste0("gSSURGO_soil_", i) new.file <- file.path(outdir, paste0(prefix, ".nc")) - #sending it to the func where some new params will be added and then it will be written down as nc file. suppressWarnings({ PEcAn.data.land::soil2netcdf(all.soil.ens[[i]], new.file) }) @@ -361,20 +296,12 @@ extract_soil_gssurgo <- function(outdir, lat, lon, size=1, grid_size=3, grid_spa PEcAn.logger::logger.warn(conditionMessage(e)) return(NULL) }) - }) - # removing the nulls or the ones that throw exception in the above trycatch - out.ense<- out.ense %>% - purrr::discard(is.null) - - out.ense<-out.ense %>% - stats::setNames(rep("path", length(out.ense))) + }) %>% + purrr::discard(is.null) %>% + stats::setNames(rep("path", length(.))) return(out.ense) -} - - - - +} #' Extract soil data from the gridpoint closest to a location diff --git a/modules/data.land/R/soil_process.R b/modules/data.land/R/soil_process.R index 515a2aadcc..3e34709aca 100644 --- a/modules/data.land/R/soil_process.R +++ b/modules/data.land/R/soil_process.R @@ -58,19 +58,13 @@ soil_process <- function(settings, input, dbfiles, overwrite = FALSE,run.local=T names(newfile) <- rep("path", length(newfile)) if(length(newfile)==0){ - radius <- ifelse(is.null(settings$run$input$soil$radius), 100, + radius <- ifelse(is.null(settings$run$input$soil$radius), 500, as.numeric(settings$run$input$soil$radius)) - grid_size <- max(3, ifelse(is.null(settings$run$input$soil$grid_size), 3, - as.numeric(settings$run$input$soil$grid_size))) - - grid_extent <- radius * sqrt(pi) - grid_spacing <- grid_extent / (grid_size - 1) newfile <- extract_soil_gssurgo( outfolder, lat = latlon$lat, lon = latlon$lon, - grid_size = grid_size, - grid_spacing = grid_spacing + radius = radius ) # register files in DB diff --git a/modules/data.land/man/extract_soil_gssurgo.Rd b/modules/data.land/man/extract_soil_gssurgo.Rd index 4f69630501..4958acaef0 100644 --- a/modules/data.land/man/extract_soil_gssurgo.Rd +++ b/modules/data.land/man/extract_soil_gssurgo.Rd @@ -6,28 +6,28 @@ \usage{ extract_soil_gssurgo( outdir, - lat, - lon, + lat = NULL, + lon = NULL, + aoi = NULL, size = 1, - grid_size = 3, - grid_spacing = 100, - depths = c(0.15, 0.3, 0.6) + radius = 500, + depths = c(0, 0.15, 0.3, 0.6) ) } \arguments{ -\item{outdir}{Output directory for writing down the netcdf file} +\item{outdir}{Output directory for writing NetCDF files} -\item{lat}{Latitude of center point (single numeric value)} +\item{lat}{Latitude of center point (optional if aoi provided)} -\item{lon}{Longitude of center point (single numeric value)} +\item{lon}{Longitude of center point (optional if aoi provided)} -\item{size}{Ensemble size} +\item{aoi}{Custom area of interest as sf or terra polygon (optional)} -\item{grid_size}{Size of the spatial sampling grid around the center point (default: 3)} +\item{size}{Ensemble size (number of ensemble members to generate)} -\item{grid_spacing}{Spacing between grid cells in meters (default: 100)} +\item{radius}{Buffer radius in meters around lat/lon point (default: 500)} -\item{depths}{Standard set of soil depths in m to create the ensemble of soil profiles with.} +\item{depths}{Soil depth breakpoints in meters, must start with 0 (default: c(0, 0.15, 0.30, 0.60))} } \value{ It returns the address for the generated soil netcdf file @@ -36,18 +36,10 @@ It returns the address for the generated soil netcdf file Extract soil data from gssurgo } \details{ -This function takes a single lat/lon point and creates a spatial grid -around it for sampling soil variability. The grid_size parameter determines -how many grid points (grid_size x grid_size) are created around the center point. +This function extracts soil property data from the USDA gSSURGO database +for a specified area of interest. It can work with either a lat/lon point +(creating a circular buffer) or a custom polygon AOI. } -\section{Current Limitations}{ - -- MUKEY frequency weighting treats occurrence counts as proportional to area coverage -- This approximation may introduce geometric bias for irregular polygon data -- Buffer radius is set to grid_spacing/2 to reduce overlapping queries, but may still miss coverage -- True area-weighted aggregation using polygon geometries is planned (see issue #3609) -} - \examples{ \dontrun{ outdir <- "~/paleon/envTest" diff --git a/modules/data.land/tests/testthat/test-extract_soil_nc.R b/modules/data.land/tests/testthat/test-extract_soil_nc.R index 9574ddd952..ee3c2a9c99 100644 --- a/modules/data.land/tests/testthat/test-extract_soil_nc.R +++ b/modules/data.land/tests/testthat/test-extract_soil_nc.R @@ -12,11 +12,10 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina lat = lat, lon = lon, size = 2, - grid_size = 3, - grid_spacing = 100, - depths = c(0.15, 0.30) + radius = 500, + depths = c(0, 0.15, 0.30) ) - + expect_false(is.null(res)) expect_type(res, "list") @@ -30,7 +29,7 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina # Validate NetCDF content if (requireNamespace("ncdf4", quietly = TRUE)) { expected_vars <- c("fraction_of_sand_in_soil", "fraction_of_silt_in_soil", - "fraction_of_clay_in_soil", "soil_organic_carbon_stock") + "fraction_of_clay_in_soil", "soil_organic_carbon_stock") # Skip first ensemble member (first ensemble member always uses the reported values without sampling) # and use subsequent members are simulated ensemble member with uncertainty @@ -41,6 +40,7 @@ test_that("extract_soil_gssurgo returns valid NetCDF files for valid US coordina for (var in expected_vars) { expect_true(var %in% names(nc$var)) } + # Validate data quality sand <- ncdf4::ncvar_get(nc, "fraction_of_sand_in_soil") silt <- ncdf4::ncvar_get(nc, "fraction_of_silt_in_soil") @@ -71,13 +71,12 @@ test_that("extract_soil_gssurgo performance is reasonable", { lat = 40.1164, lon = -88.2434, size = 1, - grid_size = 3, - grid_spacing = 100, - depths = c(0.15) + radius = 500, + depths = c(0, 0.15) ) end_time <- Sys.time() exec_time <- as.numeric(difftime(end_time, start_time, units = "secs")) - expect_lt(exec_time, 40) + expect_lt(exec_time, 60) }) test_that("extract_soil_gssurgo handles ensemble generation", { @@ -90,16 +89,147 @@ test_that("extract_soil_gssurgo handles ensemble generation", { lat = 40.1164, lon = -88.2434, size = 3, - grid_size = 3, - grid_spacing = 100, - depths = c(0.15, 0.30) + radius = 500, + depths = c(0, 0.15, 0.30) ) expect_false(is.null(res)) expect_type(res, "list") - expect_equal(length(res), 4) + expect_equal(length(res), 5) + file_paths <- unlist(res) + expect_true(all(file.exists(file_paths))) +}) + +test_that("extract_soil_gssurgo works with custom AOI polygon", { + skip_on_cran() + skip_on_ci() + skip_if_not_installed("terra") + skip_if_not_installed("sf") + + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Create small polygon AOI + aoi_coords <- matrix(c( + -88.25, 40.11, + -88.24, 40.11, + -88.24, 40.12, + -88.25, 40.12, + -88.25, 40.11 + ), ncol = 2, byrow = TRUE) + + aoi <- terra::vect(aoi_coords, type = "polygons", crs = "epsg:4326") + + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + aoi = aoi, + size = 2, + depths = c(0, 0.15, 0.30) + ) + + expect_false(is.null(res)) + expect_type(res, "list") + expect_gt(length(res), 0) file_paths <- unlist(res) expect_true(all(file.exists(file_paths))) +}) + +test_that("extract_soil_gssurgo handles different buffer radii", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Small radius + res_small <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 200, + depths = c(0, 0.15) + ) + + # Larger radius (should potentially capture more mukeys) + res_large <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 1000, + depths = c(0, 0.15) + ) + + expect_false(is.null(res_small)) + expect_false(is.null(res_large)) + expect_type(res_small, "list") + expect_type(res_large, "list") +}) + +test_that("extract_soil_gssurgo component-level variability is preserved", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 5, # Multiple ensemble members to check variability + radius = 500, + depths = c(0, 0.15, 0.30) + ) + + expect_false(is.null(res)) + expect_gt(length(res), 2) + + if (requireNamespace("ncdf4", quietly = TRUE) && length(res) >= 3) { + # Compare two different ensemble members (skip first - it's unsampled) + nc1 <- ncdf4::nc_open(unlist(res)[2]) + nc2 <- ncdf4::nc_open(unlist(res)[3]) + on.exit({ + ncdf4::nc_close(nc1) + ncdf4::nc_close(nc2) + }, add = TRUE) + + sand1 <- ncdf4::ncvar_get(nc1, "fraction_of_sand_in_soil") + sand2 <- ncdf4::ncvar_get(nc2, "fraction_of_sand_in_soil") + + # Ensemble members should show variability (not identical) + expect_false(all(sand1 == sand2)) + } +}) + +test_that("extract_soil_gssurgo requires depths to start with 0", { + skip_on_cran() + skip_on_ci() + tmp_outdir <- withr::local_tempdir("gssurgo_test_") + + # Disable debugging during error testing + withr::local_options(error = NULL) + + # Should error when depths doesn't start with 0 + expect_error( + extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 500, + depths = c(0.15, 0.30) # Missing 0 at start + ), + regexp = "First depth must be 0" + ) + + # Should work when depths starts with 0 + res <- extract_soil_gssurgo( + outdir = tmp_outdir, + lat = 40.1164, + lon = -88.2434, + size = 1, + radius = 500, + depths = c(0, 0.15, 0.30) # Correct format + ) + + expect_false(is.null(res)) }) \ No newline at end of file